VictoriaMetrics/lib/promscrape/discovery/yandexcloud/yandexcloud.go
Aliaksandr Valialkin d5a599badc
lib/promauth: follow-up for e16d3f5639
- Make sure that invalid/missing TLS CA file or TLS client certificate files at vmagent startup
  don't prevent from processing the corresponding scrape targets after the file becomes correct,
  without the need to restart vmagent.
  Previously scrape targets with invalid TLS CA file or TLS client certificate files
  were permanently dropped after the first attempt to initialize them, and they didn't
  appear until the next vmagent reload or the next change in other places of the loaded scrape configs.

- Make sure that TLS CA is properly re-loaded from file after it changes without the need to restart vmagent.
  Previously the old TLS CA was used until vmagent restart.

- Properly handle errors during http request creation for the second attempt to send data to remote system
  at vmagent and vmalert. Previously failed request creation could result in nil pointer dereferencing,
  since the returned request is nil on error.

- Add more context to the logged error during AWS sigv4 request signing before sending the data to -remoteWrite.url at vmagent.
  Previously it could miss details on the source of the request.

- Do not create a new HTTP client per second when generating OAuth2 token needed to put in Authorization header
  of every http request issued by vmagent during service discovery or target scraping.
  Re-use the HTTP client instead until the corresponding scrape config changes.

- Cache error at lib/promauth.Config.GetAuthHeader() in the same way as the auth header is cached,
  e.g. the error is cached for a second now. This should reduce load on CPU and OAuth2 server
  when auth header cannot be obtained because of temporary error.

- Share tls.Config.GetClientCertificate function among multiple scrape targets with the same tls_config.
  Cache the loaded certificate and the error for one second. This should significantly reduce CPU load
  when scraping big number of targets with the same tls_config.

- Allow loading TLS certificates from HTTP and HTTPS URLs by specifying these URLs at `tls_config->cert_file` and `tls_config->key_file`.

- Improve test coverage at lib/promauth

- Skip unreachable or invalid files specified at `scrape_config_files` during vmagent startup, since these files may become valid later.
  Previously vmagent was exiting in this case.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959
2023-10-25 23:19:37 +02:00

237 lines
7.8 KiB
Go

package yandexcloud
import (
"encoding/json"
"flag"
"fmt"
"net/url"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
// SDCheckInterval is the value of the -promscrape.yandexcloudSDCheckInterval command-line flag.
//
// It defines the interval for targets refresh and defaults to 30 seconds.
// The flag is registered on the default flag set when the package is initialized.
var SDCheckInterval = flag.Duration("promscrape.yandexcloudSDCheckInterval", 30*time.Second, "Interval for checking for changes in Yandex Cloud API. "+
"This works only if yandexcloud_sd_configs is configured in '-promscrape.config' file. "+
"See https://docs.victoriametrics.com/sd_configs.html#yandexcloud_sd_configs for details")
// SDConfig is the configuration for Yandex Cloud service discovery.
//
// It is populated from YAML; see the field tags for the corresponding option names.
type SDConfig struct {
// Service selects the Yandex Cloud service to discover.
// GetLabels accepts only "compute" and returns an error for any other value.
Service string `yaml:"service"`
// YandexPassportOAuthToken is an optional secret OAuth token.
// NOTE(review): presumably used for authenticating API requests; it is consumed outside this file - confirm.
YandexPassportOAuthToken *promauth.Secret `yaml:"yandex_passport_oauth_token,omitempty"`
// APIEndpoint is an optional API endpoint override.
// NOTE(review): the default value and its usage are defined outside this file - confirm.
APIEndpoint string `yaml:"api_endpoint,omitempty"`
// TLSConfig holds optional TLS settings; it is consumed outside this file.
TLSConfig *promauth.TLSConfig `yaml:"tls_config,omitempty"`
}
// GetLabels returns labels for the Yandex Cloud targets discovered according to sdc.
//
// baseDir is passed as is to getAPIConfig when obtaining the API config.
// Only the `compute` service is supported; any other sdc.Service value results in an error.
func (sdc *SDConfig) GetLabels(baseDir string) ([]*promutils.Labels, error) {
	apiCfg, err := getAPIConfig(sdc, baseDir)
	if err != nil {
		return nil, fmt.Errorf("cannot get API config: %w", err)
	}
	if sdc.Service == "compute" {
		return getInstancesLabels(apiCfg)
	}
	return nil, fmt.Errorf("skipping unexpected service=%q; only `compute` supported for now", sdc.Service)
}
// getInstances returns all the compute instances located in the folder with the given folderID.
//
// The instances list is fetched page by page from the "compute" service endpoint
// until the response contains an empty nextPageToken.
// An error is returned if any page cannot be fetched or parsed.
func (cfg *apiConfig) getInstances(folderID string) ([]instance, error) {
	// The first page url always contains the folderId query arg,
	// so the pageToken arg can be safely appended with '&'.
	firstPageURL := cfg.serviceEndpoints["compute"] + "/compute/v1/instances" + "?folderId=" + url.QueryEscape(folderID)

	var result []instance
	pageURL := firstPageURL
	for {
		body, err := getAPIResponse(pageURL, cfg)
		if err != nil {
			return nil, fmt.Errorf("cannot get instances: %w", err)
		}

		var page instancesPage
		err = json.Unmarshal(body, &page)
		if err != nil {
			return nil, fmt.Errorf("cannot parse instances response from %q: %w; response body: %s", pageURL, err, body)
		}
		result = append(result, page.Instances...)

		token := page.NextPageToken
		if token == "" {
			// The last page has been read.
			return result, nil
		}
		pageURL = firstPageURL + "&pageToken=" + url.QueryEscape(token)
	}
}
// instancesPage is a single page of the instances list response parsed by getInstances.
//
// See https://cloud.yandex.com/en-ru/docs/compute/api-ref/Instance/list
type instancesPage struct {
// Instances contains the instances returned on this page.
Instances []instance `json:"instances"`
// NextPageToken is the token for requesting the next page.
// getInstances treats an empty token as the end of the list.
NextPageToken string `json:"nextPageToken"`
}
// instance holds the fields of a compute instance decoded from an entry of instancesPage.Instances.
type instance struct {
ID string `json:"id"`
Name string `json:"name"`
FQDN string `json:"fqdn"`
Status string `json:"status"`
FolderID string `json:"folderId"`
PlatformID string `json:"platformId"`
// Resources describes the computing resources of the instance.
Resources resources `json:"resources"`
// NetworkInterfaces lists the network interfaces attached to the instance.
NetworkInterfaces []networkInterface `json:"networkInterfaces"`
// Labels contains key-value labels attached to the instance.
Labels map[string]string `json:"labels,omitempty"`
}
// resources holds the computing resources of an instance.
//
// All the values are decoded from JSON strings and are kept as strings.
type resources struct {
Cores string `json:"cores"`
CoreFraction string `json:"coreFraction"`
Memory string `json:"memory"`
}
// networkInterface describes a single network interface of an instance.
type networkInterface struct {
// Index is kept as a string, as it is decoded from a JSON string.
Index string `json:"index"`
MacAddress string `json:"macAddress"`
SubnetID string `json:"subnetId"`
// PrimaryV4Address holds the primary IPv4 address settings of the interface.
PrimaryV4Address primaryV4Address `json:"primaryV4Address"`
}
// primaryV4Address describes the primary IPv4 address of a network interface.
type primaryV4Address struct {
Address string `json:"address"`
// OneToOneNat holds the one-to-one NAT settings decoded from the "oneToOneNat" object.
OneToOneNat oneToOneNat `json:"oneToOneNat"`
// DNSRecords lists DNS records decoded from the "dnsRecords" array.
DNSRecords []dnsRecord `json:"dnsRecords"`
}
// oneToOneNat describes the one-to-one NAT settings of a primary IPv4 address.
type oneToOneNat struct {
// Address is the NAT address.
// NOTE(review): presumably the external (public) IP of the instance - confirm against the API docs.
Address string `json:"address"`
IPVersion string `json:"ipVersion"`
// DNSRecords lists DNS records decoded from the "dnsRecords" array.
DNSRecords []dnsRecord `json:"dnsRecords"`
}
// dnsRecord describes a single DNS record attached to an address.
type dnsRecord struct {
FQDN string `json:"fqdn"`
DNSZoneID string `json:"dnsZoneId"`
// TTL is kept as a string, as it is decoded from a JSON string.
TTL string `json:"ttl"`
PTR bool `json:"ptr"`
}
// getFolders returns folders for all the given clouds.
//
// Folders are fetched per cloud, page by page, from the "resource-manager" service endpoint
// until the response for the cloud contains an empty nextPageToken.
// An error is returned if any page cannot be fetched or parsed.
func (cfg *apiConfig) getFolders(clouds []cloud) ([]folder, error) {
	listURL := cfg.serviceEndpoints["resource-manager"] + "/resource-manager/v1/folders"

	var result []folder
	for i := range clouds {
		// The first page url always contains the cloudId query arg,
		// so the pageToken arg can be safely appended with '&'.
		firstPageURL := listURL + "?cloudId=" + url.QueryEscape(clouds[i].ID)
		pageURL := firstPageURL
		for {
			body, err := getAPIResponse(pageURL, cfg)
			if err != nil {
				return nil, fmt.Errorf("cannot get folders: %w", err)
			}

			var page foldersPage
			err = json.Unmarshal(body, &page)
			if err != nil {
				return nil, fmt.Errorf("cannot parse folders response from %q: %w; response body: %s", pageURL, err, body)
			}
			result = append(result, page.Folders...)

			token := page.NextPageToken
			if token == "" {
				// The last page for the current cloud has been read.
				break
			}
			pageURL = firstPageURL + "&pageToken=" + url.QueryEscape(token)
		}
	}
	return result, nil
}
// foldersPage is a single page of the folders list response parsed by getFolders.
//
// See https://cloud.yandex.com/en-ru/docs/resource-manager/api-ref/Folder/list
type foldersPage struct {
// Folders contains the folders returned on this page.
Folders []folder `json:"folders"`
// NextPageToken is the token for requesting the next page.
// getFolders treats an empty token as the end of the list.
NextPageToken string `json:"nextPageToken"`
}
// folder holds the fields of a folder decoded from an entry of foldersPage.Folders.
type folder struct {
Name string `json:"name"`
ID string `json:"id"`
// CloudID is the value of the "cloudId" field.
// NOTE(review): presumably the ID of the cloud the folder belongs to - confirm against the API docs.
CloudID string `json:"cloudId"`
Description string `json:"description"`
Status string `json:"status"`
Labels map[string]string `json:"labels"`
// CreatedAt is parsed by encoding/json, which expects an RFC 3339 timestamp for time.Time.
CreatedAt time.Time `json:"createdAt"`
}
// getClouds returns clouds for all the given organizations.
//
// If orgs is empty, then the clouds are listed without the organizationId filter.
// Clouds are fetched per organization, page by page, from the "resource-manager" service endpoint
// until the response contains an empty nextPageToken.
// An error is returned if any page cannot be fetched or parsed.
func (cfg *apiConfig) getClouds(orgs []organization) ([]cloud, error) {
	cloudsURL := cfg.serviceEndpoints["resource-manager"] + "/resource-manager/v1/clouds"
	if len(orgs) == 0 {
		// Use a single organization with empty ID in order to list clouds without the filter.
		// A fresh slice is used instead of append() in order to avoid writing
		// to the spare capacity of the slice passed by the caller.
		orgs = []organization{{ID: ""}}
	}
	var clouds []cloud
	for _, org := range orgs {
		orgURL := cloudsURL
		// pageTokenSep is the separator to put in front of the pageToken query arg.
		// It must be '?' when orgURL has no query args yet. Otherwise the page token
		// becomes a part of the url path and the API returns an error for the second page.
		pageTokenSep := "?"
		if org.ID != "" {
			orgURL += "?organizationId=" + url.QueryEscape(org.ID)
			pageTokenSep = "&"
		}
		nextLink := orgURL
		for {
			data, err := getAPIResponse(nextLink, cfg)
			if err != nil {
				return nil, fmt.Errorf("cannot get clouds: %w", err)
			}
			var cp cloudsPage
			if err := json.Unmarshal(data, &cp); err != nil {
				return nil, fmt.Errorf("cannot parse clouds response from %q: %w; response body: %s", nextLink, err, data)
			}
			clouds = append(clouds, cp.Clouds...)
			if len(cp.NextPageToken) == 0 {
				// The last page for the current organization has been read.
				break
			}
			nextLink = orgURL + pageTokenSep + "pageToken=" + url.QueryEscape(cp.NextPageToken)
		}
	}
	return clouds, nil
}
// cloudsPage is a single page of the clouds list response parsed by getClouds.
//
// See https://cloud.yandex.com/en-ru/docs/resource-manager/api-ref/Cloud/list
type cloudsPage struct {
// Clouds contains the clouds returned on this page.
Clouds []cloud `json:"clouds"`
// NextPageToken is the token for requesting the next page.
// getClouds treats an empty token as the end of the list.
NextPageToken string `json:"nextPageToken"`
}
// cloud holds the fields of a cloud decoded from an entry of cloudsPage.Clouds.
//
// getFolders uses the ID field for listing folders of the cloud.
type cloud struct {
Name string `json:"name"`
ID string `json:"id"`
Labels map[string]string `json:"labels"`
OrganizationID string `json:"organizationId"`
Description string `json:"description"`
// CreatedAt is parsed by encoding/json, which expects an RFC 3339 timestamp for time.Time.
CreatedAt time.Time `json:"createdAt"`
}
// getOrganizations returns all the organizations available via the "organization-manager" service endpoint.
//
// The organizations list is fetched page by page until the response contains an empty nextPageToken.
// An error is returned if any page cannot be fetched or parsed.
func (cfg *apiConfig) getOrganizations() ([]organization, error) {
	orgsURL := cfg.serviceEndpoints["organization-manager"] + "/organization-manager/v1/organizations"
	var orgs []organization
	nextLink := orgsURL
	for {
		data, err := getAPIResponse(nextLink, cfg)
		if err != nil {
			return nil, fmt.Errorf("cannot get organizations: %w", err)
		}
		var op organizationsPage
		if err := json.Unmarshal(data, &op); err != nil {
			return nil, fmt.Errorf("cannot parse organizations response from %q: %w; response body: %s", nextLink, err, data)
		}
		orgs = append(orgs, op.Organizations...)
		if len(op.NextPageToken) == 0 {
			// The last page has been read.
			return orgs, nil
		}
		// orgsURL has no query args, so pageToken must start the query string with '?'.
		// Using '&' here would make the token a part of the url path.
		nextLink = orgsURL + "?pageToken=" + url.QueryEscape(op.NextPageToken)
	}
}
// organizationsPage is a single page of the organizations list response parsed by getOrganizations.
//
// See https://cloud.yandex.com/en-ru/docs/organization/api-ref/Organization/list
type organizationsPage struct {
// Organizations contains the organizations returned on this page.
Organizations []organization `json:"organizations"`
// NextPageToken is the token for requesting the next page.
// getOrganizations treats an empty token as the end of the list.
NextPageToken string `json:"nextPageToken"`
}
// organization holds the fields of an organization decoded from an entry of organizationsPage.Organizations.
//
// getClouds uses the ID field for filtering clouds by organization; an empty ID disables the filter.
type organization struct {
Name string `json:"name"`
ID string `json:"id"`
Labels map[string]string `json:"labels"`
Title string `json:"title"`
Description string `json:"description"`
// CreatedAt is parsed by encoding/json, which expects an RFC 3339 timestamp for time.Time.
CreatedAt time.Time `json:"createdAt"`
}