From 58a935ddb4bb06108017aa2c786300e984c669f8 Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Mon, 3 Mar 2025 15:21:21 -0500 Subject: [PATCH 1/6] Split refresh interval for tags and volumes and set the default refresh for volumes to 5m --- plugins/processors/ec2tagger/config.go | 3 +- plugins/processors/ec2tagger/ec2tagger.go | 91 ++++++-- .../processors/ec2tagger/ec2tagger_test.go | 21 +- .../sampleConfig/advanced_config_darwin.yaml | 5 +- .../sampleConfig/advanced_config_linux.yaml | 11 +- .../sampleConfig/advanced_config_windows.yaml | 9 +- .../sampleConfig/amp_config_linux.yaml | 3 +- .../sampleConfig/basic_config_linux.yaml | 5 +- .../sampleConfig/basic_config_windows.yaml | 5 +- .../sampleConfig/compass_linux_config.yaml | 5 +- .../sampleConfig/complete_darwin_config.yaml | 9 +- .../sampleConfig/complete_linux_config.yaml | 17 +- .../sampleConfig/delta_config_linux.yaml | 3 +- .../sampleConfig/delta_net_config_linux.yaml | 5 +- .../sampleConfig/drop_origin_linux.yaml | 7 +- .../emf_and_kubernetes_with_gpu_config.yaml | 218 +++++++++--------- .../ignore_append_dimensions.yaml | 3 +- .../sampleConfig/invalid_input_linux.yaml | 5 +- .../sampleConfig/otlp_metrics_config.yaml | 3 +- .../sampleConfig/otlp_metrics_eks_config.yaml | 3 +- .../prometheus_combined_config_linux.yaml | 2 +- .../prometheus_otel_config_linux.yaml | 2 +- .../sampleConfig/standard_config_linux.yaml | 3 +- ...ndard_config_linux_with_common_config.yaml | 7 +- .../sampleConfig/standard_config_windows.yaml | 5 +- ...ard_config_windows_with_common_config.yaml | 7 +- .../ec2taggerprocessor/translator.go | 3 +- .../ec2taggerprocessor/translator_test.go | 9 +- 28 files changed, 275 insertions(+), 194 deletions(-) diff --git a/plugins/processors/ec2tagger/config.go b/plugins/processors/ec2tagger/config.go index 9119139567..733c8320ba 100644 --- a/plugins/processors/ec2tagger/config.go +++ b/plugins/processors/ec2tagger/config.go @@ -22,7 +22,8 @@ const ( ) type Config struct { - RefreshIntervalSeconds 
time.Duration `mapstructure:"refresh_interval_seconds"` + RefreshTagsInterval time.Duration `mapstructure:"refresh_tags_interval"` + RefreshVolumesInterval time.Duration `mapstructure:"refresh_volumes_interval"` EC2MetadataTags []string `mapstructure:"ec2_metadata_tags"` EC2InstanceTagKeys []string `mapstructure:"ec2_instance_tag_keys"` EBSDeviceKeys []string `mapstructure:"ebs_device_keys,omitempty"` diff --git a/plugins/processors/ec2tagger/ec2tagger.go b/plugins/processors/ec2tagger/ec2tagger.go index 839ea21079..1d5490b580 100644 --- a/plugins/processors/ec2tagger/ec2tagger.go +++ b/plugins/processors/ec2tagger/ec2tagger.go @@ -197,28 +197,23 @@ func (t *Tagger) Shutdown(context.Context) error { return nil } -// refreshLoop handles the refresh ticks and also responds to shutdown signal -func (t *Tagger) refreshLoop(refreshInterval time.Duration, stopAfterFirstSuccess bool) { +// refreshLoopTags handles the refresh ticks for describe tags and also responds to shutdown signal +func (t *Tagger) refreshLoopTags(refreshInterval time.Duration, stopAfterFirstSuccess bool) { refreshTicker := time.NewTicker(refreshInterval) defer refreshTicker.Stop() for { select { case <-refreshTicker.C: - t.logger.Debug("ec2tagger refreshing") + t.logger.Debug("ec2tagger refreshing tags") allTagsRetrieved := t.ec2TagsRetrieved() - allVolumesRetrieved := t.ebsVolumesRetrieved() t.logger.Debug("Retrieve status", - zap.Bool("Ec2AllTagsRetrieved", allTagsRetrieved), - zap.Bool("EbsAllVolumesRetrieved", allVolumesRetrieved)) + zap.Bool("Ec2AllTagsRetrieved", allTagsRetrieved)) refreshTags := len(t.EC2InstanceTagKeys) > 0 - refreshVolumes := len(t.EBSDeviceKeys) > 0 if stopAfterFirstSuccess { // need refresh tags when it is configured and not all ec2 tags are retrieved refreshTags = refreshTags && !allTagsRetrieved - // need refresh volumes when it is configured and not all volumes are retrieved - refreshVolumes = refreshVolumes && !allVolumesRetrieved - if !refreshTags && !refreshVolumes { 
+ if !refreshTags { t.logger.Info("ec2tagger: Refresh is no longer needed, stop refreshTicker.") return } @@ -230,6 +225,34 @@ func (t *Tagger) refreshLoop(refreshInterval time.Duration, stopAfterFirstSucces } } + case <-t.shutdownC: + return + } + } +} + +// refreshLoopVolumes handles the refresh ticks for describe volumes and also responds to shutdown signal +func (t *Tagger) refreshLoopVolumes(refreshInterval time.Duration, stopAfterFirstSuccess bool) { + refreshTicker := time.NewTicker(refreshInterval) + defer refreshTicker.Stop() + for { + select { + case <-refreshTicker.C: + t.logger.Debug("ec2tagger refreshing volumes") + allVolumesRetrieved := t.ebsVolumesRetrieved() + t.logger.Debug("Retrieve status", + zap.Bool("EbsAllVolumesRetrieved", allVolumesRetrieved)) + refreshVolumes := len(t.EBSDeviceKeys) > 0 + + if stopAfterFirstSuccess { + // need refresh volumes when it is configured and not all volumes are retrieved + refreshVolumes = refreshVolumes && !allVolumesRetrieved + if !refreshVolumes { + t.logger.Info("ec2tagger: Refresh is no longer needed, stop refreshTicker.") + return + } + } + if refreshVolumes { if err := t.updateVolumes(); err != nil { t.logger.Warn("ec2tagger: Error refreshing EBS volumes, keeping old values", zap.Error(err)) @@ -333,7 +356,8 @@ func (t *Tagger) Start(ctx context.Context, host component.Host) error { go func() { //Async start of initial retrieval to prevent block of agent start t.initialRetrievalOfTagsAndVolumes() - t.refreshLoopToUpdateTagsAndVolumes() + t.refreshLoopToUpdateTags() + t.refreshLoopToUpdateVolumes() }() t.logger.Info("ec2tagger: EC2 tagger has started initialization.") @@ -343,24 +367,49 @@ func (t *Tagger) Start(ctx context.Context, host component.Host) error { return nil } -func (t *Tagger) refreshLoopToUpdateTagsAndVolumes() { +func (t *Tagger) refreshLoopToUpdateTags() { needRefresh := false stopAfterFirstSuccess := false - refreshInterval := t.RefreshIntervalSeconds - if 
t.RefreshIntervalSeconds.Seconds() == 0 { + refreshInterval := t.RefreshTagsInterval + if refreshInterval.Seconds() == 0 { //when the refresh interval is 0, this means that customer don't want to - //update tags/volumes values once they are retrieved successfully. In this case, + //update tags values once they are retrieved successfully. In this case, //we still want to do refresh to make sure all the specified keys for tags/volumes //are fetched successfully because initial retrieval might not get all of them. //When the specified key is "*", there is no way for us to check if all - //tags/volumes are fetched. So there is no need to do refresh in this case. - needRefresh = !(len(t.EC2InstanceTagKeys) == 1 && t.EC2InstanceTagKeys[0] == "*") || - !(len(t.EBSDeviceKeys) == 1 && t.EBSDeviceKeys[0] == "*") + //tags are fetched. So there is no need to do refresh in this case. + needRefresh = !(len(t.EC2InstanceTagKeys) == 1 && t.EC2InstanceTagKeys[0] == "*") + + stopAfterFirstSuccess = true + refreshInterval = defaultRefreshInterval + } else if refreshInterval.Seconds() > 0 { + //customer wants to update the tags with the given refresh interval + needRefresh = true + } + + if needRefresh { + go func() { + // randomly stagger the time of the first refresh to mitigate throttling if a whole fleet is + // restarted at the same time + sleepUntilHostJitter(refreshInterval) + t.refreshLoopTags(refreshInterval, stopAfterFirstSuccess) + }() + } +} + +func (t *Tagger) refreshLoopToUpdateVolumes() { + needRefresh := false + stopAfterFirstSuccess := false + + refreshInterval := t.RefreshVolumesInterval + if refreshInterval.Seconds() == 0 { + needRefresh = !(len(t.EBSDeviceKeys) == 1 && t.EBSDeviceKeys[0] == "*") + stopAfterFirstSuccess = true refreshInterval = defaultRefreshInterval - } else if t.RefreshIntervalSeconds.Seconds() > 0 { - //customer wants to update the tags/volumes with the given refresh interval + } else if refreshInterval.Seconds() > 0 { + //customer wants to 
update the volumes with the given refresh interval needRefresh = true } @@ -369,7 +418,7 @@ func (t *Tagger) refreshLoopToUpdateTagsAndVolumes() { // randomly stagger the time of the first refresh to mitigate throttling if a whole fleet is // restarted at the same time sleepUntilHostJitter(refreshInterval) - t.refreshLoop(refreshInterval, stopAfterFirstSuccess) + t.refreshLoopVolumes(refreshInterval, stopAfterFirstSuccess) }() } } diff --git a/plugins/processors/ec2tagger/ec2tagger_test.go b/plugins/processors/ec2tagger/ec2tagger_test.go index 91bc27e1ae..0abf7a23de 100644 --- a/plugins/processors/ec2tagger/ec2tagger_test.go +++ b/plugins/processors/ec2tagger/ec2tagger_test.go @@ -282,7 +282,8 @@ func TestStartFailWithNoMetadata(t *testing.T) { // run Start() and check all tags/volumes are retrieved and saved func TestStartSuccessWithNoTagsVolumesUpdate(t *testing.T) { cfg := createDefaultConfig().(*Config) - cfg.RefreshIntervalSeconds = 0 * time.Second + cfg.RefreshTagsInterval = 0 * time.Second + cfg.RefreshVolumesInterval = 0 * time.Second cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{tagKey1, tagKey2, "AutoScalingGroupName"} cfg.EBSDeviceKeys = []string{device1, device2} @@ -326,7 +327,8 @@ func TestStartSuccessWithNoTagsVolumesUpdate(t *testing.T) { func TestStartSuccessWithTagsVolumesUpdate(t *testing.T) { cfg := createDefaultConfig().(*Config) //use millisecond rather than second to speed up test execution - cfg.RefreshIntervalSeconds = 20 * time.Millisecond + cfg.RefreshTagsInterval = 20 * time.Millisecond + cfg.RefreshVolumesInterval = 20 * time.Millisecond cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{tagKey1, tagKey2, "AutoScalingGroupName"} cfg.EBSDeviceKeys = []string{device1, device2} @@ -382,7 +384,8 @@ func TestStartSuccessWithTagsVolumesUpdate(t *testing.T) { // check there is no attempt to fetch all tags/volumes
func TestStartSuccessWithWildcardTagVolumeKey(t *testing.T) { cfg := createDefaultConfig().(*Config) - cfg.RefreshIntervalSeconds = 0 * time.Second + cfg.RefreshTagsInterval = 0 * time.Second + cfg.RefreshVolumesInterval = 0 * time.Second cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{"*"} cfg.EBSDeviceKeys = []string{"*"} @@ -426,7 +429,8 @@ func TestStartSuccessWithWildcardTagVolumeKey(t *testing.T) { func TestApplyWithTagsVolumesUpdate(t *testing.T) { cfg := createDefaultConfig().(*Config) //use millisecond rather than second to speed up test execution - cfg.RefreshIntervalSeconds = 20 * time.Millisecond + cfg.RefreshTagsInterval = 20 * time.Millisecond + cfg.RefreshVolumesInterval = 20 * time.Millisecond cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{tagKey1, tagKey2, "AutoScalingGroupName"} cfg.EBSDeviceKeys = []string{device1, device2} @@ -520,7 +524,8 @@ func TestApplyWithTagsVolumesUpdate(t *testing.T) { // Test metrics are dropped before the initial retrieval is done func TestMetricsDroppedBeforeStarted(t *testing.T) { cfg := createDefaultConfig().(*Config) - cfg.RefreshIntervalSeconds = 0 * time.Millisecond + cfg.RefreshTagsInterval = 0 * time.Millisecond + cfg.RefreshVolumesInterval = 0 * time.Millisecond cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{"*"} cfg.EBSDeviceKeys = []string{"*"} @@ -585,7 +590,8 @@ func TestMetricsDroppedBeforeStarted(t *testing.T) { // Test ec2tagger Start does not block for a long time func TestTaggerStartDoesNotBlock(t *testing.T) { cfg := createDefaultConfig().(*Config) - cfg.RefreshIntervalSeconds = 0 * time.Second + cfg.RefreshTagsInterval = 0 * time.Second + cfg.RefreshVolumesInterval = 0 * time.Second cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} cfg.EC2InstanceTagKeys = []string{"*"} 
cfg.EBSDeviceKeys = []string{"*"} @@ -628,7 +634,8 @@ func TestTaggerStartDoesNotBlock(t *testing.T) { // Test ec2tagger Start does not block for a long time func TestTaggerStartsWithoutTagOrVolume(t *testing.T) { cfg := createDefaultConfig().(*Config) - cfg.RefreshIntervalSeconds = 0 * time.Second + cfg.RefreshTagsInterval = 0 * time.Second + cfg.RefreshVolumesInterval = 0 * time.Second cfg.EC2MetadataTags = []string{mdKeyInstanceId, mdKeyImageId, mdKeyInstanceType} _, cancel := context.WithCancel(context.Background()) diff --git a/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml b/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml index 477458e8a8..00ffba59b2 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml @@ -51,7 +51,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_cpu: collection_interval: 1m0s @@ -90,11 +91,11 @@ service: - ec2tagger - awsentity/resource receivers: - - telegraf_cpu - telegraf_disk - telegraf_mem - telegraf_netstat - telegraf_swap + - telegraf_cpu metrics/hostDeltaMetrics: exporters: - awscloudwatch diff --git a/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml b/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml index c79fab2ac1..f18abe731c 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml @@ -46,12 +46,13 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: - - InstanceType - ImageId - InstanceId + - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_cpu: collection_interval: 1m0s @@ -98,13 +99,13 @@ service: - 
ec2tagger - awsentity/resource receivers: + - telegraf_disk + - telegraf_mem + - telegraf_netstat - telegraf_swap - telegraf_ethtool - telegraf_nvidia_smi - telegraf_cpu - - telegraf_disk - - telegraf_mem - - telegraf_netstat metrics/hostDeltaMetrics: exporters: - awscloudwatch diff --git a/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml b/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml index 1e4ad8c8ab..a7de14495d 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml @@ -41,7 +41,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory @@ -91,13 +92,13 @@ service: - ec2tagger - awsentity/resource receivers: - - telegraf_win_perf_counters/1492679118 - - telegraf_win_perf_counters/3610923661 - - telegraf_win_perf_counters/3446270237 - telegraf_win_perf_counters/3762679655 - telegraf_win_perf_counters/2073218482 - telegraf_win_perf_counters/2039663244 - telegraf_win_perf_counters/4283769065 + - telegraf_win_perf_counters/1492679118 + - telegraf_win_perf_counters/3610923661 + - telegraf_win_perf_counters/3446270237 telemetry: logs: development: false diff --git a/translator/tocwconfig/sampleConfig/amp_config_linux.yaml b/translator/tocwconfig/sampleConfig/amp_config_linux.yaml index ce8e5fde4a..7074aa0eb0 100644 --- a/translator/tocwconfig/sampleConfig/amp_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/amp_config_linux.yaml @@ -108,7 +108,8 @@ processors: - ImageId imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s rollup: attribute_groups: - - ImageId diff --git a/translator/tocwconfig/sampleConfig/basic_config_linux.yaml 
b/translator/tocwconfig/sampleConfig/basic_config_linux.yaml index 34097d8b2f..25fb71118d 100644 --- a/translator/tocwconfig/sampleConfig/basic_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/basic_config_linux.yaml @@ -36,12 +36,13 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: + - InstanceType - ImageId - InstanceId - - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_disk: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/basic_config_windows.yaml b/translator/tocwconfig/sampleConfig/basic_config_windows.yaml index b8b7564990..d02635ac7a 100644 --- a/translator/tocwconfig/sampleConfig/basic_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/basic_config_windows.yaml @@ -36,12 +36,13 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: - - ImageId - InstanceId - InstanceType + - ImageId imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory diff --git a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml index d766fbb16b..e331fb7526 100644 --- a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml +++ b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml @@ -52,7 +52,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_socket_listener: collection_interval: 10s @@ -75,8 +76,8 @@ service: - ec2tagger - awsentity/service/telegraf receivers: - - telegraf_socket_listener - telegraf_statsd + - telegraf_socket_listener telemetry: logs: development: false diff 
--git a/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml b/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml index 505d6bb301..cf50e481de 100644 --- a/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml @@ -139,7 +139,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s transform: error_mode: propagate flatten_data: false @@ -289,13 +290,13 @@ service: - transform - awsentity/resource receivers: - - telegraf_swap - telegraf_netstat - - telegraf_disk - telegraf_processes + - telegraf_disk + - telegraf_cpu + - telegraf_swap - telegraf_mem - telegraf_procstat/1917393364 - - telegraf_cpu metrics/hostCustomMetrics: exporters: - awscloudwatch diff --git a/translator/tocwconfig/sampleConfig/complete_linux_config.yaml b/translator/tocwconfig/sampleConfig/complete_linux_config.yaml index d5282d96f6..82d4f2298e 100644 --- a/translator/tocwconfig/sampleConfig/complete_linux_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_linux_config.yaml @@ -152,7 +152,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s filter/jmx/0: error_mode: propagate logs: {} @@ -199,11 +200,11 @@ processors: metric_statements: - context: metric statements: - - set(unit, "unit") where name == "disk_free" - - set(name, "DISK_FREE") where name == "disk_free" - set(unit, "unit") where name == "cpu_usage_idle" - set(name, "CPU_USAGE_IDLE") where name == "cpu_usage_idle" - set(unit, "unit") where name == "cpu_usage_nice" + - set(unit, "unit") where name == "disk_free" + - set(name, "DISK_FREE") where name == "disk_free" trace_statements: [] transform/jmx/0: error_mode: propagate @@ -212,9 +213,9 @@ processors: metric_statements: - context: metric 
statements: + - set(name, "kafka.fetch-rate") where name == "kafka.consumer.fetch-rate" - set(unit, "unit") where name == "jvm.memory.heap.used" - set(name, "JVM_MEM_HEAP_USED") where name == "jvm.memory.heap.used" - - set(name, "kafka.fetch-rate") where name == "kafka.consumer.fetch-rate" trace_statements: [] transform/jmx/1: error_mode: propagate @@ -396,12 +397,12 @@ service: - transform - awsentity/resource receivers: - - telegraf_processes - - telegraf_cpu - telegraf_netstat + - telegraf_swap - telegraf_disk - telegraf_mem - - telegraf_swap + - telegraf_cpu + - telegraf_processes - telegraf_procstat/1917393364 metrics/hostCustomMetrics/cloudwatch: exporters: @@ -411,8 +412,8 @@ service: - transform - awsentity/service/telegraf receivers: - - telegraf_statsd - telegraf_socket_listener + - telegraf_statsd metrics/hostDeltaMetrics/cloudwatch: exporters: - awscloudwatch diff --git a/translator/tocwconfig/sampleConfig/delta_config_linux.yaml b/translator/tocwconfig/sampleConfig/delta_config_linux.yaml index b6ca384858..581faf943a 100644 --- a/translator/tocwconfig/sampleConfig/delta_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/delta_config_linux.yaml @@ -51,7 +51,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s transform: error_mode: propagate flatten_data: false diff --git a/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml b/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml index a67cd3c009..bf49a222eb 100644 --- a/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml @@ -43,12 +43,13 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: - - ImageId - InstanceId - InstanceType + - ImageId imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + 
refresh_volumes_interval: 5m0s receivers: telegraf_net: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml b/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml index ab322e5b5b..2ea3508f6f 100644 --- a/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml +++ b/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml @@ -41,12 +41,13 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: - - InstanceType - ImageId - InstanceId + - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s transform: error_mode: propagate flatten_data: false @@ -85,9 +86,9 @@ service: - transform - awsentity/resource receivers: - - telegraf_nvidia_smi - telegraf_cpu - telegraf_disk + - telegraf_nvidia_smi telemetry: logs: development: false diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index ac6b3da95a..fb067d4d4d 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -704,6 +704,90 @@ processors: match_type: regexp new_name: apiserver_request_total_5xx submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: container_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: 
insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: container_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: pod_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" - action: insert aggregation_type: "" include: DCGM_FI_DEV_GPU_UTIL @@ -935,95 +1019,39 @@ processors: new_label: "" new_value: "" submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: container_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: pod_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: PodGPU - submatch_case: "" - - action: insert - aggregation_type: "" - include: 
DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: node_gpu_temperature - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU - submatch_case: "" - - action: insert + - action: update aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: neuroncore_memory_usage_runtime_memory match_type: "" - new_name: container_gpu_power_draw - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: ContainerGPU + new_name: neuroncore_memory_usage_runtime_memory + operations: [] submatch_case: "" - - action: insert + - action: update aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: neuroncore_utilization_ratio match_type: "" - new_name: pod_gpu_power_draw + new_name: neuroncore_utilization operations: - - action: add_label + - action: experimental_scale_value aggregation_type: "" - experimental_scale: 0 + experimental_scale: 100 label: "" label_value: "" - new_label: Type - new_value: PodGPU + new_label: "" + new_value: "" submatch_case: "" - - action: insert + - action: update aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: instance_info match_type: "" - new_name: node_gpu_power_draw - operations: - - action: add_label - aggregation_type: "" - experimental_scale: 0 - label: "" - label_value: "" - new_label: Type - new_value: NodeGPU + new_name: instance_info + operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_errors_total + include: hardware_ecc_events_total match_type: "" - new_name: neuron_execution_errors + new_name: neurondevice_hw_ecc_events operations: [] submatch_case: "" - action: update @@ -1049,30 +1077,23 @@ processors: submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_runtime_memory - match_type: "" - new_name: neuroncore_memory_usage_runtime_memory - 
operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: instance_info + include: neuroncore_memory_usage_model_shared_scratchpad match_type: "" - new_name: instance_info + new_name: neuroncore_memory_usage_model_shared_scratchpad operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuron_hardware + include: execution_latency_seconds match_type: "" - new_name: neuron_hardware + new_name: neuron_execution_latency operations: [] submatch_case: "" - action: update aggregation_type: "" - include: hardware_ecc_events_total + include: execution_errors_total match_type: "" - new_name: neurondevice_hw_ecc_events + new_name: neuron_execution_errors operations: [] submatch_case: "" - action: update @@ -1082,13 +1103,6 @@ processors: new_name: neurondevice_runtime_memory_used_bytes operations: [] submatch_case: "" - - action: update - aggregation_type: "" - include: neuroncore_memory_usage_model_shared_scratchpad - match_type: "" - new_name: neuroncore_memory_usage_model_shared_scratchpad - operations: [] - submatch_case: "" - action: update aggregation_type: "" include: neuroncore_memory_usage_tensors @@ -1098,23 +1112,9 @@ processors: submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_utilization_ratio - match_type: "" - new_name: neuroncore_utilization - operations: - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" - submatch_case: "" - - action: update - aggregation_type: "" - include: execution_latency_seconds + include: neuron_hardware match_type: "" - new_name: neuron_execution_latency + new_name: neuron_hardware operations: [] submatch_case: "" receivers: diff --git a/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml b/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml index b99cedaeb0..7423ecedf8 100644 --- 
a/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml +++ b/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml @@ -35,7 +35,8 @@ processors: ec2tagger: imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_disk: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml b/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml index 79c2cbb4c9..f0931b2b7f 100644 --- a/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml +++ b/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml @@ -41,7 +41,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_disk: collection_interval: 1m0s @@ -64,8 +65,8 @@ service: - ec2tagger - awsentity/resource receivers: - - telegraf_disk - telegraf_mem + - telegraf_disk telemetry: logs: development: false diff --git a/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml b/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml index 18a4f4bb1f..cd4c936cc9 100644 --- a/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml +++ b/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml @@ -44,7 +44,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: otlp/metrics: protocols: diff --git a/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml b/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml index ce10d3d620..229ab1b2a8 100644 --- a/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml @@ -54,7 +54,8 @@ processors: - InstanceType imds_retries: 1 middleware: 
agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: otlp/metrics: protocols: diff --git a/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml b/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml index 0e1772467e..c2d0b534e0 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml @@ -167,7 +167,7 @@ receivers: enable_http2: true file_sd_configs: - files: - - {ecsSdFileName} + - /var/folders/hv/htgg6_g51hx3hkxl4sdcbw6h0000gq/T/TestCombinedPrometheusConfig3135808568/001/ecs_sd_results.yaml refresh_interval: 5m follow_redirects: true honor_timestamps: true diff --git a/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml b/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml index c39ceb9b77..4160ec10c3 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml @@ -78,7 +78,7 @@ receivers: enable_http2: true file_sd_configs: - files: - - {ecsSdFileName} + - /var/folders/hv/htgg6_g51hx3hkxl4sdcbw6h0000gq/T/TestOtelPrometheusConfig4175189513/001/ecs_sd_results.yaml refresh_interval: 5m follow_redirects: true honor_timestamps: true diff --git a/translator/tocwconfig/sampleConfig/standard_config_linux.yaml b/translator/tocwconfig/sampleConfig/standard_config_linux.yaml index 4f5cdd26b5..3041a2f203 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_linux.yaml @@ -50,7 +50,8 @@ processors: - InstanceId - InstanceType middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_cpu: collection_interval: 1m0s diff --git 
a/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml b/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml index b0d4f0bbad..b8d06e58fc 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml @@ -50,13 +50,14 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: + - InstanceType - ImageId - InstanceId - - InstanceType imds_retries: 2 middleware: agenthealth/statuscode profile: AmazonCloudWatchAgent - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s shared_credential_file: fake-path receivers: telegraf_cpu: @@ -92,10 +93,10 @@ service: - ec2tagger - awsentity/resource receivers: - - telegraf_mem - telegraf_swap - telegraf_cpu - telegraf_disk + - telegraf_mem metrics/hostDeltaMetrics: exporters: - awscloudwatch diff --git a/translator/tocwconfig/sampleConfig/standard_config_windows.yaml b/translator/tocwconfig/sampleConfig/standard_config_windows.yaml index 5895060fb9..4be55d5c8a 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_windows.yaml @@ -40,7 +40,8 @@ processors: - InstanceId - InstanceType middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory @@ -80,11 +81,11 @@ service: - ec2tagger - awsentity/resource receivers: - - telegraf_win_perf_counters/1492679118 - telegraf_win_perf_counters/3610923661 - telegraf_win_perf_counters/3446270237 - telegraf_win_perf_counters/3762679655 - telegraf_win_perf_counters/4283769065 + - telegraf_win_perf_counters/1492679118 telemetry: logs: development: false diff --git a/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml 
b/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml index 4c3b86fd73..7b8feb392a 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml @@ -46,7 +46,8 @@ processors: imds_retries: 2 middleware: agenthealth/statuscode profile: AmazonCloudWatchAgent - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s shared_credential_file: fake-path receivers: telegraf_win_perf_counters/1492679118: @@ -87,11 +88,11 @@ service: - ec2tagger - awsentity/resource receivers: + - telegraf_win_perf_counters/4283769065 + - telegraf_win_perf_counters/1492679118 - telegraf_win_perf_counters/3610923661 - telegraf_win_perf_counters/3446270237 - telegraf_win_perf_counters/3762679655 - - telegraf_win_perf_counters/4283769065 - - telegraf_win_perf_counters/1492679118 telemetry: logs: development: false diff --git a/translator/translate/otel/processor/ec2taggerprocessor/translator.go b/translator/translate/otel/processor/ec2taggerprocessor/translator.go index c92e6962d7..efedb7e757 100644 --- a/translator/translate/otel/processor/ec2taggerprocessor/translator.go +++ b/translator/translate/otel/processor/ec2taggerprocessor/translator.go @@ -65,7 +65,8 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { } cfg.MiddlewareID = &agenthealth.StatusCodeID - cfg.RefreshIntervalSeconds = time.Duration(0) + cfg.RefreshTagsInterval = time.Duration(0) + cfg.RefreshVolumesInterval = 5 * time.Minute cfg.IMDSRetries = retryer.GetDefaultRetryNumber() return cfg, nil diff --git a/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go b/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go index 376f2d862f..0430620163 100644 --- a/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go +++ 
b/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go @@ -41,7 +41,8 @@ func TestTranslator(t *testing.T) { }, }, want: &ec2tagger.Config{ - RefreshIntervalSeconds: 0 * time.Second, + RefreshTagsInterval: 0 * time.Second, + RefreshVolumesInterval: 5 * time.Minute, EC2MetadataTags: []string{"ImageId", "InstanceId", "InstanceType"}, EC2InstanceTagKeys: []string{"AutoScalingGroupName"}, }, @@ -65,7 +66,8 @@ func TestTranslator(t *testing.T) { }, }, want: &ec2tagger.Config{ - RefreshIntervalSeconds: 0 * time.Second, + RefreshTagsInterval: 0 * time.Second, + RefreshVolumesInterval: 5 * time.Minute, EC2MetadataTags: []string{"ImageId", "InstanceId", "InstanceType"}, EC2InstanceTagKeys: []string{"AutoScalingGroupName"}, DiskDeviceTagKey: "device", @@ -82,7 +84,8 @@ func TestTranslator(t *testing.T) { require.NotNil(t, got) gotCfg, ok := got.(*ec2tagger.Config) require.True(t, ok) - require.Equal(t, tc.want.RefreshIntervalSeconds, gotCfg.RefreshIntervalSeconds) + require.Equal(t, tc.want.RefreshTagsInterval, gotCfg.RefreshTagsInterval) + require.Equal(t, tc.want.RefreshVolumesInterval, gotCfg.RefreshVolumesInterval) sort.Strings(gotCfg.EC2MetadataTags) require.Equal(t, tc.want.EC2MetadataTags, gotCfg.EC2MetadataTags) require.Equal(t, tc.want.EC2InstanceTagKeys, gotCfg.EC2InstanceTagKeys) From 760a5c7f80aa2e3b3140b14ce57389d96b339258 Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Mon, 3 Mar 2025 15:26:03 -0500 Subject: [PATCH 2/6] Revert prometheus yamls --- .../sampleConfig/prometheus_combined_config_linux.yaml | 2 +- .../tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml b/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml index c2d0b534e0..0e1772467e 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml +++ 
b/translator/tocwconfig/sampleConfig/prometheus_combined_config_linux.yaml @@ -167,7 +167,7 @@ receivers: enable_http2: true file_sd_configs: - files: - - /var/folders/hv/htgg6_g51hx3hkxl4sdcbw6h0000gq/T/TestCombinedPrometheusConfig3135808568/001/ecs_sd_results.yaml + - {ecsSdFileName} refresh_interval: 5m follow_redirects: true honor_timestamps: true diff --git a/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml b/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml index 4160ec10c3..c39ceb9b77 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/prometheus_otel_config_linux.yaml @@ -78,7 +78,7 @@ receivers: enable_http2: true file_sd_configs: - files: - - /var/folders/hv/htgg6_g51hx3hkxl4sdcbw6h0000gq/T/TestOtelPrometheusConfig4175189513/001/ecs_sd_results.yaml + - {ecsSdFileName} refresh_interval: 5m follow_redirects: true honor_timestamps: true From dbb4a9bed2586043168295f1d5505fedbb9ad1b1 Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Tue, 4 Mar 2025 11:06:24 -0500 Subject: [PATCH 3/6] fix windows unit tests and update readme --- plugins/processors/ec2tagger/README.md | 3 ++- .../tocwconfig/sampleConfig/complete_windows_config.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/plugins/processors/ec2tagger/README.md b/plugins/processors/ec2tagger/README.md index 32777b2df5..2dbecd9a87 100644 --- a/plugins/processors/ec2tagger/README.md +++ b/plugins/processors/ec2tagger/README.md @@ -30,7 +30,8 @@ The IAM User or Role making the calls must have permissions to call the EC2 Desc The following receiver configuration parameters are supported. 
| Name | Description | Supported Value | Default | |--------------------------| ---------------------------------------------------------------------------------------------------------------| -----------------------------------------| --------| -|`refresh_interval_seconds`| is the frequency for the plugin to refresh the EC2 Instance Tags and ebs Volumes associated with this Instance.| "0s" | "0s" | +|`refresh_tags_interval` | is the frequency for the plugin to refresh the EC2 Instance Tags associated with this Instance. | "0s" | "0s" | +|`refresh_volumes_interval`| is the frequency for the plugin to refresh the EBS Volumes associated with this Instance. | "0s" | "0s" | |`ec2_metadata_tags` | is the option to specify which tags to be scraped from IMDS and add to datapoint attributes | ["InstanceId", "ImageId", "InstanceType"]| [] | |`ec2_instance_tag_keys` | is the option to specific which EC2 Instance tags to be scraped associated with this instance. | ["aws:autoscaling:groupName", "Name"] | [] | |`disk_device_tag_key` | is the option to Specify which tags to use to get the specified disk device name from input metric | [] | [] | diff --git a/translator/tocwconfig/sampleConfig/complete_windows_config.yaml b/translator/tocwconfig/sampleConfig/complete_windows_config.yaml index aa4f59301f..1c6e3d15c5 100644 --- a/translator/tocwconfig/sampleConfig/complete_windows_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_windows_config.yaml @@ -129,7 +129,8 @@ processors: - InstanceType imds_retries: 1 middleware: agenthealth/statuscode - refresh_interval_seconds: 0s + refresh_tags_interval: 0s + refresh_volumes_interval: 5m0s transform: error_mode: propagate flatten_data: false From f99a7a128cb3fe444bc5ae3661fa5b79c9c57b7b Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Tue, 4 Mar 2025 11:59:34 -0500 Subject: [PATCH 4/6] Only set 5m interval if VolumeId is set in append_dimensions --- plugins/processors/ec2tagger/ec2tagger.go | 6 +++--- 
.../otel/processor/ec2taggerprocessor/translator.go | 5 +++-- .../otel/processor/ec2taggerprocessor/translator_test.go | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/plugins/processors/ec2tagger/ec2tagger.go b/plugins/processors/ec2tagger/ec2tagger.go index 1d5490b580..568ca1dc13 100644 --- a/plugins/processors/ec2tagger/ec2tagger.go +++ b/plugins/processors/ec2tagger/ec2tagger.go @@ -214,7 +214,7 @@ func (t *Tagger) refreshLoopTags(refreshInterval time.Duration, stopAfterFirstSu // need refresh tags when it is configured and not all ec2 tags are retrieved refreshTags = refreshTags && !allTagsRetrieved if !refreshTags { - t.logger.Info("ec2tagger: Refresh is no longer needed, stop refreshTicker.") + t.logger.Info("ec2tagger: Refresh for tags is no longer needed, stop refreshTicker.") return } } @@ -248,7 +248,7 @@ func (t *Tagger) refreshLoopVolumes(refreshInterval time.Duration, stopAfterFirs // need refresh volumes when it is configured and not all volumes are retrieved refreshVolumes = refreshVolumes && !allVolumesRetrieved if !refreshVolumes { - t.logger.Info("ec2tagger: Refresh is no longer needed, stop refreshTicker.") + t.logger.Info("ec2tagger: Refresh for volumes is no longer needed, stop refreshTicker.") return } } @@ -409,7 +409,7 @@ func (t *Tagger) refreshLoopToUpdateVolumes() { stopAfterFirstSuccess = true refreshInterval = defaultRefreshInterval } else if refreshInterval.Seconds() > 0 { - //customer wants to update the tags with the given refresh interval + //customer wants to update the volumes with the given refresh interval needRefresh = true } diff --git a/translator/translate/otel/processor/ec2taggerprocessor/translator.go b/translator/translate/otel/processor/ec2taggerprocessor/translator.go index efedb7e757..2b46b4dd6b 100644 --- a/translator/translate/otel/processor/ec2taggerprocessor/translator.go +++ b/translator/translate/otel/processor/ec2taggerprocessor/translator.go @@ -59,14 +59,15 @@ func (t *translator) 
Translate(conf *confmap.Conf) (component.Config, error) { } } + cfg.RefreshTagsInterval = time.Duration(0) + cfg.RefreshVolumesInterval = time.Duration(0) if value, ok := common.GetString(conf, common.ConfigKey(common.MetricsKey, common.MetricsCollectedKey, common.DiskKey, common.AppendDimensionsKey, ec2tagger.AttributeVolumeId)); ok && value == ec2tagger.ValueAppendDimensionVolumeId { + cfg.RefreshVolumesInterval = 5 * time.Minute cfg.EBSDeviceKeys = []string{"*"} cfg.DiskDeviceTagKey = "device" } cfg.MiddlewareID = &agenthealth.StatusCodeID - cfg.RefreshTagsInterval = time.Duration(0) - cfg.RefreshVolumesInterval = 5 * time.Minute cfg.IMDSRetries = retryer.GetDefaultRetryNumber() return cfg, nil diff --git a/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go b/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go index 0430620163..b36c89e45a 100644 --- a/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go +++ b/translator/translate/otel/processor/ec2taggerprocessor/translator_test.go @@ -29,7 +29,7 @@ func TestTranslator(t *testing.T) { JsonKey: Ec2taggerKey, }, }, - "FullEc2TaggerProcessorConfig": { + "FullEc2TaggerProcessorNoVolumeConfig": { input: map[string]interface{}{ "metrics": map[string]interface{}{ "append_dimensions": map[string]interface{}{ @@ -42,7 +42,7 @@ func TestTranslator(t *testing.T) { }, want: &ec2tagger.Config{ RefreshTagsInterval: 0 * time.Second, - RefreshVolumesInterval: 5 * time.Minute, + RefreshVolumesInterval: 0 * time.Minute, EC2MetadataTags: []string{"ImageId", "InstanceId", "InstanceType"}, EC2InstanceTagKeys: []string{"AutoScalingGroupName"}, }, From 1d01b21ca811e747a1f31fb6f2f8aa13d5b78ec7 Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Tue, 4 Mar 2025 13:49:09 -0500 Subject: [PATCH 5/6] Fix test yaml --- translator/tocwconfig/sampleConfig/compass_linux_config.yaml | 2 +- translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml | 2 +- 
translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml index e331fb7526..ee85e05747 100644 --- a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml +++ b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml @@ -53,7 +53,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_socket_listener: collection_interval: 10s diff --git a/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml b/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml index cd4c936cc9..aa38947620 100644 --- a/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml +++ b/translator/tocwconfig/sampleConfig/otlp_metrics_config.yaml @@ -45,7 +45,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: otlp/metrics: protocols: diff --git a/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml b/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml index 229ab1b2a8..888fc10d47 100644 --- a/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/otlp_metrics_eks_config.yaml @@ -55,7 +55,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: otlp/metrics: protocols: From 4a46abe74e7f5ebcfd211818f34beaca84558589 Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Tue, 4 Mar 2025 16:23:44 -0500 Subject: [PATCH 6/6] actually fix test yamls --- .../sampleConfig/advanced_config_darwin.yaml | 2 +- .../sampleConfig/advanced_config_linux.yaml | 2 +- 
.../sampleConfig/advanced_config_windows.yaml | 2 +- .../sampleConfig/amp_config_linux.yaml | 2 +- .../sampleConfig/basic_config_linux.yaml | 2 +- .../sampleConfig/basic_config_windows.yaml | 4 +- .../sampleConfig/compass_linux_config.yaml | 2 +- .../sampleConfig/complete_darwin_config.yaml | 2 +- .../sampleConfig/complete_linux_config.yaml | 2 +- .../sampleConfig/complete_windows_config.yaml | 2 +- .../sampleConfig/delta_config_linux.yaml | 2 +- .../sampleConfig/delta_net_config_linux.yaml | 2 +- .../sampleConfig/drop_origin_linux.yaml | 2 +- .../emf_and_kubernetes_with_gpu_config.yaml | 226 +++++++++--------- .../ignore_append_dimensions.yaml | 2 +- .../sampleConfig/invalid_input_linux.yaml | 2 +- .../sampleConfig/standard_config_linux.yaml | 2 +- ...ndard_config_linux_with_common_config.yaml | 2 +- .../sampleConfig/standard_config_windows.yaml | 2 +- ...ard_config_windows_with_common_config.yaml | 2 +- 20 files changed, 133 insertions(+), 133 deletions(-) diff --git a/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml b/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml index 00ffba59b2..afdebc3fae 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_darwin.yaml @@ -52,7 +52,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_cpu: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml b/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml index f18abe731c..353b746af6 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_linux.yaml @@ -52,7 +52,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s 
receivers: telegraf_cpu: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml b/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml index a7de14495d..df47e58553 100644 --- a/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/advanced_config_windows.yaml @@ -42,7 +42,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory diff --git a/translator/tocwconfig/sampleConfig/amp_config_linux.yaml b/translator/tocwconfig/sampleConfig/amp_config_linux.yaml index 7074aa0eb0..b47c7f0c42 100644 --- a/translator/tocwconfig/sampleConfig/amp_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/amp_config_linux.yaml @@ -109,7 +109,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s rollup: attribute_groups: - - ImageId diff --git a/translator/tocwconfig/sampleConfig/basic_config_linux.yaml b/translator/tocwconfig/sampleConfig/basic_config_linux.yaml index 25fb71118d..c8fad9f1ab 100644 --- a/translator/tocwconfig/sampleConfig/basic_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/basic_config_linux.yaml @@ -42,7 +42,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_disk: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/basic_config_windows.yaml b/translator/tocwconfig/sampleConfig/basic_config_windows.yaml index d02635ac7a..f88e998108 100644 --- a/translator/tocwconfig/sampleConfig/basic_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/basic_config_windows.yaml @@ -36,13 +36,13 @@ processors: ec2_instance_tag_keys: - 
AutoScalingGroupName ec2_metadata_tags: + - ImageId - InstanceId - InstanceType - - ImageId imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory diff --git a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml index ee85e05747..aa7462cc31 100644 --- a/translator/tocwconfig/sampleConfig/compass_linux_config.yaml +++ b/translator/tocwconfig/sampleConfig/compass_linux_config.yaml @@ -47,9 +47,9 @@ processors: ec2_instance_tag_keys: - AutoScalingGroupName ec2_metadata_tags: + - InstanceType - ImageId - InstanceId - - InstanceType imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s diff --git a/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml b/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml index cf50e481de..e30fe0e162 100644 --- a/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_darwin_config.yaml @@ -140,7 +140,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s transform: error_mode: propagate flatten_data: false diff --git a/translator/tocwconfig/sampleConfig/complete_linux_config.yaml b/translator/tocwconfig/sampleConfig/complete_linux_config.yaml index 82d4f2298e..7fc6bbb1a2 100644 --- a/translator/tocwconfig/sampleConfig/complete_linux_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_linux_config.yaml @@ -153,7 +153,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s filter/jmx/0: error_mode: propagate logs: {} diff --git a/translator/tocwconfig/sampleConfig/complete_windows_config.yaml 
b/translator/tocwconfig/sampleConfig/complete_windows_config.yaml index 1c6e3d15c5..12cb50e766 100644 --- a/translator/tocwconfig/sampleConfig/complete_windows_config.yaml +++ b/translator/tocwconfig/sampleConfig/complete_windows_config.yaml @@ -130,7 +130,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s transform: error_mode: propagate flatten_data: false diff --git a/translator/tocwconfig/sampleConfig/delta_config_linux.yaml b/translator/tocwconfig/sampleConfig/delta_config_linux.yaml index 581faf943a..9e7d5f23b4 100644 --- a/translator/tocwconfig/sampleConfig/delta_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/delta_config_linux.yaml @@ -52,7 +52,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s transform: error_mode: propagate flatten_data: false diff --git a/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml b/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml index bf49a222eb..accedd3a4b 100644 --- a/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/delta_net_config_linux.yaml @@ -49,7 +49,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_net: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml b/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml index 2ea3508f6f..41aa4c36a6 100644 --- a/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml +++ b/translator/tocwconfig/sampleConfig/drop_origin_linux.yaml @@ -47,7 +47,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s 
transform: error_mode: propagate flatten_data: false diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index fb067d4d4d..16d79a9587 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -706,9 +706,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: container_gpu_temperature + new_name: container_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -717,12 +717,19 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: pod_gpu_temperature + new_name: pod_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -731,12 +738,19 @@ processors: label_value: "" new_label: Type new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_TEMP + include: DCGM_FI_DEV_FB_USED match_type: "" - new_name: node_gpu_temperature + new_name: node_gpu_memory_used operations: - action: add_label aggregation_type: "" @@ -745,12 +759,19 @@ processors: label_value: "" new_label: Type new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: 
"" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: container_gpu_power_draw + new_name: container_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -759,12 +780,19 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: pod_gpu_power_draw + new_name: pod_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -773,12 +801,19 @@ processors: label_value: "" new_label: Type new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_POWER_USAGE + include: DCGM_FI_DEV_FB_TOTAL match_type: "" - new_name: node_gpu_power_draw + new_name: node_gpu_memory_total operations: - action: add_label aggregation_type: "" @@ -787,12 +822,19 @@ processors: label_value: "" new_label: Type new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: container_gpu_utilization + new_name: container_gpu_temperature operations: - action: add_label aggregation_type: "" @@ -804,9 +846,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: pod_gpu_utilization + new_name: pod_gpu_temperature operations: - action: add_label 
aggregation_type: "" @@ -818,9 +860,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_GPU_UTIL + include: DCGM_FI_DEV_GPU_TEMP match_type: "" - new_name: node_gpu_utilization + new_name: node_gpu_temperature operations: - action: add_label aggregation_type: "" @@ -832,9 +874,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: container_gpu_memory_utilization + new_name: container_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -843,19 +885,12 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: pod_gpu_memory_utilization + new_name: pod_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -864,19 +899,12 @@ processors: label_value: "" new_label: Type new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED_PERCENT + include: DCGM_FI_DEV_POWER_USAGE match_type: "" - new_name: node_gpu_memory_utilization + new_name: node_gpu_power_draw operations: - action: add_label aggregation_type: "" @@ -885,19 +913,12 @@ processors: label_value: "" new_label: Type new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 100 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - 
new_name: container_gpu_memory_used + new_name: container_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -906,19 +927,12 @@ processors: label_value: "" new_label: Type new_value: ContainerGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - new_name: pod_gpu_memory_used + new_name: pod_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -927,19 +941,12 @@ processors: label_value: "" new_label: Type new_value: PodGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_USED + include: DCGM_FI_DEV_GPU_UTIL match_type: "" - new_name: node_gpu_memory_used + new_name: node_gpu_utilization operations: - action: add_label aggregation_type: "" @@ -948,19 +955,12 @@ processors: label_value: "" new_label: Type new_value: NodeGPU - - action: experimental_scale_value - aggregation_type: "" - experimental_scale: 1.048576e+06 - label: "" - label_value: "" - new_label: "" - new_value: "" submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: container_gpu_memory_total + new_name: container_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -971,7 +971,7 @@ processors: new_value: ContainerGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" @@ -979,9 +979,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: 
DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: pod_gpu_memory_total + new_name: pod_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -992,7 +992,7 @@ processors: new_value: PodGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" @@ -1000,9 +1000,9 @@ processors: submatch_case: "" - action: insert aggregation_type: "" - include: DCGM_FI_DEV_FB_TOTAL + include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: node_gpu_memory_total + new_name: node_gpu_memory_utilization operations: - action: add_label aggregation_type: "" @@ -1013,12 +1013,19 @@ processors: new_value: NodeGPU - action: experimental_scale_value aggregation_type: "" - experimental_scale: 1.048576e+06 + experimental_scale: 100 label: "" label_value: "" new_label: "" new_value: "" submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_model_shared_scratchpad + match_type: "" + new_name: neuroncore_memory_usage_model_shared_scratchpad + operations: [] + submatch_case: "" - action: update aggregation_type: "" include: neuroncore_memory_usage_runtime_memory @@ -1026,6 +1033,13 @@ processors: new_name: neuroncore_memory_usage_runtime_memory operations: [] submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_tensors + match_type: "" + new_name: neuroncore_memory_usage_tensors + operations: [] + submatch_case: "" - action: update aggregation_type: "" include: neuroncore_utilization_ratio @@ -1042,79 +1056,65 @@ processors: submatch_case: "" - action: update aggregation_type: "" - include: instance_info - match_type: "" - new_name: instance_info - operations: [] - submatch_case: "" - - action: update - aggregation_type: "" - include: hardware_ecc_events_total - match_type: "" - new_name: neurondevice_hw_ecc_events - operations: [] - submatch_case: "" - - action: update - 
aggregation_type: "" - include: execution_status_total + include: neuron_hardware match_type: "" - new_name: neuron_execution_status + new_name: neuron_hardware operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_constants + include: execution_latency_seconds match_type: "" - new_name: neuroncore_memory_usage_constants + new_name: neuron_execution_latency operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_model_code + include: execution_errors_total match_type: "" - new_name: neuroncore_memory_usage_model_code + new_name: neuron_execution_errors operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_model_shared_scratchpad + include: execution_status_total match_type: "" - new_name: neuroncore_memory_usage_model_shared_scratchpad + new_name: neuron_execution_status operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_latency_seconds + include: neuron_runtime_memory_used_bytes match_type: "" - new_name: neuron_execution_latency + new_name: neurondevice_runtime_memory_used_bytes operations: [] submatch_case: "" - action: update aggregation_type: "" - include: execution_errors_total + include: neuroncore_memory_usage_constants match_type: "" - new_name: neuron_execution_errors + new_name: neuroncore_memory_usage_constants operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuron_runtime_memory_used_bytes + include: neuroncore_memory_usage_model_code match_type: "" - new_name: neurondevice_runtime_memory_used_bytes + new_name: neuroncore_memory_usage_model_code operations: [] submatch_case: "" - action: update aggregation_type: "" - include: neuroncore_memory_usage_tensors + include: instance_info match_type: "" - new_name: neuroncore_memory_usage_tensors + new_name: instance_info operations: [] submatch_case: "" - action: update 
aggregation_type: "" - include: neuron_hardware + include: hardware_ecc_events_total match_type: "" - new_name: neuron_hardware + new_name: neurondevice_hw_ecc_events operations: [] submatch_case: "" receivers: diff --git a/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml b/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml index 7423ecedf8..7f31f7919e 100644 --- a/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml +++ b/translator/tocwconfig/sampleConfig/ignore_append_dimensions.yaml @@ -36,7 +36,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_disk: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml b/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml index f0931b2b7f..8ee36bea83 100644 --- a/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml +++ b/translator/tocwconfig/sampleConfig/invalid_input_linux.yaml @@ -42,7 +42,7 @@ processors: imds_retries: 1 middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_disk: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/standard_config_linux.yaml b/translator/tocwconfig/sampleConfig/standard_config_linux.yaml index 3041a2f203..aa33959532 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_linux.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_linux.yaml @@ -51,7 +51,7 @@ processors: - InstanceType middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_cpu: collection_interval: 1m0s diff --git a/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml b/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml index 
b8d06e58fc..d498996f4f 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_linux_with_common_config.yaml @@ -57,7 +57,7 @@ processors: middleware: agenthealth/statuscode profile: AmazonCloudWatchAgent refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s shared_credential_file: fake-path receivers: telegraf_cpu: diff --git a/translator/tocwconfig/sampleConfig/standard_config_windows.yaml b/translator/tocwconfig/sampleConfig/standard_config_windows.yaml index 4be55d5c8a..96d6c2409c 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_windows.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_windows.yaml @@ -41,7 +41,7 @@ processors: - InstanceType middleware: agenthealth/statuscode refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s receivers: telegraf_win_perf_counters/1492679118: alias_name: Memory diff --git a/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml b/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml index 7b8feb392a..0417a1f8e6 100644 --- a/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml +++ b/translator/tocwconfig/sampleConfig/standard_config_windows_with_common_config.yaml @@ -47,7 +47,7 @@ processors: middleware: agenthealth/statuscode profile: AmazonCloudWatchAgent refresh_tags_interval: 0s - refresh_volumes_interval: 5m0s + refresh_volumes_interval: 0s shared_credential_file: fake-path receivers: telegraf_win_perf_counters/1492679118: