Skip to content

Commit

Permalink
metrics: introduce client config to include alloc metadata as part of…
Browse files Browse the repository at this point in the history
… the base labels
  • Loading branch information
mvegter committed Sep 27, 2024
1 parent ec42aa2 commit 0ef9371
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .changelog/23964.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
metrics: introduce client config to include alloc metadata as part of the base labels
```
13 changes: 13 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,19 @@ func (tr *TaskRunner) initLabels() {
},
}

if tr.clientConfig.IncludeAllocMetadataInMetrics {
for meta, metaValue := range tr.task.Meta {
if len(tr.clientConfig.AllowedMetadataKeysInMetrics) > 0 && !slices.Contains(tr.clientConfig.AllowedMetadataKeysInMetrics, meta) {
continue
}

tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: strings.ReplaceAll(meta, "-", "_"),
Value: metaValue,
})
}
}

if tr.alloc.Job.ParentID != "" {
tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: "parent_id",
Expand Down
34 changes: 34 additions & 0 deletions client/allocrunner/taskrunner/task_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2866,6 +2866,40 @@ func TestTaskRunner_BaseLabels(t *testing.T) {
require.Equal(alloc.Namespace, labels["namespace"])
}

// TestTaskRunner_BaseLabels_IncludesAllocMetadata asserts that, when
// IncludeAllocMetadataInMetrics is enabled, the task runner's base labels
// contain only the metadata keys listed in AllowedMetadataKeysInMetrics, and
// that dashes in those keys are rewritten to underscores in the label names.
func TestTaskRunner_BaseLabels_IncludesAllocMetadata(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	alloc.Namespace = "not-default"
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]any{
		"command": "whoami",
	}
	// The label loop in initLabels ranges over the task's Meta map, so the
	// metadata under test is attached to the task itself.
	task.Meta = map[string]string{
		"owner":              "HashiCorp",
		"my-key":             "my-value",
		"some_dynamic_value": "now()",
	}

	trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name, nil)
	defer cleanup()

	trConfig.ClientConfig.IncludeAllocMetadataInMetrics = true
	trConfig.ClientConfig.AllowedMetadataKeysInMetrics = []string{"owner", "my-key"}

	tr, err := NewTaskRunner(trConfig)
	must.NoError(t, err)

	labels := make(map[string]string, len(tr.baseLabels))
	for _, label := range tr.baseLabels {
		labels[label.Name] = label.Value
	}

	// Allowed keys are present; "my-key" is published under its sanitized
	// name "my_key" and never under the raw, dashed name.
	must.Eq(t, "HashiCorp", labels["owner"])
	must.Eq(t, "my-value", labels["my_key"])
	must.MapNotContainsKey(t, labels, "my-key")

	// Keys outside the allow list must be absent. Indexing the map would
	// yield the zero-value string, which is never nil, so check for the key.
	must.MapNotContainsKey(t, labels, "some_dynamic_value")
}

// TestTaskRunner_IdentityHook_Enabled asserts that the identity hook exposes a
// workload identity to a task.
func TestTaskRunner_IdentityHook_Enabled(t *testing.T) {
Expand Down
8 changes: 8 additions & 0 deletions client/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,14 @@ type Config struct {
// allocation metrics to remote Telemetry sinks
PublishAllocationMetrics bool

// IncludeAllocMetadataInMetrics determines whether nomad should include the
// allocation metadata as labels in the metrics to remote Telemetry sinks
IncludeAllocMetadataInMetrics bool

// AllowedMetadataKeysInMetrics, when non-empty, restricts the metadata
// labels added to metrics sent to remote Telemetry sinks to the listed keys
AllowedMetadataKeysInMetrics []string

// TLSConfig holds various TLS related configurations
TLSConfig *structsc.TLSConfig

Expand Down
2 changes: 2 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) {
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics
conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics
conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics

// Set the TLS related configs
conf.TLSConfig = agentConfig.TLSConfig
Expand Down
32 changes: 21 additions & 11 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -969,17 +969,19 @@ type Telemetry struct {
InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"`
inMemoryRetentionPeriod time.Duration `hcl:"-"`

StatsiteAddr string `hcl:"statsite_address"`
StatsdAddr string `hcl:"statsd_address"`
DataDogAddr string `hcl:"datadog_address"`
DataDogTags []string `hcl:"datadog_tags"`
PrometheusMetrics bool `hcl:"prometheus_metrics"`
DisableHostname bool `hcl:"disable_hostname"`
UseNodeName bool `hcl:"use_node_name"`
CollectionInterval string `hcl:"collection_interval"`
collectionInterval time.Duration `hcl:"-"`
PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"`
PublishNodeMetrics bool `hcl:"publish_node_metrics"`
StatsiteAddr string `hcl:"statsite_address"`
StatsdAddr string `hcl:"statsd_address"`
DataDogAddr string `hcl:"datadog_address"`
DataDogTags []string `hcl:"datadog_tags"`
PrometheusMetrics bool `hcl:"prometheus_metrics"`
DisableHostname bool `hcl:"disable_hostname"`
UseNodeName bool `hcl:"use_node_name"`
CollectionInterval string `hcl:"collection_interval"`
collectionInterval time.Duration `hcl:"-"`
PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"`
PublishNodeMetrics bool `hcl:"publish_node_metrics"`
IncludeAllocMetadataInMetrics bool `hcl:"include_alloc_metadata_in_metrics"`
AllowedMetadataKeysInMetrics []string `hcl:"allowed_metadata_keys_in_metrics"`

// PrefixFilter allows for filtering out metrics from being collected
PrefixFilter []string `hcl:"prefix_filter"`
Expand Down Expand Up @@ -1343,6 +1345,8 @@ func DevConfig(mode *devModeConfig) *Config {
conf.Telemetry.PrometheusMetrics = true
conf.Telemetry.PublishAllocationMetrics = true
conf.Telemetry.PublishNodeMetrics = true
conf.Telemetry.IncludeAllocMetadataInMetrics = true
conf.Telemetry.AllowedMetadataKeysInMetrics = []string{}

if mode.consulMode {
conf.Consuls[0].ServiceIdentity = &config.WorkloadIdentityConfig{
Expand Down Expand Up @@ -2524,6 +2528,12 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
if b.PublishAllocationMetrics {
result.PublishAllocationMetrics = true
}
if b.IncludeAllocMetadataInMetrics {
result.IncludeAllocMetadataInMetrics = true
}
if b.AllowedMetadataKeysInMetrics != nil {
result.AllowedMetadataKeysInMetrics = b.AllowedMetadataKeysInMetrics
}
if b.CirconusAPIToken != "" {
result.CirconusAPIToken = b.CirconusAPIToken
}
Expand Down
9 changes: 9 additions & 0 deletions website/content/docs/configuration/telemetry.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ The following options are available on all telemetry configurations.
- `publish_allocation_metrics` `(bool: false)` - Specifies if Nomad should
publish runtime metrics of allocations.

- `include_alloc_metadata_in_metrics` `(bool: false)` - Specifies if Nomad
should add allocation metadata to the metric labels. Dashes (`-`) in metadata
keys are replaced with underscores (`_`) in the label names. Defaults to
`false` because enabling this risks creating high-cardinality labels; it is
therefore recommended to also configure
[allowed_metadata_keys_in_metrics](#allowed_metadata_keys_in_metrics).

- `allowed_metadata_keys_in_metrics` `(list: [])` - Specifies which metadata
keys are added as metric labels when `include_alloc_metadata_in_metrics` is
enabled. If the list is empty (the default), no filtering is applied and all
metadata keys are included.

- `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish
runtime metrics of nodes.

Expand Down

0 comments on commit 0ef9371

Please sign in to comment.