Skip to content

Commit

Permalink
Update metrics.md
Browse files Browse the repository at this point in the history
Signed-off-by: torredil <[email protected]>
Co-authored-by: Connor Catlett <[email protected]>
  • Loading branch information
torredil and ConnorJC3 committed Dec 6, 2024
1 parent a445e6f commit 212f289
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 30 deletions.
34 changes: 16 additions & 18 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ $ helm repo add prometheus-community https://prometheus-community.github.io/helm
$ helm repo update
$ helm install prometheus prometheus-community/kube-prometheus-stack
```
2. Enable metrics by setting `enableMetrics: true` in [values.yaml](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/charts/aws-ebs-csi-driver/values.yaml).
2. Enable metrics by configuring `controller.enableMetrics` and `node.enableMetrics`.

3. Deploy EBS CSI Driver:
```sh
Expand All @@ -21,26 +21,24 @@ Installing the Prometheus Operator and enabling metrics will deploy a [Service](

## AWS API Metrics

The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `enableMetrics: true` has been configured in the Helm chart.
The EBS CSI Driver will emit [AWS API](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/OperationList-query.html) metrics to the following TCP endpoint: `0.0.0.0:3301/metrics` if `controller.enableMetrics: true` has been configured in the Helm chart.

The metrics will appear in the following format:
```sh
# HELP aws_ebs_csi_api_request_duration_seconds [ALPHA] Latency of AWS API calls
# TYPE aws_ebs_csi_api_request_duration_seconds histogram
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1
cloudprovider_aws_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1
cloudprovider_aws_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574
cloudprovider_aws_api_request_duration_seconds_count{request="AttachVolume"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.005"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.01"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.025"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.05"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.1"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.25"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="0.5"} 0
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="1"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="2.5"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="5"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="10"} 1
aws_ebs_csi_api_request_duration_seconds_bucket{request="AttachVolume",le="+Inf"} 1
aws_ebs_csi_api_request_duration_seconds_sum{request="AttachVolume"} 0.547694574
aws_ebs_csi_api_request_duration_seconds_count{request="AttachVolume"} 1
...
```

Expand Down
21 changes: 12 additions & 9 deletions pkg/metrics/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ const (
metricReadLatency = namespace + "read_io_latency_seconds"
metricWriteLatency = namespace + "write_io_latency_seconds"
metricCollectorDuration = namespace + "nvme_collector_duration_seconds"

// Conversion factor.
microsecondsInSeconds = 1e6
)

// EBSMetrics represents the parsed metrics from the NVMe log page.
Expand Down Expand Up @@ -144,8 +147,8 @@ func NewNVMECollector(path, instanceID string) *NVMECollector {
metricEC2ExceededIOPS: prometheus.NewDesc(metricEC2ExceededIOPS, "The total time, in seconds, that the EBS volume exceeded the attached Amazon EC2 instance's maximum IOPS performance.", variableLabels, constLabels),
metricEC2ExceededTP: prometheus.NewDesc(metricEC2ExceededTP, "The total time, in seconds, that the EBS volume exceeded the attached Amazon EC2 instance's maximum throughput performance.", variableLabels, constLabels),
metricVolumeQueueLength: prometheus.NewDesc(metricVolumeQueueLength, "The number of read and write operations waiting to be completed.", variableLabels, constLabels),
metricReadLatency: prometheus.NewDesc(metricReadLatency, "The number of read operations completed within each latency bin, in microseconds.", variableLabels, constLabels),
metricWriteLatency: prometheus.NewDesc(metricWriteLatency, "The number of write operations completed within each latency bin, in microseconds.", variableLabels, constLabels),
metricReadLatency: prometheus.NewDesc(metricReadLatency, "The number of read operations completed within each latency bin, in seconds.", variableLabels, constLabels),
metricWriteLatency: prometheus.NewDesc(metricWriteLatency, "The number of write operations completed within each latency bin, in seconds.", variableLabels, constLabels),
},
// Clean CSI mount point path to normalize path
// Add trailing slash back that Clean prunes
Expand Down Expand Up @@ -235,12 +238,12 @@ func (c *NVMECollector) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteOps], prometheus.CounterValue, float64(metrics.WriteOps), volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricReadBytes], prometheus.CounterValue, float64(metrics.ReadBytes), volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteBytes], prometheus.CounterValue, float64(metrics.WriteBytes), volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricReadOpsSeconds], prometheus.CounterValue, float64(metrics.TotalReadTime)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteOpsSeconds], prometheus.CounterValue, float64(metrics.TotalWriteTime)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededIOPS], prometheus.CounterValue, float64(metrics.EBSIOPSExceeded)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededTP], prometheus.CounterValue, float64(metrics.EBSThroughputExceeded)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededIOPS], prometheus.CounterValue, float64(metrics.EC2IOPSExceeded)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededTP], prometheus.CounterValue, float64(metrics.EC2ThroughputExceeded)/1e6, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricReadOpsSeconds], prometheus.CounterValue, float64(metrics.TotalReadTime)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricWriteOpsSeconds], prometheus.CounterValue, float64(metrics.TotalWriteTime)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededIOPS], prometheus.CounterValue, float64(metrics.EBSIOPSExceeded)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricExceededTP], prometheus.CounterValue, float64(metrics.EBSThroughputExceeded)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededIOPS], prometheus.CounterValue, float64(metrics.EC2IOPSExceeded)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricEC2ExceededTP], prometheus.CounterValue, float64(metrics.EC2ThroughputExceeded)/microsecondsInSeconds, volumeID)
ch <- prometheus.MustNewConstMetric(c.metrics[metricVolumeQueueLength], prometheus.GaugeValue, float64(metrics.QueueLength), volumeID)

// Read Latency Histogram
Expand Down Expand Up @@ -272,7 +275,7 @@ func convertHistogram(hist Histogram) (uint64, map[float64]uint64) {

for i := uint64(0); i < hist.BinCount && i < 64; i++ {
count += hist.Bins[i].Count
buckets[float64(hist.Bins[i].Upper)] = count
buckets[float64(hist.Bins[i].Upper)/microsecondsInSeconds] = count
}

return count, buckets
Expand Down
6 changes: 3 additions & 3 deletions pkg/metrics/nvme_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ func TestConvertHistogram(t *testing.T) {
},
wantCount: 10,
wantBuckets: map[float64]uint64{
100: 5,
200: 8,
300: 10,
100 / 1e6: 5,
200 / 1e6: 8,
300 / 1e6: 10,
},
},
}
Expand Down

0 comments on commit 212f289

Please sign in to comment.