Skip to content

Commit

Permalink
Merge branch 'microsoft:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
kumarvin123 authored Jan 17, 2025
2 parents fcc625d + a3088c8 commit 0f4b15a
Show file tree
Hide file tree
Showing 27 changed files with 461 additions and 300 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
branches: [main]
workflow_dispatch:

permissions:
actions: read
contents: read
Expand All @@ -15,6 +16,7 @@ permissions:
pull-requests: write
security-events: write
issues: write

jobs:
test-image:
runs-on: ubuntu-latest
Expand All @@ -32,8 +34,9 @@ jobs:
PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
run: |
make test-image IMAGE_NAMESPACE=${{ github.repository }} PLATFORM=linux/amd64
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
name: coverage-files
path: ./coverage*
path: ./output/coverage*
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,5 @@ image-metadata-*.json
*results*.json
netperf-*.json
netperf-*.csv

.certs/
3 changes: 0 additions & 3 deletions .golangci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@ issues:
- path: pkg/metrics/types_windows.go
linters:
- revive
- mnd
- var-naming
- path: pkg/metrics/types_linux.go
linters:
- revive
- mnd
- var-naming
linters:
presets:
Expand All @@ -31,7 +29,6 @@ linters:
- gocritic
- gocyclo
- gofmt
- mnd
- goprintffuncname
- gosimple
- lll
Expand Down
2 changes: 1 addition & 1 deletion .pipelines/cg-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ stages:
artifactName: $(BUILDER_ADO_ARTIFACTE_NAME) # Replace with your artifact name
itemPattern: "**/*builder*.tar"
downloadPath: '$(Pipeline.Workspace)\artifacts'

- task: PowerShell@2
displayName: "Load Builder Image"
inputs:
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ container-docker: buildx # util target to build container images using docker bu
image_metadata_filename="image-metadata-$$image_name-$(TAG).json"; \
touch $$image_metadata_filename; \
echo "Building $$image_name for $$os/$$arch "; \
mkdir -p $(OUTPUT_DIR); \
docker buildx build \
--platform $(PLATFORM) \
--metadata-file=$$image_metadata_filename \
Expand All @@ -253,6 +254,7 @@ container-docker: buildx # util target to build container images using docker bu
--build-arg VERSION=$(VERSION) $(EXTRA_BUILD_ARGS) \
--target=$(TARGET) \
-t $(IMAGE_REGISTRY)/$(IMAGE):$(TAG) \
--output type=local,dest=$(OUTPUT_DIR) \
$(BUILDX_ACTION) \
$(CONTEXT_DIR)

Expand Down Expand Up @@ -549,6 +551,9 @@ get-certs:
hubble config set tls true
hubble config set tls-server-name instance.hubble-relay.cilium.io

# escape_dot: one-argument helper, meant to be expanded with $(call escape_dot,TEXT).
# Replaces every '.' in $(1) with '\.'
# Example: $(call escape_dot,v1.2.3) expands to v1\.2\.3
# Defined with '=' (recursive expansion) so $(1) is resolved at each call, not at definition time.
# NOTE(review): presumably used to make a string safe inside a regex/sed pattern -- confirm against callers.
escape_dot = $(subst .,\.,$(1))

.PHONY: clean-certs
clean-certs:
rm -rf $(CERT_DIR)
Expand All @@ -564,7 +569,7 @@ docs:
echo $(PWD)
docker run -it -p 3000:3000 -v $(PWD):/retina -w /retina/ node:20-alpine sh ./site/start-dev.sh

.PHONY: docs-pod
.PHONY: docs-prod
# docs-prod: install the docs site's npm dependencies and run its production
# build inside a node:20-alpine container, with the repository mounted at
# /retina and used as the working directory.
# Both npm commands are passed to a single `sh -c` so that the `&&` is
# evaluated inside the container. Left unquoted, the host shell splits the
# line at `&&` and runs `npm run build` on the host after the container exits.
docs-prod:
	docker run -i -p 3000:3000 -v $(PWD):/retina -w /retina/ node:20-alpine sh -c 'npm install --prefix site && npm run build --prefix site'

Expand Down
3 changes: 2 additions & 1 deletion cli/cmd/capture/download.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ import (

const BlobURL = "BLOB_URL"

var ErrEmptyBlobURL = errors.New("BLOB_URL must be set/exported")
var ErrEmptyBlobURL = errors.Errorf("%s environment variable is empty. It must be set/exported", BlobURL)

var downloadCapture = &cobra.Command{
Use: "download",
Short: "Download Retina Captures",
RunE: func(*cobra.Command, []string) error {
viper.AutomaticEnv()
blobURL := viper.GetString(BlobURL)
if blobURL == "" {
return ErrEmptyBlobURL
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ data:
metricsInterval: {{ .Values.metricsInterval }}
metricsIntervalDuration: {{ .Values.metricsIntervalDuration }}
enableTelemetry: {{ .Values.enableTelemetry }}
enablePodLevel: {{ .Values.enablePodLevel }}
enablePodLevel: false
remoteContext: {{ .Values.remoteContext }}
bypassLookupIPOfInterest: {{ .Values.bypassLookupIPOfInterest }}
{{- end}}
Expand Down
2 changes: 1 addition & 1 deletion deploy/hubble/manifests/controller/helm/retina/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ logLevel: info
enabledPlugin_linux: '["linuxutil","packetforward","packetparser","dns", "dropreason"]'
enabledPlugin_win: '["hnsstats"]'

enableTelemetry: true
enableTelemetry: false

# Interval, in duration, to scrape/publish metrics.
metricsIntervalDuration: "10s"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,13 @@ spec:
containerPort: {{ .Values.retinaPort }}
workingDir: $env:CONTAINER_SANDBOX_MOUNT_POINT
command:
- controller.exe --config ./retina/config.yaml
- powershell.exe
- -command
{{- if semverCompare ">=1.28" .Capabilities.KubeVersion.GitVersion }}
- $env:CONTAINER_SANDBOX_MOUNT_POINT/controller.exe --config ./retina/config.yaml
{{- else }}
- .\setkubeconfigpath.ps1; ./controller.exe --config ./retina/config.yaml --kubeconfig ./kubeconfig
{{- end }}
env:
- name: POD_NAME
valueFrom:
Expand Down
4 changes: 3 additions & 1 deletion docs/02-Installation/01-Setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ Note: you can also run captures with just the [CLI](./02-CLI.md).

## Installation

Requires Helm version >= v3.8.0.
### Requirements

- Helm version >= v3.8.0.

### Basic Mode

Expand Down
59 changes: 47 additions & 12 deletions docs/02-Installation/03-Config.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,60 @@

## Overview

To customize metrics and other options, modify the `retina-config` ConfigMap. Default settings for each component are specified in *deploy/legacy/manifests/controller/helm/retina/values.yaml*.
### Default Configuration

## Agent Config
Default settings for each component are specified in the [Values file](../../deploy/legacy/manifests/controller/helm/retina/values.yaml).

### Deployed Configuration

The configuration of an active Retina deployment can be seen in the `retina-config` and `retina-operator-config` ConfigMaps.

```shell
kubectl get configmap retina-config -n kube-system -o yaml
kubectl get configmap retina-operator-config -n kube-system -o yaml
```

### Updating Configuration

If the Retina installation was done via Helm, configuration updates should be done via `helm upgrade` defining the specific attribute name and value as part of the command.

The example below enables gathering of advanced pod-level metrics.

```shell
VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
--version $VERSION \
--namespace kube-system \
--set image.tag=$VERSION \
--set operator.tag=$VERSION \
--set logLevel=info \
--set enabledPlugin_linux="\[dropreason\,packetforward\,linuxutil\,dns\]" \
--set enablePodLevel=true
```

## General Configuration

These settings apply to both the Agent and the Operator.

* `enableTelemetry`: Enables telemetry for the agent for managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
* `remoteContext`: Enables Retina to watch Pods on the cluster.
* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `enableRetinaEndpoint` to be enabled.
* `enabledPlugin`: List of enabled plugins.

## Agent Configuration

* `logLevel`: Define the level of logs to store.
* `enabledPlugin_linux`: List of enabled plugins.
* `metricsInterval`: Interval for gathering metrics (in seconds). (@deprecated, use `metricsIntervalDuration` instead)
* `metricsIntervalDuration`: Interval for gathering metrics (in `time.Duration`).
* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
* `enableConntrackMetrics`: Enables conntrack metrics for packets and bytes forwarded/received.
* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `operator.enableRetinaEndpoint` to be enabled.
* `bypassLookupIPOfInterest`: If true, plugins like `packetparser` and `dropreason` will bypass IP lookup, generating an event for each packet regardless. `enableAnnotations` will not work if this is true.
* `dataAggregationLevel`: Defines the level of data aggregation for Retina. See [Data Aggregation](../05-Concepts/data-aggregation.md) for more details.

## Operator Config
## Operator Configuration

* `installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
* `enableTelemetry`: Enables telemetry for the operator in managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
* `captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to *pkg/capture/utils/capture_image.go* for details on how the debug capture image version is selected.
* `captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
* `enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
* `enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
* `operator.installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
* `operator.enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
* `capture.captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to [Capture Image file](../../pkg/capture/utils/capture_image.go) for details on how the debug capture image version is selected.
* `capture.captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
* `capture.enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
9 changes: 7 additions & 2 deletions docs/02-Installation/04-prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b

1. Create a Kubernetes cluster.
2. Install Retina DaemonSet (see [Quick Installation](./01-Setup.md)).
3. Clone [Retina Repository](https://github.com/microsoft/retina) or download [Prometheus Values File](../../deploy/legacy/prometheus/values.yaml).

## Install Prometheus via Helm

Expand All @@ -19,13 +20,17 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b
1. Install the Prometheus chart

```shell
helm install prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
# The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
helm install prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
```

Or, if you already have the chart installed, upgrade it as you see fit, providing the new job name as an additional scrape config, for example:

```shell
helm upgrade prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
# The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
helm upgrade prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
```

> Note: Grafana and kube-state-metrics may be scheduled on Windows nodes, because the current chart doesn't define node affinity for those components. Some manual intervention may be required.
Expand Down
12 changes: 12 additions & 0 deletions docs/03-Metrics/01-metrics-intro.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Metrics

The Prometheus metrics available depend on which Retina control plane is deployed.

## Control Planes

There are two control planes used in the Retina project: Hubble and the legacy control plane. Both control planes create metrics and traces which are generated by the eBPF data plane, which has a single implementation. Only one control plane should be deployed at a given time. Helm charts for the deployment are found under `deploy/hubble/manifests/controller/helm/retina` and `deploy/legacy/manifests/controller/helm/retina`.

1. [Hubble metrics](./02-hubble_metrics.md)
2. [Legacy metrics](./modes/modes.md)

> Note: Hubble offers additional features and metrics that the legacy control plane does not support. The plan is to deprecate the legacy control plane in favor of Hubble. For further documentation on Hubble, see the [Cilium/Hubble repository](https://github.com/cilium/hubble/?tab=readme-ov-file#features) and the official [Hubble metrics documentation](https://docs.cilium.io/en/stable/observability/metrics/#hubble-metrics).
53 changes: 53 additions & 0 deletions docs/03-Metrics/02-hubble_metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Hubble Metrics

When Retina is deployed with the Hubble control plane, the metrics include both Node-level and Pod-level metrics. Metrics are stored in Prometheus format and can be viewed in Grafana.

## Metrics

* Node-Level Metrics: These metrics provide insights into traffic volume, dropped packets, number of connections, etc. by node.
* Hubble Metrics (DNS and Pod-Level Metrics): These metrics include source and destination pod information, making it possible to pinpoint network-related issues at a granular level. Metrics cover traffic volume, dropped packets, TCP resets, L4/L7 packet flows, etc. DNS metrics include DNS errors and DNS requests missing responses.

### Node-Level Metrics

The following metrics are aggregated per node. All metrics include labels:

* `cluster`
* `instance` (Node name)

Retina provides metrics for both Linux and Windows operating systems.
The table below outlines the different metrics generated.

| Metric Name | Description | Extra Labels | Linux | Windows |
|------------------------------------------------|-------------|--------------|-------|---------|
| **networkobservability_forward_count** | Total forwarded packet count | `direction` |||
| **networkobservability_forward_bytes** | Total forwarded byte count | `direction` |||
| **networkobservability_drop_count** | Total dropped packet count | `direction`, `reason` |||
| **networkobservability_drop_bytes** | Total dropped byte count | `direction`, `reason` |||
| **networkobservability_tcp_state** | TCP currently active socket count by TCP state. | `state` |||
| **networkobservability_tcp_connection_remote** | TCP currently active socket count by remote IP/port. | `address` (IP), `port` |||
| **networkobservability_tcp_connection_stats** | TCP connection statistics. (ex: Delayed ACKs, TCPKeepAlive, TCPSackFailures) | `statistic` |||
| **networkobservability_tcp_flag_counters** | TCP packets count by flag. | `flag` |||
| **networkobservability_ip_connection_stats** | IP connection statistics. | `statistic` |||
| **networkobservability_udp_connection_stats** | UDP connection statistics. | `statistic` |||
| **networkobservability_udp_active_sockets** | UDP currently active socket count | |||
| **networkobservability_interface_stats** | Interface statistics. | `InterfaceName`, `statistic` |||

### Pod-Level Metrics (Hubble Metrics)

The following metrics are aggregated per pod (node information is preserved). All metrics include labels:

* `cluster`
* `instance` (Node name)
* `source`
* `destination`

For *outgoing traffic*, there will be a `source` label with source pod namespace/name.
For *incoming traffic*, there will be a `destination` label with destination pod namespace/name.

| Metric Name | Description | Extra Labels | Linux | Windows |
|----------------------------------|------------------------------|-----------------------|-------|---------|
| **hubble_dns_queries_total** | Total DNS requests by query | `source` or `destination`, `query`, `qtypes` (query type) |||
| **hubble_dns_responses_total** | Total DNS responses by query/response | `source` or `destination`, `query`, `qtypes` (query type), `rcode` (return code), `ips_returned` (number of IPs) |||
| **hubble_drop_total** | Total dropped packet count | `source` or `destination`, `protocol`, `reason` |||
| **hubble_tcp_flags_total** | Total TCP packets count by flag. | `source` or `destination`, `flag` |||
| **hubble_flows_processed_total** | Total network flows processed (L4/L7 traffic) | `source` or `destination`, `protocol`, `verdict`, `type`, `subtype` |||
4 changes: 2 additions & 2 deletions docs/03-Metrics/modes/modes.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
sidebar_position: 1
sidebar_position: 2
---
# Metric Modes
# Legacy Metric Modes

Retina provides **three modes** with their own metrics and scale capabilities.
Each mode is **fully customizable** (only create the metrics/labels you need).
Expand Down
Loading

0 comments on commit 0f4b15a

Please sign in to comment.