Merge branch 'microsoft:main' into main

microsoft · Jan 17, 2025 · 0f4b15a · 0f4b15a
2 parents fcc625d + a3088c8
commit 0f4b15a
Show file tree

Hide file tree

Showing 27 changed files with 461 additions and 300 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -7,6 +7,7 @@ on:
   pull_request:
     branches: [main]
   workflow_dispatch:
+
 permissions:
   actions: read
   contents: read
@@ -15,6 +16,7 @@ permissions:
   pull-requests: write
   security-events: write
   issues: write
+
 jobs:
   test-image:
     runs-on: ubuntu-latest
@@ -32,8 +34,9 @@ jobs:
           PULL_REQUEST_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           make test-image IMAGE_NAMESPACE=${{ github.repository }} PLATFORM=linux/amd64
+
       - name: Upload Artifacts
         uses: actions/upload-artifact@v4
         with:
           name: coverage-files
-          path: ./coverage*
+          path: ./output/coverage*
diff --git a/.gitignore b/.gitignore
@@ -45,3 +45,5 @@ image-metadata-*.json
 *results*.json
 netperf-*.json
 netperf-*.csv
+
+.certs/
diff --git a/.golangci.yaml b/.golangci.yaml
@@ -7,12 +7,10 @@ issues:
     - path: pkg/metrics/types_windows.go
       linters:
         - revive
-        - mnd
         - var-naming
     - path: pkg/metrics/types_linux.go
       linters:
         - revive
-        - mnd
         - var-naming
 linters:
   presets: 
@@ -31,7 +29,6 @@ linters:
   - gocritic
   - gocyclo
   - gofmt
-  - mnd
   - goprintffuncname
   - gosimple
   - lll

diff --git a/.pipelines/cg-pipeline.yaml b/.pipelines/cg-pipeline.yaml
@@ -224,7 +224,7 @@ stages:
               artifactName: $(BUILDER_ADO_ARTIFACTE_NAME) # Replace with your artifact name
               itemPattern: "**/*builder*.tar"
               downloadPath: '$(Pipeline.Workspace)\artifacts'
-
+              
           - task: PowerShell@2
             displayName: "Load Builder Image"
             inputs:

diff --git a/Makefile b/Makefile
@@ -241,6 +241,7 @@ container-docker: buildx # util target to build container images using docker bu
 	image_metadata_filename="image-metadata-$$image_name-$(TAG).json"; \
 	touch $$image_metadata_filename; \
 	echo "Building $$image_name for $$os/$$arch "; \
+	mkdir -p $(OUTPUT_DIR); \
 	docker buildx build \
 		--platform $(PLATFORM) \
 		--metadata-file=$$image_metadata_filename \
@@ -253,6 +254,7 @@ container-docker: buildx # util target to build container images using docker bu
 		--build-arg VERSION=$(VERSION) $(EXTRA_BUILD_ARGS) \
 		--target=$(TARGET) \
 		-t $(IMAGE_REGISTRY)/$(IMAGE):$(TAG) \
+		--output type=local,dest=$(OUTPUT_DIR) \
 		$(BUILDX_ACTION) \
 		$(CONTEXT_DIR) 
 
@@ -549,6 +551,9 @@ get-certs:
 	hubble config set tls true
 	hubble config set tls-server-name instance.hubble-relay.cilium.io
 
+# Replaces every '.' in $(1) with '\.'
+escape_dot = $(subst .,\.,$(1))
+
 .PHONY: clean-certs
 clean-certs:
 	rm -rf $(CERT_DIR)
@@ -564,7 +569,7 @@ docs:
 	echo $(PWD)
 	docker run -it -p 3000:3000 -v $(PWD):/retina -w /retina/ node:20-alpine sh ./site/start-dev.sh
 
-.PHONY: docs-pod
+.PHONY: docs-prod
 docs-prod:
 	docker run -i -p 3000:3000 -v $(PWD):/retina -w /retina/ node:20-alpine npm install --prefix site && npm run build --prefix site
 

diff --git a/cli/cmd/capture/download.go b/cli/cmd/capture/download.go
@@ -18,12 +18,13 @@ import (
 
 const BlobURL = "BLOB_URL"
 
-var ErrEmptyBlobURL = errors.New("BLOB_URL must be set/exported")
+var ErrEmptyBlobURL = errors.Errorf("%s environment variable is empty. It must be set/exported", BlobURL)
 
 var downloadCapture = &cobra.Command{
 	Use:   "download",
 	Short: "Download Retina Captures",
 	RunE: func(*cobra.Command, []string) error {
+		viper.AutomaticEnv()
 		blobURL := viper.GetString(BlobURL)
 		if blobURL == "" {
 			return ErrEmptyBlobURL

diff --git a/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml b/deploy/hubble/manifests/controller/helm/retina/templates/agent/configmap.yaml
@@ -132,7 +132,7 @@ data:
     metricsInterval: {{ .Values.metricsInterval }}
     metricsIntervalDuration: {{ .Values.metricsIntervalDuration }}
     enableTelemetry: {{ .Values.enableTelemetry }}
-    enablePodLevel: {{ .Values.enablePodLevel }}
+    enablePodLevel: false
     remoteContext: {{ .Values.remoteContext }}
     bypassLookupIPOfInterest: {{ .Values.bypassLookupIPOfInterest }}
 {{- end}}

diff --git a/deploy/hubble/manifests/controller/helm/retina/values.yaml b/deploy/hubble/manifests/controller/helm/retina/values.yaml
@@ -90,7 +90,7 @@ logLevel: info
 enabledPlugin_linux: '["linuxutil","packetforward","packetparser","dns", "dropreason"]'
 enabledPlugin_win: '["hnsstats"]'
 
-enableTelemetry: true
+enableTelemetry: false
 
 # Interval, in duration, to scrape/publish metrics.
 metricsIntervalDuration: "10s"

diff --git a/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml b/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml
@@ -203,7 +203,13 @@ spec:
             containerPort: {{ .Values.retinaPort }}
           workingDir: $env:CONTAINER_SANDBOX_MOUNT_POINT
           command:
-            - controller.exe --config ./retina/config.yaml 
+            - powershell.exe
+            - -command
+            {{- if semverCompare ">=1.28" .Capabilities.KubeVersion.GitVersion }}
+            - $env:CONTAINER_SANDBOX_MOUNT_POINT/controller.exe --config ./retina/config.yaml
+            {{- else }}
+            - .\setkubeconfigpath.ps1; ./controller.exe --config ./retina/config.yaml --kubeconfig ./kubeconfig
+            {{- end }}
           env:
           - name: POD_NAME
             valueFrom:

diff --git a/docs/02-Installation/01-Setup.md b/docs/02-Installation/01-Setup.md
@@ -6,7 +6,9 @@ Note: you can also run captures with just the [CLI](./02-CLI.md).
 
 ## Installation
 
-Requires Helm version >= v3.8.0.
+### Requirements
+
+- Helm version >= v3.8.0.
 
 ### Basic Mode
 

diff --git a/docs/02-Installation/03-Config.md b/docs/02-Installation/03-Config.md
@@ -2,25 +2,60 @@
 
 ## Overview
 
-To customize metrics and other options, modify the `retina-config` ConfigMap. Default settings for each component are specified in *deploy/legacy/manifests/controller/helm/retina/values.yaml*.
+### Default Configuration
 
-## Agent Config
+Default settings for each component are specified in [Values file](../../deploy/legacy/manifests/controller/helm/retina/values.yaml).
+
+### Deployed Configuration
+
+Configuration of an active Retina deployment can be seen in `retina-config` and `retina-operator-config` configmaps.
+
+```shell
+kubectl get configmap retina-config -n kube-system -o yaml
+kubectl get configmap retina-operator-config -n kube-system -o yaml
+```
+
+### Updating Configuration
+
+If the Retina installation was done via Helm, configuration updates should be done via `helm upgrade` defining the specific attribute name and value as part of the command.
+
+The example below enables gathering of advanced pod-level metrics.
+
+```shell
+VERSION=$( curl -sL https://api.github.com/repos/microsoft/retina/releases/latest | jq -r .name)
+helm upgrade --install retina oci://ghcr.io/microsoft/retina/charts/retina \
+    --version $VERSION \
+    --namespace kube-system \
+    --set image.tag=$VERSION \
+    --set operator.tag=$VERSION \
+    --set logLevel=info \
+    --set enabledPlugin_linux="\[dropreason\,packetforward\,linuxutil\,dns\]" \
+    --set enablePodLevel=true
+```
+
+## General Configuration
+
+These settings apply to both the Agent and the Operator.
 
 * `enableTelemetry`: Enables telemetry for the agent for managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
-* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
 * `remoteContext`: Enables Retina to watch Pods on the cluster.
-* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `enableRetinaEndpoint` to be enabled.
-* `enabledPlugin`: List of enabled plugins.
+
+## Agent Configuration
+
+* `logLevel`: Define the level of logs to store.
+* `enabledPlugin_linux`: List of enabled plugins.
 * `metricsInterval`: Interval for gathering metrics (in seconds). (@deprecated, use `metricsIntervalDuration` instead)
 * `metricsIntervalDuration`: Interval for gathering metrics (in `time.Duration`).
+* `enablePodLevel`: Enables gathering of advanced pod-level metrics, attaching pods' metadata to Retina's metrics.
+* `enableConntrackMetrics`: Enables conntrack metrics for packets and bytes forwarded/received.
+* `enableAnnotations`: Enables gathering of metrics for annotated resources. Resources can be annotated with `retina.sh=observe`. Requires the operator and `operator.enableRetinaEndpoint` to be enabled.
 * `bypassLookupIPOfInterest`: If true, plugins like `packetparser` and `dropreason` will bypass IP lookup, generating an event for each packet regardless. `enableAnnotations` will not work if this is true.
 * `dataAggregationLevel`: Defines the level of data aggregation for Retina. See [Data Aggregation](../05-Concepts/data-aggregation.md) for more details.
 
-## Operator Config
+## Operator Configuration
 
-* `installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
-* `enableTelemetry`: Enables telemetry for the operator in managed AKS clusters. Requires `buildinfo.ApplicationInsightsID` to be set if enabled.
-* `captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to *pkg/capture/utils/capture_image.go* for details on how the debug capture image version is selected.
-* `captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
-* `enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
-* `enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
+* `operator.installCRDs`: Allows the operator to manage the installation of Retina-related CRDs.
+* `operator.enableRetinaEndpoint`: Allows the operator to monitor and update the cache with Pod metadata.
+* `capture.captureDebug`: Toggles debug mode for captures. If true, the operator uses the image from the test container registry for the capture workload. Refer to [Capture Image file](../../pkg/capture/utils/capture_image.go) for details on how the debug capture image version is selected.
+* `capture.captureJobNumLimit`: Sets the maximum number of jobs that can be created for each Capture.
+* `capture.enableManagedStorageAccount`: Enables the use of a managed storage account for storing artifacts.
diff --git a/docs/02-Installation/04-prometheus.md b/docs/02-Installation/04-prometheus.md
@@ -6,6 +6,7 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b
 
 1. Create a Kubernetes cluster.
 2. Install Retina DaemonSet (see [Quick Installation](./01-Setup.md)).
+3. Clone [Retina Repository](https://github.com/microsoft/retina) or download [Prometheus Values File](../../deploy/legacy/prometheus/values.yaml).
 
 ## Install Prometheus via Helm
 
@@ -19,13 +20,17 @@ Prometheus is an open-source system monitoring and alerting toolkit originally b
 1. Install the Prometheus chart
 
   ```shell
-  helm install prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
+  # The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
+  VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
+  helm install prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
   ```
 
   Or if you already have the chart installed, upgrade how you see fit, providing the new job name as an additional scrape config, ex:
 
   ```shell
-  helm upgrade prometheus -n kube-system -f deploy/legacy/prometheus/values.yaml prometheus-community/kube-prometheus-stack
+  # The value of VALUE_FILE_PATH is relative to the repo root folder. Update this according to the location of your file.
+  VALUE_FILE_PATH=deploy/legacy/prometheus/values.yaml
+  helm upgrade prometheus -n kube-system -f $VALUE_FILE_PATH prometheus-community/kube-prometheus-stack
   ```
 
 > Note: Grafana and kube-state metrics may schedule on Windows nodes, the current chart doesn't have node affinity for those components. Some manual intervention may be required.

diff --git a/docs/03-Metrics/01-metrics-intro.md b/docs/03-Metrics/01-metrics-intro.md
@@ -0,0 +1,12 @@
+# Metrics
+
+The Prometheus metrics available depend on which Retina control plane is deployed.
+
+## Control Planes
+
+There are two control planes used in the Retina project: Hubble and the legacy control plane. Both control planes create metrics and traces which are generated by the eBPF data plane, which has a single implementation. Only one control plane should be deployed at a given time. Helm charts for the deployment are found under `deploy/hubble/manifests/controller/helm/retina` and `deploy/legacy/manifests/controller/helm/retina`.
+
+1. [Hubble metrics](./02-hubble_metrics.md)
+2. [Legacy metrics](./modes/modes.md)
+
+> Note: Hubble offers additional features and metrics that the legacy control plane does not support. The plan is to deprecate the legacy control plane in favor of Hubble. For further documentation on Hubble, see the [Cilium/Hubble repository](https://github.com/cilium/hubble/?tab=readme-ov-file#features) and the official [Hubble metrics documentation](https://docs.cilium.io/en/stable/observability/metrics/#hubble-metrics).
diff --git a/docs/03-Metrics/02-hubble_metrics.md b/docs/03-Metrics/02-hubble_metrics.md
@@ -0,0 +1,53 @@
+# Hubble Metrics
+
+When Retina is deployed with the Hubble control plane, it provides both node-level and pod-level metrics. Metrics are stored in Prometheus format, and can be viewed in Grafana.
+
+## Metrics
+
+* Node-Level Metrics: These metrics provide insights into traffic volume, dropped packets, number of connections, etc. by node.
+* Hubble Metrics (DNS and Pod-Level Metrics): These metrics include source and destination pod information, allowing you to pinpoint network-related issues at a granular level. Metrics cover traffic volume, dropped packets, TCP resets, L4/L7 packet flows, etc. DNS metrics include DNS errors and DNS requests missing responses.
+
+### Node-Level Metrics
+
+The following metrics are aggregated per node. All metrics include labels:
+
+* `cluster`
+* `instance` (Node name)
+
+Retina provides metrics for both Linux and Windows operating systems.
+The table below outlines the different metrics generated.
+
+| Metric Name                                    | Description | Extra Labels | Linux | Windows |
+|------------------------------------------------|-------------|--------------|-------|---------|
+| **networkobservability_forward_count**         | Total forwarded packet count | `direction` | ✅ | ✅ |
+| **networkobservability_forward_bytes**         | Total forwarded byte count | `direction` | ✅ | ✅ |
+| **networkobservability_drop_count**            | Total dropped packet count | `direction`, `reason` | ✅ | ✅ |
+| **networkobservability_drop_bytes**            | Total dropped byte count | `direction`, `reason` | ✅ | ✅ |
+| **networkobservability_tcp_state**             | TCP currently active socket count by TCP state. | `state` | ✅ | ✅ |
+| **networkobservability_tcp_connection_remote** | TCP currently active socket count by remote IP/port. | `address` (IP), `port` | ✅ | ❌ |
+| **networkobservability_tcp_connection_stats**  | TCP connection statistics. (ex: Delayed ACKs, TCPKeepAlive, TCPSackFailures) | `statistic` | ✅ | ✅ |
+| **networkobservability_tcp_flag_counters**     | TCP packets count by flag. | `flag` | ❌ | ✅ |
+| **networkobservability_ip_connection_stats**   | IP connection statistics. | `statistic` | ✅ | ❌ |
+| **networkobservability_udp_connection_stats**  | UDP connection statistics. | `statistic` | ✅ | ❌ |
+| **networkobservability_udp_active_sockets**    | UDP currently active socket count |  | ✅ | ❌ |
+| **networkobservability_interface_stats**       | Interface statistics. | `InterfaceName`, `statistic` | ✅ | ✅ |
+
+### Pod-Level Metrics (Hubble Metrics)
+
+The following metrics are aggregated per pod (node information is preserved). All metrics include labels:
+
+* `cluster`
+* `instance` (Node name)
+* `source`
+* `destination`
+
+For *outgoing traffic*, there will be a `source` label with source pod namespace/name.
+For *incoming traffic*, there will be a `destination` label with destination pod namespace/name.
+
+| Metric Name                      | Description                  | Extra Labels          | Linux | Windows |
+|----------------------------------|------------------------------|-----------------------|-------|---------|
+| **hubble_dns_queries_total**     | Total DNS requests by query  | `source` or `destination`, `query`, `qtypes` (query type) | ✅ | ❌ |
+| **hubble_dns_responses_total**   | Total DNS responses by query/response | `source` or `destination`, `query`, `qtypes` (query type), `rcode` (return code), `ips_returned` (number of IPs) | ✅ | ❌ |
+| **hubble_drop_total**            | Total dropped packet count | `source` or `destination`, `protocol`, `reason` | ✅ | ❌ |
+| **hubble_tcp_flags_total**       | Total TCP packets count by flag. | `source` or `destination`, `flag` | ✅ | ❌ |
+| **hubble_flows_processed_total** | Total network flows processed (L4/L7 traffic) | `source` or `destination`, `protocol`, `verdict`, `type`, `subtype` | ✅ | ❌ |
diff --git a/docs/03-Metrics/modes/modes.md b/docs/03-Metrics/modes/modes.md
@@ -1,7 +1,7 @@
 ---
-sidebar_position: 1
+sidebar_position: 2
 ---
-# Metric Modes
+# Legacy Metric Modes
 
 Retina provides **three modes** with their own metrics and scale capabilities.
 Each mode is **fully customizable** (only create the metrics/labels you need).