stackrox · erthalion · May 22, 2024 · Apr 23, 2024 · May 7, 2024 · May 7, 2024
@@ -2,5 +2,7 @@
 ignore = 
     # line-too-long PEP8 errors (handled by the max-line-length field below)
     E501
+    # line break before binary operator
+    W504
 max-line-length = 79
 max-complexity = 18
@@ -34,9 +34,10 @@ jobs:
         with:
           python-version: "3.10"
 
-      - uses: 'google-github-actions/auth@v2'
+      - name: Authenticate with GCP
+        uses: 'google-github-actions/auth@v2'
         with:
-          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}'
+          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}'
 
       - uses: 'google-github-actions/setup-gcloud@v2'
 
@@ -48,29 +49,41 @@ jobs:
       - name: Install python deps
         run: python3 -m pip install -r ./integration-tests/scripts/baseline/requirements.txt
 
-      - name: Calculate Baseline
+      - name: Compare with the Baseline
         run: |
-          jq -s 'flatten' perf.json > integration-tests/perf-all.json
+          jq -s 'flatten' ./container-logs/*/*/perf.json > integration-tests/perf-all.json
+
+          if [[ "${RUNNER_DEBUG}" == "1" ]]; then
+              cat integration-tests/perf-all.json
+          fi
+
+          ./integration-tests/scripts/baseline/main.py --test integration-tests/perf-all.json \
+            | sort \
+            | awk -F "," -f ./integration-tests/scripts/baseline/format-cpu.awk > benchmark-cpu.md
 
           ./integration-tests/scripts/baseline/main.py --test integration-tests/perf-all.json \
             | sort \
-            | awk -f ./integration-tests/scripts/baseline/format.awk > benchmark.md
+            | awk -F "," -f ./integration-tests/scripts/baseline/format-mem.awk > benchmark-mem.md
 
           delimiter=$(openssl rand -hex 8)
           {
             echo "PERF_TABLE<<${delimiter}"
-            cat benchmark.md
+            cat benchmark-cpu.md
+            echo "---"
+            cat benchmark-mem.md
             echo "${delimiter}"
           } >> "$GITHUB_ENV"
         env:
-          GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}"
+          GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}"
 
       - name: Update Baseline
-        if: github.ref == 'master'
+        if: |
+          ((github.event_name == 'push' && github.ref == 'master') ||
+          contains(github.event.pull_request.labels.*.name, 'update-baseline'))
         run: |
           ./integration-tests/scripts/baseline/main.py --update integration-tests/perf-all.json
         env:
-          GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}"
+          GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}"
 
       - name: Comment on PR
         if: github.event_name == 'pull_request'

@@ -158,9 +158,10 @@ jobs:
     with:
       collector-tag: ${{ needs.init.outputs.collector-tag }}
       collector-qa-tag: ${{ needs.init.outputs.collector-qa-tag }}
-    # Temporarily disable benchmarks due to baseline calculation
-    # inconsistencies. This will be fixed and re-enabled in the future.
-    if: false # ${{ !contains(github.event.pull_request.labels.*.name, 'skip-integration-tests') }}
+    if: |
+      always() &&
+      ((github.event_name == 'push' && github.ref_name == 'master') ||
+      contains(github.event.pull_request.labels.*.name, 'run-benchmark'))
     needs:
     - init
     - build-collector-slim

@@ -52,7 +52,11 @@
   roles:
     - run-test-target
   vars:
-    collector_test: TestBenchmark
+    # This is going to be the prefix for tests to be run. Originally it was
+    # just TestBenchmark, to include both TestBenchmarkBaseline and
+    # TestBenchmarkCollector, but since now we do benchmarking directly there
+    # is no need in baseline results.
+    collector_test: TestBenchmarkCollector
   tags:
     - run-benchmarks
 

@@ -0,0 +1,7 @@
+# List of configuration GHA labels
+
+* `run-benchmark`: trigger a benchmark CI step on a PR (normally it's being
+  performed only on the main branch).
+
+* `update-baseline`: include benchmark results from the PR into the baseline
+  (normally the baseline is maintained purely from the main branch).
@@ -451,6 +451,30 @@ $ curl collector:8080/profile/heap
 The resulting profile could be processed with `pprof` to get a human-readable
 output with debugging symbols.
 
+## Benchmark CI step
+
+Whenever in doubt about performance implications of your changes, there is an
+option to run a benchmark against the PR and compare the results with the known
+baseline numbers. To do that, add a "run-benchmark" label to the PR. The
+performance comparison will be reported via commentary to the PR with metrics
+for CPU utilization and memory consumption. Along with the median values for
+each resource a p-value sign will be reported, which could be interpreted as
+how high are the chances that the observed difference is purely due to the
+noise. The underlying mechanism is t-test, more than 80% probability it's just
+a noise will result in green, less than that -- in red.
+
+The benchmark will be conducted with two short simultaneous workloads. You can
+find the configuration for them inside the berserker integration test image.
+The baseline numbers are stored on GCS, and contains last 10 runs from the main
+branch. The reporting filters out distinctly different results with small median
+difference, to not bother without significant reasons, e.g. differences in CPU
+utilization less than 1% and in memory less than 10 MiB are ignored. Keep in
+mind that false-positives are definitely possible due to the noisiness of the
+CI platform.
+
+If for development purposes it's necessary to update the baseline from a PR,
+not only just for the main branch, you can add "update-baseline" label.
+
 ## Introspection endpoints
 
 Another method for troubleshooting collector during development cycles is to

@@ -1 +1 @@
-1.1.2
+1.1.3
@@ -0,0 +1,7 @@
+FROM  quay.io/rhacs-eng/qa:berserker-1.0-59-g87ad0d870e
+
+COPY workloads/ /etc/berserker/
+
+ENV PATH="${PATH}:/usr/local/bin"
+
+ENTRYPOINT ["/usr/local/bin/berserker"]
@@ -0,0 +1,23 @@
+BASE_PATH = .
+include ../Makefile-constants.mk
+
+DEFAULT_GOAL = all
+
+COLLECTOR_QA_BENCHMARK_TAG := berserker
+
+ifneq ($(COLLECTOR_QA_TAG),)
+COLLECTOR_QA_BENCHMARK_TAG=berserker-$(COLLECTOR_QA_TAG)
+endif
+
+.PHONY: all
+all: build
+
+.PHONY: build
+build:
+	@docker buildx build --load --platform ${PLATFORM} \
+                -t quay.io/rhacs-eng/collector-performance:$(COLLECTOR_QA_BENCHMARK_TAG) .
+
+.PHONY: build-and-push
+build-and-push:
+	@docker buildx build --push --platform ${PLATFORM} \
+		-t quay.io/rhacs-eng/collector-performance:$(COLLECTOR_QA_BENCHMARK_TAG) .
@@ -0,0 +1,8 @@
+restart_interval = 10
+duration = 60
+
+[workload]
+type = "endpoints"
+distribution = "zipf"
+n_ports = 8000
+exponent = 1.4
@@ -0,0 +1,8 @@
+restart_interval = 10
+duration = 60
+
+[workload]
+type = "processes"
+arrival_rate = 200.0
+departure_rate = 200.0
+random_process = true
@@ -1,4 +1,6 @@
-FROM alpine:latest
+# Pinned to 3.18.4, which will mean that socat 1.7.4.4 is installed
+# Updating to newer versions has caused test failures in socat tests
+FROM alpine:3.18.4
 
 RUN apk update && apk upgrade && apk add socat
 

@@ -1,5 +1,6 @@
 qa:
   performance-phoronix: quay.io/rhacs-eng/collector-performance:phoronix
+  performance-berserker: quay.io/rhacs-eng/collector-performance:berserker
   performance-json-label: quay.io/rhacs-eng/collector-performance:json-label
   performance-stats: quay.io/rhacs-eng/collector-performance:stats
   performance-perf: quay.io/rhacs-eng/collector-performance:perf

@@ -0,0 +1,15 @@
+function abs(v) {return v < 0 ? -v : v}
+
+BEGIN {
+    print "|VM|Method|Baseline CPU median (%)|Test CPU median (%)|CPU P-value|";
+    print "|---|---|---|---|---|"
+}
+{
+    # Values might be distinctly different, but if the difference is small, we
+    # don't bother. Currently the cut-off line is 5%, which means any
+    # difference in median values less than 5% will be reported as green.
+    diff = abs($3 - $4);
+
+    cpu_warn = ($5 < 0.8 && diff > 5) ? ":red_circle:" : ":green_circle:";
+    printf "|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,cpu_warn,ORS
+}
@@ -0,0 +1,14 @@
+function abs(v) {return v < 0 ? -v : v}
+
+BEGIN {
+    print "|VM|Method|Baseline Memory median (MiB)|Test Memory median (MiB)|Memory P-value|";
+    print "|---|---|---|---|---|"
+}
+{
+    # Values might be distinctly different, but if the difference is small, we
+    # don't bother. Currently the cut-off line is 10MiB, which means any
+    # difference in median values less than 10MiB will be reported as green.
+    diff = abs($6 - $7);
+    mem_warn = ($8 < 0.8 && diff > 10) ? ":red_circle:" : ":green_circle:";
+    printf "|%s|%s|%s|%s|%s|%s",$1,$2,$6,$7,mem_warn,ORS
+}
@@ -1,8 +1,9 @@
 BEGIN {
-    print "|Kernel|Method|Without Collector Time (secs)| With Collector Time (secs)|Baseline median (secs)|Collector median (secs)|PValue|";
-    print "|---|---|---|---|---|---|---|"
+    print "|VM|Method|Baseline CPU median (cores)|Test CPU median (cores)|CPU P-value|Baseline Memory median (MiB)|Test Memory median (MiB)|Memory P-value|";
+    print "|---|---|---|---|---|---|---|---|"
 }
 {
-    warn = ($7==1) ? ":red_circle:" : ":green_circle:";
-    printf "|%s|%s|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,$5,$6,warn,ORS
+    cpu_warn = ($5 < 0.6) ? ":red_circle:" : ":green_circle:";
+    mem_warn = ($8 < 0.6) ? ":red_circle:" : ":green_circle:";
+    printf "|%s|%s|%s|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,cpu_warn,$6,$7,mem_warn,ORS
 }