diff --git a/.flake8 b/.flake8 index ad95cfe30d..bdd2d30fde 100644 --- a/.flake8 +++ b/.flake8 @@ -2,5 +2,7 @@ ignore = # line-too-long PEP8 errors (handled by the max-line-length field below) E501 + # line break before binary operator + W504 max-line-length = 79 max-complexity = 18 diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index fd38d73e95..b1261486ce 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -34,9 +34,10 @@ jobs: with: python-version: "3.10" - - uses: 'google-github-actions/auth@v2' + - name: Authenticate with GCP + uses: 'google-github-actions/auth@v2' with: - credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}' + credentials_json: '${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}' - uses: 'google-github-actions/setup-gcloud@v2' @@ -48,29 +49,41 @@ jobs: - name: Install python deps run: python3 -m pip install -r ./integration-tests/scripts/baseline/requirements.txt - - name: Calculate Baseline + - name: Compare with the Baseline run: | - jq -s 'flatten' perf.json > integration-tests/perf-all.json + jq -s 'flatten' ./container-logs/*/*/perf.json > integration-tests/perf-all.json + + if [[ "${RUNNER_DEBUG}" == "1" ]]; then + cat integration-tests/perf-all.json + fi + + ./integration-tests/scripts/baseline/main.py --test integration-tests/perf-all.json \ + | sort \ + | awk -F "," -f ./integration-tests/scripts/baseline/format-cpu.awk > benchmark-cpu.md ./integration-tests/scripts/baseline/main.py --test integration-tests/perf-all.json \ | sort \ - | awk -f ./integration-tests/scripts/baseline/format.awk > benchmark.md + | awk -F "," -f ./integration-tests/scripts/baseline/format-mem.awk > benchmark-mem.md delimiter=$(openssl rand -hex 8) { echo "PERF_TABLE<<${delimiter}" - cat benchmark.md + cat benchmark-cpu.md + echo "---" + cat benchmark-mem.md echo "${delimiter}" } >> "$GITHUB_ENV" env: - GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}" + GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}" - name: Update Baseline - if: github.ref == 'master' + if: | + ((github.event_name == 'push' && github.ref == 'master') || + contains(github.event.pull_request.labels.*.name, 'update-baseline')) run: | ./integration-tests/scripts/baseline/main.py --update integration-tests/perf-all.json env: - GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT }}" + GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT: "${{ secrets.GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT }}" - name: Comment on PR if: github.event_name == 'pull_request' diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 911856639d..7a9e839818 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -158,9 +158,10 @@ jobs: with: collector-tag: ${{ needs.init.outputs.collector-tag }} collector-qa-tag: ${{ needs.init.outputs.collector-qa-tag }} - # Temporarily disable benchmarks due to baseline calculation - # inconsistencies. This will be fixed and re-enabled in the future. 
- if: false # ${{ !contains(github.event.pull_request.labels.*.name, 'skip-integration-tests') }} + if: | + always() && + ((github.event_name == 'push' && github.ref_name == 'master') || + contains(github.event.pull_request.labels.*.name, 'run-benchmark')) needs: - init - build-collector-slim diff --git a/ansible/benchmarks.yml b/ansible/benchmarks.yml index 13cb249b17..d0d90daaff 100644 --- a/ansible/benchmarks.yml +++ b/ansible/benchmarks.yml @@ -52,7 +52,11 @@ roles: - run-test-target vars: - collector_test: TestBenchmark + # This is the prefix of the tests to be run. Originally it was just + # TestBenchmark, to include both TestBenchmarkBaseline and + # TestBenchmarkCollector, but since benchmarking is now done directly, + # there is no need for baseline results. + collector_test: TestBenchmarkCollector tags: - run-benchmarks diff --git a/docs/labels.md b/docs/labels.md new file mode 100644 index 0000000000..d5ed252e5e --- /dev/null +++ b/docs/labels.md @@ -0,0 +1,7 @@ +# List of configuration GHA labels + +* `run-benchmark`: trigger the benchmark CI step on a PR (normally it is + performed only on the main branch). + +* `update-baseline`: include benchmark results from the PR in the baseline + (normally the baseline is maintained only from the main branch). diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c0bb98cb4b..0ecf6b59db 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -451,6 +451,30 @@ $ curl collector:8080/profile/heap The resulting profile could be processed with `pprof` to get a human-readable output with debugging symbols. +## Benchmark CI step + +If you are in doubt about the performance implications of your changes, you can +run a benchmark against the PR and compare the results with the known baseline +numbers. To do that, add a "run-benchmark" label to the PR. The performance +comparison will be reported as a comment on the PR with metrics for CPU +utilization and memory consumption. Along with the median values for each +resource, a p-value indicator is reported, which can be interpreted as the +probability that the observed difference is purely due to noise. The underlying +mechanism is a t-test: a probability of more than 80% that the difference is +just noise results in green, anything lower in red. + +The benchmark is conducted with two short simultaneous workloads. You can find +their configuration inside the berserker integration test image. The baseline +numbers are stored on GCS and contain the last 10 runs from the main branch. +The reporting filters out statistically distinct results with a small median +difference, so as not to raise alarms without a significant reason; e.g. +differences in CPU utilization of less than 5% and in memory of less than +10 MiB are ignored. Keep in mind that false positives are still possible due to +the noisiness of the CI platform. + +If for development purposes it is necessary to update the baseline from a PR, +and not only from the main branch, you can add the "update-baseline" label.
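As an illustration of the comparison described above: a minimal Python sketch (made-up sample values, assuming `scipy` is available) of how the independent two-sample t-test and the 0.8 p-value cut-off translate into the green/red indicator.

```python
# Minimal sketch (illustration only, made-up numbers): how the reported
# p-value maps to the green/red indicator described above.
from scipy import stats

baseline_cpu = [12.1, 11.8, 12.4, 12.0, 11.9]  # CPU medians from baseline runs (%)
test_cpu = [12.3, 12.2]                         # CPU medians measured for the PR (%)

_, pvalue = stats.ttest_ind(baseline_cpu, test_cpu)

# p >= 0.8 means the difference is most likely noise -> green,
# anything lower -> red (subject to the minimum-difference filters above).
verdict = "green" if pvalue >= 0.8 else "red"
print(f"p-value={pvalue:.2f} -> {verdict}")
```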
+ ## Introspection endpoints Another method for troubleshooting collector during development cycles is to diff --git a/integration-tests/container/QA_TAG b/integration-tests/container/QA_TAG index 45a1b3f445..781dcb07cd 100644 --- a/integration-tests/container/QA_TAG +++ b/integration-tests/container/QA_TAG @@ -1 +1 @@ -1.1.2 +1.1.3 diff --git a/integration-tests/container/berserker/Dockerfile b/integration-tests/container/berserker/Dockerfile new file mode 100644 index 0000000000..ea60cbc931 --- /dev/null +++ b/integration-tests/container/berserker/Dockerfile @@ -0,0 +1,7 @@ +FROM quay.io/rhacs-eng/qa:berserker-1.0-59-g87ad0d870e + +COPY workloads/ /etc/berserker/ + +ENV PATH="${PATH}:/usr/local/bin" + +ENTRYPOINT ["/usr/local/bin/berserker"] diff --git a/integration-tests/container/berserker/Makefile b/integration-tests/container/berserker/Makefile new file mode 100644 index 0000000000..f0cbe3e6bb --- /dev/null +++ b/integration-tests/container/berserker/Makefile @@ -0,0 +1,23 @@ +BASE_PATH = . +include ../Makefile-constants.mk + +DEFAULT_GOAL = all + +COLLECTOR_QA_BENCHMARK_TAG := berserker + +ifneq ($(COLLECTOR_QA_TAG),) +COLLECTOR_QA_BENCHMARK_TAG=berserker-$(COLLECTOR_QA_TAG) +endif + +.PHONY: all +all: build + +.PHONY: build +build: + @docker buildx build --load --platform ${PLATFORM} \ + -t quay.io/rhacs-eng/collector-performance:$(COLLECTOR_QA_BENCHMARK_TAG) . + +.PHONY: build-and-push +build-and-push: + @docker buildx build --push --platform ${PLATFORM} \ + -t quay.io/rhacs-eng/collector-performance:$(COLLECTOR_QA_BENCHMARK_TAG) . diff --git a/integration-tests/container/berserker/workloads/endpoints/workload.toml b/integration-tests/container/berserker/workloads/endpoints/workload.toml new file mode 100644 index 0000000000..25cec4e924 --- /dev/null +++ b/integration-tests/container/berserker/workloads/endpoints/workload.toml @@ -0,0 +1,8 @@ +restart_interval = 10 +duration = 60 + +[workload] +type = "endpoints" +distribution = "zipf" +n_ports = 8000 +exponent = 1.4 diff --git a/integration-tests/container/berserker/workloads/processes/workload.toml b/integration-tests/container/berserker/workloads/processes/workload.toml new file mode 100644 index 0000000000..964431eb4c --- /dev/null +++ b/integration-tests/container/berserker/workloads/processes/workload.toml @@ -0,0 +1,8 @@ +restart_interval = 10 +duration = 60 + +[workload] +type = "processes" +arrival_rate = 200.0 +departure_rate = 200.0 +random_process = true diff --git a/integration-tests/container/socat/Dockerfile b/integration-tests/container/socat/Dockerfile index 05778d0f00..5cf91ddb4b 100644 --- a/integration-tests/container/socat/Dockerfile +++ b/integration-tests/container/socat/Dockerfile @@ -1,4 +1,6 @@ -FROM alpine:latest +# Pinned to 3.18.4, which will mean that socat 1.7.4.4 is installed +# Updating to newer versions has caused test failures in socat tests +FROM alpine:3.18.4 RUN apk update && apk upgrade && apk add socat diff --git a/integration-tests/images.yml b/integration-tests/images.yml index ac639a4eee..b89115787c 100644 --- a/integration-tests/images.yml +++ b/integration-tests/images.yml @@ -1,5 +1,6 @@ qa: performance-phoronix: quay.io/rhacs-eng/collector-performance:phoronix + performance-berserker: quay.io/rhacs-eng/collector-performance:berserker performance-json-label: quay.io/rhacs-eng/collector-performance:json-label performance-stats: quay.io/rhacs-eng/collector-performance:stats performance-perf: quay.io/rhacs-eng/collector-performance:perf diff --git 
a/integration-tests/scripts/baseline/format-cpu.awk b/integration-tests/scripts/baseline/format-cpu.awk new file mode 100644 index 0000000000..bfec7f5c8e --- /dev/null +++ b/integration-tests/scripts/baseline/format-cpu.awk @@ -0,0 +1,15 @@ +function abs(v) {return v < 0 ? -v : v} + +BEGIN { + print "|VM|Method|Baseline CPU median (%)|Test CPU median (%)|CPU P-value|"; + print "|---|---|---|---|---|" +} +{ + # Values might be distinctly different, but if the difference is small, we + # don't bother. Currently the cut-off line is 5%, which means any + # difference in median values less than 5% will be reported as green. + diff = abs($3 - $4); + + cpu_warn = ($5 < 0.8 && diff > 5) ? ":red_circle:" : ":green_circle:"; + printf "|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,cpu_warn,ORS +} diff --git a/integration-tests/scripts/baseline/format-mem.awk b/integration-tests/scripts/baseline/format-mem.awk new file mode 100644 index 0000000000..19ff6b5b78 --- /dev/null +++ b/integration-tests/scripts/baseline/format-mem.awk @@ -0,0 +1,14 @@ +function abs(v) {return v < 0 ? -v : v} + +BEGIN { + print "|VM|Method|Baseline Memory median (MiB)|Test Memory median (MiB)|Memory P-value|"; + print "|---|---|---|---|---|" +} +{ + # Values might be distinctly different, but if the difference is small, we + # don't bother. Currently the cut-off line is 10MiB, which means any + # difference in median values less than 10MiB will be reported as green. + diff = abs($6 - $7); + mem_warn = ($8 < 0.8 && diff > 10) ? ":red_circle:" : ":green_circle:"; + printf "|%s|%s|%s|%s|%s|%s",$1,$2,$6,$7,mem_warn,ORS +} diff --git a/integration-tests/scripts/baseline/format.awk b/integration-tests/scripts/baseline/format.awk index 27f691b6f2..16a80ec098 100644 --- a/integration-tests/scripts/baseline/format.awk +++ b/integration-tests/scripts/baseline/format.awk @@ -1,8 +1,9 @@ BEGIN { - print "|Kernel|Method|Without Collector Time (secs)| With Collector Time (secs)|Baseline median (secs)|Collector median (secs)|PValue|"; - print "|---|---|---|---|---|---|---|" + print "|VM|Method|Baseline CPU median (cores)|Test CPU median (cores)|CPU P-value|Baseline Memory median (MiB)|Test Memory median (MiB)|Memory P-value|"; + print "|---|---|---|---|---|---|---|---|" } { - warn = ($7==1) ? ":red_circle:" : ":green_circle:"; - printf "|%s|%s|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,$5,$6,warn,ORS + cpu_warn = ($5 < 0.6) ? ":red_circle:" : ":green_circle:"; + mem_warn = ($8 < 0.6) ? 
":red_circle:" : ":green_circle:"; + printf "|%s|%s|%s|%s|%s|%s|%s|%s|%s",$1,$2,$3,$4,cpu_warn,$6,$7,mem_warn,ORS } diff --git a/integration-tests/scripts/baseline/main.py b/integration-tests/scripts/baseline/main.py index a7453ff14f..5e139986b4 100755 --- a/integration-tests/scripts/baseline/main.py +++ b/integration-tests/scripts/baseline/main.py @@ -24,20 +24,20 @@ import json import os import sys -import time -from collections import Counter from itertools import groupby +from functools import partial +from datetime import (datetime, timedelta) from scipy import stats -import numpy as np from google.oauth2 import service_account from google.cloud import storage from google.api_core.exceptions import NotFound -DEFAULT_GCS_BUCKET = "stackrox-ci-results" -DEFAULT_BASELINE_FILE = "circleci/collector/baseline/all.json" +DEFAULT_GCS_BUCKET = "stackrox-ci-artifacts" +DEFAULT_BASELINE_PATH = "" +DEFAULT_BASELINE_FILE = "collector/baseline/all.json" DEFAULT_BASELINE_THRESHOLD = 10 # For the sake of simplicity Baseline data stored on GCS is simply an array of @@ -46,9 +46,60 @@ # in the empty document to be explicit. EMPTY_BASELINE_STRUCTURE = [] +# In which format we expect to find dates in the baseline files +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" + + +def object_parser_hook(data): + result = data + + for f in (datetime_parser, memory_parser): + result = f(result) + + return result + + +def memory_parser(data): + result = {} + multipliers = { + "GiB": 1024, + "MiB": 1, + "KiB": 1 / 1024, + } + + for k, v in data.items(): + if k == 'Mem' and isinstance(v, str) and len(v) > 3: + suffix = v[-3:] + multiplier = multipliers.get(suffix, 0) + + if multiplier == 0: + result[k] = 0 + else: + result[k] = float(v[:-3]) * multiplier + else: + result[k] = v + + return result + + +def datetime_parser(data): + result = {} + + for k, v in data.items(): + if k in ('Timestamp', 'LoadStartTs', 'LoadStopTs'): + try: + result[k] = datetime.strptime(v, DATE_FORMAT) + except Exception as ex: + print(f"Could not parse timestamp {v}: {ex}") + result[k] = v + else: + result[k] = v + + return result + def get_gcs_blob(bucket_name, filename): - credentials = json.loads(os.environ["GOOGLE_CREDENTIALS_COLLECTOR_SVC_ACCT"]) + credentials = json.loads(os.environ["GOOGLE_CREDENTIALS_COLLECTOR_CI_VM_SVC_ACCT"]) storage_credentials = service_account.Credentials.from_service_account_info(credentials) storage_client = storage.Client(credentials=storage_credentials) @@ -56,12 +107,17 @@ def get_gcs_blob(bucket_name, filename): return bucket.blob(filename) -def load_baseline_file(bucket_name, baseline_file): +def load_baseline_from_file(file_path, baseline_file): + with open(f"{file_path}/{baseline_file}", "r") as baseline: + return json.loads(baseline.read(), object_hook=object_parser_hook) + + +def load_baseline_from_bucket(bucket_name, baseline_file): blob = get_gcs_blob(bucket_name, baseline_file) try: contents = blob.download_as_string() - return json.loads(contents) + return json.loads(contents, object_hook=object_parser_hook) except NotFound: print(f"File gs://{bucket_name}/{baseline_file} not found. 
" @@ -71,9 +127,15 @@ def load_baseline_file(bucket_name, baseline_file): return [] -def save_baseline_file(bucket_name, baseline_file, data): +def save_baseline_to_file(file_path, baseline_file, data): + with open(f"{file_path}/{baseline_file}", "w") as baseline: + baseline.write(json.dumps(data, default=str)) + baseline.truncate() + + +def save_baseline_to_bucket(bucket_name, baseline_file, data): blob = get_gcs_blob(bucket_name, baseline_file) - blob.upload_from_string(json.dumps(data)) + blob.upload_from_string(json.dumps(data, default=str)) def is_test(test_name, record): @@ -81,9 +143,7 @@ def is_test(test_name, record): def get_timestamp(record): - field = record.get("Timestamp") - parsed = time.strptime(field, "%Y-%m-%d %H:%M:%S") - return time.mktime(parsed) + return record.get("Timestamp") def add_to_baseline_file(input_file_name, baseline_data, threshold): @@ -91,7 +151,7 @@ def add_to_baseline_file(input_file_name, baseline_data, threshold): verify_data(baseline_data) with open(input_file_name, "r") as measure: - new_measurement = json.load(measure) + new_measurement = json.load(measure, object_hook=object_parser_hook) verify_data(new_measurement) new_measurement_keys = [ @@ -132,31 +192,12 @@ def add_to_baseline_file(input_file_name, baseline_data, threshold): def verify_data(data): """ - There are some assumptions made about the date we operate with for both + There are some assumptions made about the data we operate with for both baseline series and new measurements, they're going to be verified below. """ assert data, "Benchmark data must be not empty" - data_grouped = process(data) - - for group, values in data_grouped: - counter = Counter() - for v in values: - counter.update(v.keys()) - - # Data provides equal number of with/without collector benchmark runs. - # - # NOTE: We rely only on baseline/collector hackbench at the moment. As - # soon as data for more tests will be collected, this section has to be - # extended accordingly. - no_overhead_count = counter.get("baseline_benchmark", 0) + counter.get("TestBenchmarkBaseline", 0) - overhead_count = counter.get("collector_benchmark", 0) + counter.get("TestBenchmarkBaseline", 0) - - if (no_overhead_count != overhead_count): - raise Exception(f"Number of with/without overhead do not match:" - f" {no_overhead_count}, {overhead_count} ") - def intersection(baseline_data, new_data): """ @@ -201,27 +242,41 @@ def normalize_collection_method(method): def process(content): """ - Transform benchmark data into the format CI scripts work with, and group by - VmConfig and CollectionMethod fields. + Transform benchmark data into the format CI scripts work with (a flat list + [record, statistics]), and group by VmConfig and CollectionMethod fields. """ - - processed = [ - { + flat = [] + + def filter_stat(record, stats): + # Filter statistics from Collector only, bounded to the timeframe when + # load was actually running. LoadStopTs is adjusted by 1 sec, since + # this timestamp is taken after the workload container was stopped, so + # that the actual load might have stopped slightly earlier. 
+ return ( + stats.get("Name") == "collector" and + stats.get("Timestamp") > record.get("LoadStartTs") and + stats.get("Timestamp") < record.get("LoadStopTs") - timedelta(seconds=1) + ) + + for record in content: + filter_fn = partial(filter_stat, record) + stats = filter(filter_fn, record.get("ContainerStats")) + + flat += [{ "kernel": record.get("VmConfig").replace('_', '.'), "collection_method": normalize_collection_method(record.get("CollectionMethod")), - "timestamp": record.get("Timestamp"), - record["TestName"]: record.get("Metrics").get("hackbench_avg_time") - } - for record in content - if record.get("Metrics", {}).get("hackbench_avg_time") - ] + "timestamp": s.get("Timestamp"), + "test": record.get("TestName"), + "cpu": s.get("Cpu"), + "mem": s.get("Mem") + } for s in stats] - return group_data(processed, "kernel", "collection_method") + return group_data(flat, "kernel", "collection_method") def group_data(content, *columns): def record_id(record): - return " ".join([record.get(c) for c in columns]) + return ",".join([record.get(c) for c in columns]) ordered = sorted(content, key=record_id) return [ @@ -265,6 +320,26 @@ def collector_overhead(measurements): def compare(input_file_name, baseline_data): + """ + Compare the test with the baseline data. The output goes to stdout in + the following comma-separated format: + vm: What type of VM the test was performed on. + collection method: Which collection method was used. + baseline_cpu_median: For this VM type, the median of the CPU + utilization data from the baseline. + test_cpu_median: For this VM type, the median of the CPU + utilization from the test. + cpu_pvalue: P-value of a t-test for baseline/test CPU data; + a high value indicates that the difference is + due to noise. + baseline_mem_median: For this VM type, the median of the memory + utilization data from the baseline. + test_mem_median: For this VM type, the median of the memory + utilization from the test. + mem_pvalue: P-value of a t-test for baseline/test memory data; + a high value indicates that the difference is + due to noise. + """ if not baseline_data: print("Baseline file is empty, nothing to compare.", file=sys.stderr) return @@ -272,7 +347,7 @@ def compare(input_file_name, baseline_data): verify_data(baseline_data) with open(input_file_name, "r") as measurement: - test_data = json.load(measurement) + test_data = json.load(measurement, object_hook=object_parser_hook) verify_data(test_data) baseline_grouped, test_grouped = intersection(baseline_data, test_data) @@ -283,28 +358,31 @@ def compare(input_file_name, baseline_data): assert bgroup == tgroup, "Kernel/Method must not be differrent" - baseline_overhead = collector_overhead(bvalues) - test_overhead = collector_overhead(tvalues)[0] - # The original implementation used single sample ttest, but it's - # too sensitive for such variance.
- # result, pvalue = stats.ttest_1samp(baseline_overhead, - # test_overhead) + baseline_cpu = [v.get("cpu") for v in bvalues] + baseline_mem = [v.get("mem") for v in bvalues] - # Test the new data to be a 1.5 outlier - iqr = stats.iqr(baseline_overhead) - q1, q3 = np.percentile(baseline_overhead, [25, 75]) - lower = q1 - 1.5 * iqr - upper = q3 + 1.5 * iqr - outlier = 0 if lower < test_overhead < upper else 1 + test_cpu = [v.get("cpu") for v in tvalues] + test_mem = [v.get("mem") for v in tvalues] - benchmark_baseline, benchmark_collector = split_benchmark(bvalues) - test_baseline, test_collector = split_benchmark(tvalues) + # Originally we were doing a one-sample test, checking + # whether the new data is a 1.5 IQR outlier + # + # iqr = stats.iqr(baseline_cpu) + # q1, q3 = np.percentile(baseline_cpu, [25, 75]) + # lower = q1 - 1.5 * iqr + # upper = q3 + 1.5 * iqr + # outlier = 0 if lower < test_cpu < upper else 1 + cpu_stats, cpu_pvalue = stats.ttest_ind(baseline_cpu, test_cpu) + mem_stats, mem_pvalue = stats.ttest_ind(baseline_mem, test_mem) - baseline_median = round(stats.tmean(benchmark_baseline), 2) - collector_median = round(stats.tmean(benchmark_collector), 2) + baseline_cpu_median = round(stats.tmean(baseline_cpu), 2) + baseline_mem_median = round(stats.tmean(baseline_mem), 2) - print(f"{bgroup} {test_baseline[0]} {test_collector[0]} " - f"{baseline_median} {collector_median} {outlier}") + test_cpu_median = round(stats.tmean(test_cpu), 2) + test_mem_median = round(stats.tmean(test_mem), 2) + + print(f"{bgroup},{baseline_cpu_median},{test_cpu_median},{cpu_pvalue}," + f"{baseline_mem_median},{test_mem_median},{mem_pvalue}") if __name__ == "__main__": @@ -329,16 +407,34 @@ def compare(input_file_name, baseline_data): default=DEFAULT_BASELINE_THRESHOLD, help=('Maximum number of benchmark runs in baseline')) + parser.add_argument('--baseline-path', nargs='?', default=DEFAULT_BASELINE_PATH, + help=('If specified, overrides --gcs-bucket and instructs the' + ' script to manage baselines at the specified file path')) + + parser.add_argument('--reset-baseline', default=False, action='store_true', + help=('Instructs the script to reset the baseline file')) + args = parser.parse_args() + if args.baseline_path: + baseline_data = load_baseline_from_file(args.baseline_path, args.baseline_file) + + save_baseline_file = partial(save_baseline_to_file, args.baseline_path, args.baseline_file) + else: + baseline_data = load_baseline_from_bucket(args.gcs_bucket, args.baseline_file) + + save_baseline_file = partial(save_baseline_to_bucket, args.gcs_bucket, args.baseline_file) + if args.test: - baseline_data = load_baseline_file(args.gcs_bucket, args.baseline_file) compare(args.test, baseline_data) sys.exit(0) if args.update: - baseline_data = load_baseline_file(args.gcs_bucket, args.baseline_file) result = add_to_baseline_file(args.update, baseline_data, args.baseline_threshold) - save_baseline_file(args.gcs_bucket, args.baseline_file, result) + save_baseline_file(result) + sys.exit(0) + + if args.reset_baseline: + save_baseline_file(EMPTY_BASELINE_STRUCTURE) sys.exit(0) diff --git a/integration-tests/suites/base.go b/integration-tests/suites/base.go index 7bef6d06c1..e02d62311f 100644 --- a/integration-tests/suites/base.go +++ b/integration-tests/suites/base.go @@ -2,10 +2,10 @@ package suites import ( "encoding/json" + "errors" "fmt" "os" "path/filepath" - "regexp" "strconv" "strings" "time" @@ -42,6 +42,8 @@ type IntegrationTestSuiteBase struct { sensor *mock_sensor.MockSensor metrics map[string]float64 stats []ContainerStat + start
time.Time + stop time.Time } type ContainerStat struct { @@ -60,6 +62,8 @@ type PerformanceResult struct { CollectionMethod string Metrics map[string]float64 ContainerStats []ContainerStat + LoadStartTs string + LoadStopTs string } // StartCollector will start the collector container and optionally @@ -203,11 +207,42 @@ func (s *IntegrationTestSuiteBase) GetContainerStats() []ContainerStat { return s.stats } +// Convert memory string from docker stats into numeric value in MiB +func Mem2Numeric(value string) (float64, error) { + size := len(value) + + // Byte units are too short for following logic + if size < 3 { + return 0, nil + } + + numericPart := value[:size-3] + unitsPart := value[size-3 : size] + + if unitsPart == "MiB" { + return strconv.ParseFloat(numericPart, 32) + } else if unitsPart == "GiB" { + value, err := strconv.ParseFloat(numericPart, 32) + return value * 1024, err + } else if unitsPart == "KiB" { + value, err := strconv.ParseFloat(numericPart, 32) + return value / 1024, err + } else { + return 0, errors.New(fmt.Sprintf("Invalid units, %s", value)) + } +} + func (s *IntegrationTestSuiteBase) PrintContainerStats() { cpuStats := map[string][]float64{} + memStats := map[string][]float64{} for _, stat := range s.GetContainerStats() { cpuStats[stat.Name] = append(cpuStats[stat.Name], stat.Cpu) + + memValue, err := Mem2Numeric(stat.Mem) + s.Require().NoError(err) + + memStats[stat.Name] = append(memStats[stat.Name], memValue) } for name, cpu := range cpuStats { @@ -217,6 +252,14 @@ func (s *IntegrationTestSuiteBase) PrintContainerStats() { fmt.Printf("CPU: Container %s, Mean %v, StdDev %v\n", name, stat.Mean(cpu, nil), stat.StdDev(cpu, nil)) } + + for name, mem := range memStats { + s.AddMetric(fmt.Sprintf("%s_mem_mean", name), stat.Mean(mem, nil)) + s.AddMetric(fmt.Sprintf("%s_mem_stddev", name), stat.StdDev(mem, nil)) + + fmt.Printf("Mem: Container %s, Mean %v MiB, StdDev %v MiB\n", + name, stat.Mean(mem, nil), stat.StdDev(mem, nil)) + } } func (s *IntegrationTestSuiteBase) WritePerfResults() { @@ -230,6 +273,8 @@ func (s *IntegrationTestSuiteBase) WritePerfResults() { CollectionMethod: config.CollectionMethod(), Metrics: s.metrics, ContainerStats: s.GetContainerStats(), + LoadStartTs: s.start.Format("2006-01-02 15:04:05"), + LoadStopTs: s.stop.Format("2006-01-02 15:04:05"), } perfJson, _ := json.Marshal(perf) @@ -455,39 +500,6 @@ func (s *IntegrationTestSuiteBase) getPort(containerName string) (string, error) return "", fmt.Errorf("no port mapping found: %v %v", rawString, portMap) } -func (s *IntegrationTestSuiteBase) RunCollectorBenchmark() { - benchmarkName := "benchmark" - benchmarkImage := config.Images().QaImageByKey("performance-phoronix") - - err := s.Executor().PullImage(benchmarkImage) - s.Require().NoError(err) - - benchmarkArgs := []string{ - "--env", "FORCE_TIMES_TO_RUN=1", - benchmarkImage, - "batch-benchmark", "collector", - } - - containerID, err := s.launchContainer(benchmarkName, benchmarkArgs...) - s.Require().NoError(err) - - _, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds, 0) - s.Require().NoError(err) - - benchmarkLogs, err := s.containerLogs("benchmark") - re := regexp.MustCompile(`Average: ([0-9.]+) Seconds`) - matches := re.FindSubmatch([]byte(benchmarkLogs)) - if matches != nil { - fmt.Printf("Benchmark Time: %s\n", matches[1]) - f, err := strconv.ParseFloat(string(matches[1]), 64) - s.Require().NoError(err) - s.AddMetric("hackbench_avg_time", f) - } else { - fmt.Printf("Benchmark Time: Not found! 
Logs: %s\n", benchmarkLogs) - assert.FailNow(s.T(), "Benchmark Time not found") - } -} - func (s *IntegrationTestSuiteBase) StartContainerStats() { image := config.Images().QaImageByKey("performance-stats") args := []string{"-v", executor.RuntimeSocket + ":/var/run/docker.sock", image} diff --git a/integration-tests/suites/benchmark.go b/integration-tests/suites/benchmark.go index eefd3b0eb5..c414ce095e 100644 --- a/integration-tests/suites/benchmark.go +++ b/integration-tests/suites/benchmark.go @@ -19,11 +19,13 @@ type BenchmarkBaselineTestSuite struct { type BenchmarkCollectorTestSuite struct { BenchmarkTestSuiteBase + workloads []string } type BenchmarkTestSuiteBase struct { IntegrationTestSuiteBase perfContainers []string + loadContainers []string } func (b *BenchmarkTestSuiteBase) StartPerfTools() { @@ -111,6 +113,18 @@ func (b *BenchmarkTestSuiteBase) startContainer(name string, image string, args b.perfContainers = append(b.perfContainers, containerID) } +func (b *BenchmarkTestSuiteBase) FetchWorkloadLogs() { + fmt.Println("Berserker logs:") + for _, container := range b.loadContainers { + log, err := b.containerLogs(container) + require.NoError(b.T(), err) + + fmt.Println(log) + } + + b.loadContainers = nil +} + func (b *BenchmarkTestSuiteBase) StopPerfTools() { b.stopContainers(b.perfContainers...) @@ -126,7 +140,8 @@ func (b *BenchmarkTestSuiteBase) StopPerfTools() { } func (s *BenchmarkCollectorTestSuite) SetupSuite() { - s.RegisterCleanup("perf", "bcc", "bpftrace", "init") + s.RegisterCleanup("perf", "bcc", "bpftrace", "init", + "benchmark-processes", "benchmark-endpoints") s.StartContainerStats() s.StartPerfTools() @@ -134,12 +149,59 @@ func (s *BenchmarkCollectorTestSuite) SetupSuite() { s.StartCollector(false, nil) } +func (s *BenchmarkTestSuiteBase) SpinBerserker(workload string) (string, error) { + benchmarkName := fmt.Sprintf("benchmark-%s", workload) + benchmarkImage := config.Images().QaImageByKey("performance-berserker") + + err := s.Executor().PullImage(benchmarkImage) + if err != nil { + return "", err + } + + configFile := fmt.Sprintf("/etc/berserker/%s/workload.toml", workload) + benchmarkArgs := []string{benchmarkImage, configFile} + + containerID, err := s.launchContainer(benchmarkName, benchmarkArgs...) 
+ if err != nil { + return "", err + } + + s.loadContainers = append(s.loadContainers, containerID) + return containerID, nil +} + +func (s *BenchmarkTestSuiteBase) RunCollectorBenchmark() { + procContainerID, err := s.SpinBerserker("processes") + s.Require().NoError(err) + + endpointsContainerID, err := s.SpinBerserker("endpoints") + s.Require().NoError(err) + + s.start = time.Now().UTC() + + // The assumption is that the benchmark is short, and to get better + // resolution into when relevant metrics start and stop, tick more + // frequently + waitTick := 1 * time.Second + + // Container name here is used only for reporting + _, err = s.waitForContainerToExit("berserker", procContainerID, waitTick, 0) + s.Require().NoError(err) + + _, err = s.waitForContainerToExit("berserker", endpointsContainerID, waitTick, 0) + + s.Require().NoError(err) + + s.stop = time.Now().UTC() +} + func (s *BenchmarkCollectorTestSuite) TestBenchmarkCollector() { s.RunCollectorBenchmark() } func (s *BenchmarkCollectorTestSuite) TearDownSuite() { s.StopPerfTools() + s.FetchWorkloadLogs() s.StopCollector() @@ -148,6 +210,7 @@ func (s *BenchmarkCollectorTestSuite) TearDownSuite() { } func (s *BenchmarkBaselineTestSuite) SetupSuite() { + s.RegisterCleanup("benchmark-processes", "benchmark-endpoints") s.StartContainerStats() s.StartPerfTools() } @@ -158,6 +221,7 @@ func (s *BenchmarkBaselineTestSuite) TestBenchmarkBaseline() { func (s *BenchmarkBaselineTestSuite) TearDownSuite() { s.StopPerfTools() + s.FetchWorkloadLogs() s.cleanupContainers("benchmark") s.WritePerfResults() }
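As a companion note to the memory handling above: both the Go helper `Mem2Numeric` in base.go and the Python `memory_parser` in main.py normalize docker-stats memory strings to MiB. A minimal standalone Python sketch of that conversion, with hypothetical sample values, mirroring the suffix/multiplier logic of those helpers.

```python
# Normalize docker-stats style memory strings ("512KiB", "256MiB", "1.5GiB")
# to a float number of MiB, mirroring memory_parser()/Mem2Numeric above.
MULTIPLIERS = {"GiB": 1024, "MiB": 1, "KiB": 1 / 1024}

def mem_to_mib(value: str) -> float:
    if len(value) < 4:          # too short to carry a three-letter unit suffix
        return 0.0
    number, suffix = value[:-3], value[-3:]
    multiplier = MULTIPLIERS.get(suffix)
    if multiplier is None:      # unknown unit -> treated as unparseable
        return 0.0
    return float(number) * multiplier

# e.g. mem_to_mib("1.5GiB") == 1536.0, mem_to_mib("512KiB") == 0.5
```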