Commit f4225c9

wip

aequitas committed Oct 15, 2024
1 parent fdaa0eb commit f4225c9

Showing 7 changed files with 290 additions and 0 deletions.
1 change: 1 addition & 0 deletions docker/batch-test.env
@@ -115,5 +115,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domain (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
281 changes: 281 additions & 0 deletions docker/cron/periodic/15min/tests-batch.py
@@ -0,0 +1,281 @@
#!/usr/bin/env python3

# run tests on example domains and write metrics to prometheus textfile

# for iterative development
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests-batch.py:/tests-batch.py \
# ghcr.io/internetstandards/cron:latest /tests-batch.py --debug

import sys
import os
import time
from prometheus_client import REGISTRY, Gauge, generate_latest
import prometheus_client
import logging
import requests
import datetime

log = logging.getLogger(__name__)

DEBUG = "--debug" in sys.argv

# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests-batch.prom"


DEFAULT_TEST_TIMEOUT = 200
TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL", DEFAULT_TEST_TIMEOUT))
REQUEST_TIMEOUT = 30

TESTS = ["web", "mail"]

IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
# talk directly to the internal app container as the webserver might
# have access restrictions in place
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

TEST_DOMAINS = {
    # domains to use in website tests
    "web": [
        "internet.nl",
        "example.nl",
        "example.com",
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        "forumstandaardisatie.nl",
        "minez.nl",
    ],
    # domains to use in mail tests
    "mail": [
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        # these are currently really slow and will probably improve when
        # we switch to sslyze, for now disable these in monitoring
        # "internet.nl",
        # "forumstandaardisatie.nl",
        # "minez.nl",
    ],
}

# METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test"])
# METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test"])
# METRIC_PROBE_RUNTIME = Gauge(
# "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test"]
# )
# METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test"])
# METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test"])

METRIC_BATCH_RUN = Gauge("tests_batch_run_total", "Tests that have been run.", ["test"])
METRIC_BATCH_FAILURE = Gauge("tests_batch_failure_total", "Test runs that failed.", ["test"])
METRIC_BATCH_SUCCESS = Gauge("tests_batch_success_total", "Test runs that succeeded.", ["test"])
METRIC_BATCH_TIMEOUT = Gauge("tests_batch_timeout_total", "Tests that ran into a timeout.", ["test"])
METRIC_BATCH_RUNTIME = Gauge("tests_batch_runtime_seconds", "Amount of time the test ran before completion.", ["test"])
METRIC_BATCH_STAGE_RUNTIME = Gauge("tests_batch_stage_runtime_seconds", "Amount of time a test stage ran before completion.", ["test", "stage"])

METRIC_BATCH_DOMAIN_OK = Gauge("tests_batch_domain_ok", "Whether the batch test for this domain completed ok.", ["test", "domain"])
METRIC_BATCH_DOMAIN_SCORE = Gauge("tests_batch_domain_score", "Score percentage for this domain in the batch test.", ["test", "domain"])


def run_tests_on_domain(test, domain):
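    # NOTE: single-domain test runner, not invoked by main() below; it references
    # per-probe metrics (METRIC_PROBE_*, METRIC_TEST_*) that are not defined in this
    # file and uses label sets that do not match the batch metrics defined above.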
    test_start = int(time.time())

    # initiate the test
    r = requests.get(
        f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
        timeout=REQUEST_TIMEOUT,
        allow_redirects=False,
        headers=HEADERS,
    )
    r.raise_for_status()
    log.debug(r.text)

    # abort early if cached result
    probes = r.json()
    if not [p for p in probes if not p["done"]]:
        METRIC_TEST_CACHE.labels(test, domain).set(1)
        return

    # poll probes until done
    finished_probes = set()
    while int(time.time()) < test_start + TEST_TIMEOUT:
        # get probe status
        r = requests.get(
            f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
            timeout=REQUEST_TIMEOUT,
            allow_redirects=False,
            headers=HEADERS,
        )
        r.raise_for_status()
        log.debug(r.text)

        # record probe statuses for probes that are finished
        probes = r.json()
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
                METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
                finished_probes.add(probe["name"])

        # stop when all probes are finished
        if not [p for p in probes if not p["done"]]:
            METRIC_BATCH_SUCCESS.labels(test, domain).set(1)
            break

        time.sleep(1)
    else:
        # test timed out because one or more of the probes was not done within time
        METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            # record not finished probes as failed
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])

    METRIC_BATCH_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

    # get additional metrics like score
    scores = list()
    for probe_name in finished_probes:
        try:
            r = requests.get(
                f"{URL_BASE}/{test}/{probe_name}/{domain}/?{time.time()}",
                timeout=REQUEST_TIMEOUT,
                allow_redirects=False,
                headers=HEADERS,
            )
            r.raise_for_status()
            if r.status_code == 200:
                probe_result = r.json()
                # only measure probe scores that count towards total score
                if probe_result["maxscore"]:
                    METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
                    scores.append(probe_result["totalscore"])
                METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
        except Exception:
            log.exception("failed to get probe score")

    if scores:
        METRIC_TEST_SCORE.labels(test, domain).set(max(min(int(sum(scores) / len(scores)), 100), 0))
    else:
        METRIC_TEST_SCORE.labels(test, domain).set(0)

def wait_for_request_status(url: str, expected_status: str, timeout: int = 10, interval: int = 1, auth=None):
    """Poll url and parse JSON for request.status, return if value matches expected status or
    fail when timeout expires."""
    max_tries = int(timeout / interval)

    tries = 0
    while tries < max_tries:
        status_response = requests.get(url, auth=auth, headers=HEADERS)
        status_response.raise_for_status()

        log.debug(status_response.text)
        status_data = status_response.json()
        if status_data["request"]["status"] == expected_status:
            break
        time.sleep(interval)
        tries += 1
    else:
        raise TimeoutError(f"request status never reached '{expected_status}' state")


def run_test_batch(test: str, domains: list[str]):
    request_data = {"type": test, "domains": domains, "name": "periodic"}

    auth = ("periodic_tests", "periodic_tests")
    api_url = URL_BASE + "/api/batch/v2/"

    test_start = int(time.time())

    # start batch request
    register_response = requests.post(api_url + "requests", json=request_data, auth=auth, headers=HEADERS)
    register_response.raise_for_status()
    log.debug(register_response.text)

    # get test_id from register data
    register_data = register_response.json()
    test_id = register_data["request"]["request_id"]

    # wait for the batch request to be registered
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "registering").time():
        wait_for_request_status(api_url + "requests/" + test_id, "registering", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to start
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "running").time():
        wait_for_request_status(api_url + "requests/" + test_id, "running", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to complete and report to be generated
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "generating").time():
        wait_for_request_status(api_url + "requests/" + test_id, "generating", timeout=TEST_TIMEOUT, auth=auth)

    # wait for report generation and batch to be done
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "done").time():
        wait_for_request_status(api_url + "requests/" + test_id, "done", timeout=TEST_TIMEOUT, auth=auth)

    # get batch results
    results_response = requests.get(api_url + "requests/" + test_id + "/results", auth=auth, headers=HEADERS)
    results_response.raise_for_status()
    log.debug(results_response.text)

    results_response_data = results_response.json()

    METRIC_BATCH_SUCCESS.labels(test).set(1 if results_response_data["request"]["status"] == "done" else 0)
    METRIC_BATCH_FAILURE.labels(test).set(0 if results_response_data["request"]["status"] == "done" else 1)
    METRIC_BATCH_RUNTIME.labels(test).set(int(time.time() - test_start))

    for domain, results in results_response_data["domains"].items():
        METRIC_BATCH_DOMAIN_OK.labels(test, domain).set(1 if results["status"] == "ok" else 0)
        METRIC_BATCH_DOMAIN_SCORE.labels(test, domain).set(results["scoring"]["percentage"])



def run_batch_tests():
    for test in TESTS:
        domains = TEST_DOMAINS[test]
        log.info(f"testing: {test} {domains}")

        METRIC_BATCH_RUN.labels(test).set(1)
        METRIC_BATCH_FAILURE.labels(test).set(0)
        METRIC_BATCH_TIMEOUT.labels(test).set(0)
        METRIC_BATCH_SUCCESS.labels(test).set(0)
        try:
            run_test_batch(test, domains)

        except Exception:
            log.exception("Error during test")
            METRIC_BATCH_FAILURE.labels(test).set(1)


def main():
    logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)

    # disable internal metrics
    REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)

    # run test probes against domains and collect metrics
    run_batch_tests()

    # write metrics to stdout or file in prometheus textfile format
    if DEBUG:
        print(generate_latest(REGISTRY).decode())
    else:
        with open(OUTPUT_TEXTFILE, "w") as f:
            f.write(generate_latest(REGISTRY).decode())


if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS_BATCH", "False") == "True":
    main()
3 changes: 3 additions & 0 deletions docker/defaults.env
@@ -240,6 +240,9 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=False
# enable running tests every 15 minutes for metrics collection
CRON_15MIN_RUN_TESTS=True

# enable running batch tests every 15 minutes for metrics collection; enable in local.env for batch deployments
CRON_15MIN_RUN_TESTS_BATCH=False

# enables internet.nl specific content (eg: contact information, faq, security.txt), only enable for internet.nl
# instances. For customization see: documentation/Customize.md
INTERNETNL_BRANDING=False
1 change: 1 addition & 0 deletions docker/develop.env
@@ -64,6 +64,7 @@ LOGGING_DRIVER=json-file
CRON_DAILY_POSTGRESQL_BACKUP=False
CRON_WEEKLY_POSTGRESQL_BACKUP=False
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=False

2 changes: 2 additions & 0 deletions docker/docker-compose.yml
@@ -674,6 +674,7 @@ services:
      - DB_PASSWORD=password
      - CRON_DAILY_POSTGRESQL_BACKUP
      - CRON_WEEKLY_POSTGRESQL_BACKUP
      - CRON_15MIN_RUN_TESTS_BATCH
      - IPV4_IP_APP_INTERNAL
      - INTERNETNL_DOMAINNAME
      - INTERNETNL_CACHE_TTL
@@ -701,6 +702,7 @@ services:
      - postgres-backups:/var/lib/postgresql/backups
      - nginx-logs-exporter:/var/log/nginx/prometheus-nginxlog-exporter/
      - prometheus-textfile-directory:/prometheus-textfile-directory
      - ./cron/periodic:/etc/periodic

    healthcheck:
      test: ["CMD", "pgrep", "crond"]
1 change: 1 addition & 0 deletions docker/test.env
@@ -112,5 +112,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domain (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
1 change: 1 addition & 0 deletions documentation/Docker-deployment-batch.md
@@ -111,6 +111,7 @@ For example:
cat >> docker/local.env <<EOF
ENABLE_BATCH=True
ENABLE_HOF=False
CRON_15MIN_RUN_TESTS_BATCH=True
# user/password(s) for access to /grafana monitoring
MONITORING_AUTH_RAW=user:<htpasswd hash>
# allowed IP's to visit web interface without password
