Commit f4225c9

wip

aequitas committed Oct 15, 2024
1 parent fdaa0eb commit f4225c9

Showing 7 changed files with 290 additions and 0 deletions.
1 change: 1 addition & 0 deletions docker/batch-test.env
@@ -115,5 +115,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domain (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
281 changes: 281 additions & 0 deletions docker/cron/periodic/15min/tests-batch.py
@@ -0,0 +1,281 @@
#!/usr/bin/env python3

# run tests on example domains and write metrics to prometheus textfile

# for iterative development
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests-batch.py:/tests-batch.py \
# ghcr.io/internetstandards/cron:latest /tests-batch.py --debug

import sys
import os
import time
from prometheus_client import REGISTRY, Gauge, generate_latest
import prometheus_client
import logging
import requests
import datetime

log = logging.getLogger(__name__)

DEBUG = "--debug" in sys.argv

# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests-batch.prom"


DEFAULT_TEST_TIMEOUT = 200
TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL", DEFAULT_TEST_TIMEOUT))
REQUEST_TIMEOUT = 30

TESTS = ["web", "mail"]

IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
# talk directly to the internal app container as the webserver might
# have access restrictions in place
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

TEST_DOMAINS = {
    # domains to use in website tests
    "web": [
        "internet.nl",
        "example.nl",
        "example.com",
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        "forumstandaardisatie.nl",
        "minez.nl",
    ],
    # domains to use in mail tests
    "mail": [
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        # these are currently really slow and will probably improve when
        # we switch to sslyze, for now disable these in monitoring
        # "internet.nl",
        # "forumstandaardisatie.nl",
        # "minez.nl",
    ],
}

# METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test"])
# METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test"])
# METRIC_PROBE_RUNTIME = Gauge(
# "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test"]
# )
# METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test"])
# METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test"])

METRIC_BATCH_RUN = Gauge("tests_batch_run_total", "Tests that have been run.", ["test"])
METRIC_BATCH_FAILURE = Gauge("tests_batch_failure_total", "Test runs that failed.", ["test"])
METRIC_BATCH_SUCCESS = Gauge("tests_batch_success_total", "Test runs that succeeded.", ["test"])
METRIC_BATCH_TIMEOUT = Gauge("tests_batch_timeout_total", "Tests that ran into a timeout.", ["test"])
METRIC_BATCH_RUNTIME = Gauge("tests_batch_runtime_seconds", "Amount of time the test ran before completion.", ["test"])
METRIC_BATCH_STAGE_RUNTIME = Gauge("tests_batch_stage_runtime_seconds", "Amount of time a test stage ran before completion.", ["test", "stage"])

METRIC_BATCH_DOMAIN_OK = Gauge("tests_batch_domain_ok", "Whether the batch test for this domain completed ok.", ["test", "domain"])
METRIC_BATCH_DOMAIN_SCORE = Gauge("tests_batch_domain_score", "Score percentage for this domain in the batch test.", ["test", "domain"])


def run_tests_on_domain(test, domain):
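    # NOTE: single-domain test runner, not invoked by main() below; it references
    # per-probe metrics (METRIC_PROBE_*, METRIC_TEST_*) that are not defined in this
    # file and uses label sets that do not match the batch metrics defined above.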
    test_start = int(time.time())

    # initiate the test
    r = requests.get(
        f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
        timeout=REQUEST_TIMEOUT,
        allow_redirects=False,
        headers=HEADERS,
    )
    r.raise_for_status()
    log.debug(r.text)

    # abort early if cached result
    probes = r.json()
    if not [p for p in probes if not p["done"]]:
        METRIC_TEST_CACHE.labels(test, domain).set(1)
        return

    # poll probes until done
    finished_probes = set()
    while int(time.time()) < test_start + TEST_TIMEOUT:
        # get probe status
        r = requests.get(
            f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
            timeout=REQUEST_TIMEOUT,
            allow_redirects=False,
            headers=HEADERS,
        )
        r.raise_for_status()
        log.debug(r.text)

        # record probe statuses for probes that are finished
        probes = r.json()
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
                METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
                finished_probes.add(probe["name"])

        # stop when all probes are finished
        if not [p for p in probes if not p["done"]]:
            METRIC_BATCH_SUCCESS.labels(test, domain).set(1)
            break

        time.sleep(1)
    else:
        # test timed out because one or more of the probes was not done within time
        METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            # record not finished probes as failed
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])

    METRIC_BATCH_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

    # get additional metrics like score
    scores = list()
    for probe_name in finished_probes:
        try:
            r = requests.get(
                f"{URL_BASE}/{test}/{probe_name}/{domain}/?{time.time()}",
                timeout=REQUEST_TIMEOUT,
                allow_redirects=False,
                headers=HEADERS,
            )
            r.raise_for_status()
            if r.status_code == 200:
                probe_result = r.json()
                # only measure probe scores that count towards total score
                if probe_result["maxscore"]:
                    METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
                    scores.append(probe_result["totalscore"])
                METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
        except Exception:
            log.exception("failed to get probe score")

    if scores:
        METRIC_TEST_SCORE.labels(test, domain).set(max(min(int(sum(scores) / len(scores)), 100), 0))
    else:
        METRIC_TEST_SCORE.labels(test, domain).set(0)

def wait_for_request_status(url: str, expected_status: str, timeout: int = 10, interval: int = 1, auth=None):
    """Poll url and parse JSON for request.status, return if value matches expected status or
    fail when timeout expires."""
    max_tries = int(timeout / interval)

    tries = 0
    while tries < max_tries:
        status_response = requests.get(url, auth=auth, headers=HEADERS)
        status_response.raise_for_status()

        log.debug(status_response.text)
        status_data = status_response.json()
        if status_data["request"]["status"] == expected_status:
            break
        time.sleep(interval)
        tries += 1
    else:
        raise TimeoutError(f"request status never reached '{expected_status}' state")


def run_test_batch(test: str, domains: list[str]):
    request_data = {"type": test, "domains": domains, "name": "periodic"}

    auth = ("periodic_tests", "periodic_tests")
    api_url = URL_BASE + "/api/batch/v2/"

    test_start = int(time.time())

    # start batch request
    register_response = requests.post(api_url + "requests", json=request_data, auth=auth, headers=HEADERS)
    register_response.raise_for_status()
    log.debug(register_response.text)

    # get test_id from register data
    register_data = register_response.json()
    test_id = register_data["request"]["request_id"]

    # wait for the batch request to be registered
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "registering").time():
        wait_for_request_status(api_url + "requests/" + test_id, "registering", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to start
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "running").time():
        wait_for_request_status(api_url + "requests/" + test_id, "running", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to complete and report to be generated
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "generating").time():
        wait_for_request_status(api_url + "requests/" + test_id, "generating", timeout=TEST_TIMEOUT, auth=auth)

    # wait for report generation and batch to be done
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "done").time():
        wait_for_request_status(api_url + "requests/" + test_id, "done", timeout=TEST_TIMEOUT, auth=auth)

    # get batch results
    results_response = requests.get(api_url + "requests/" + test_id + "/results", auth=auth, headers=HEADERS)
    results_response.raise_for_status()
    log.debug(results_response.text)

    results_response_data = results_response.json()

    METRIC_BATCH_SUCCESS.labels(test).set(1 if results_response_data["request"]["status"] == "done" else 0)
    METRIC_BATCH_FAILURE.labels(test).set(0 if results_response_data["request"]["status"] == "done" else 1)
    METRIC_BATCH_RUNTIME.labels(test).set(int(time.time() - test_start))

    for domain, results in results_response_data["domains"].items():
        METRIC_BATCH_DOMAIN_OK.labels(test, domain).set(1 if results["status"] == "ok" else 0)
        METRIC_BATCH_DOMAIN_SCORE.labels(test, domain).set(results["scoring"]["percentage"])



def run_batch_tests():
    for test in TESTS:
        domains = TEST_DOMAINS[test]
        log.info(f"testing: {test} {domains}")

        METRIC_BATCH_RUN.labels(test).set(1)
        METRIC_BATCH_FAILURE.labels(test).set(0)
        METRIC_BATCH_TIMEOUT.labels(test).set(0)
        METRIC_BATCH_SUCCESS.labels(test).set(0)
        try:
            run_test_batch(test, domains)

        except Exception:
            log.exception("Error during test")
            METRIC_BATCH_FAILURE.labels(test).set(1)


def main():
    logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)

    # disable internal metrics
    REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)

    # run test probes against domains and collect metrics
    run_batch_tests()

    # write metrics to stdout or file in prometheus textfile format
    if DEBUG:
        print(generate_latest(REGISTRY).decode())
    else:
        with open(OUTPUT_TEXTFILE, "w") as f:
            f.write(generate_latest(REGISTRY).decode())


if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS_BATCH", "False") == "True":
    main()
3 changes: 3 additions & 0 deletions docker/defaults.env
@@ -240,6 +240,9 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=False
# enable running tests every 15 minutes for metrics collection
CRON_15MIN_RUN_TESTS=True

# enable running batch tests every 15 minutes for metrics collection; enable in local.env for batch deployments
CRON_15MIN_RUN_TESTS_BATCH=False

# enables internet.nl specific content (eg: contact information, faq, security.txt), only enable for internet.nl
# instances. For customization see: documentation/Customize.md
INTERNETNL_BRANDING=False
1 change: 1 addition & 0 deletions docker/develop.env
@@ -64,6 +64,7 @@ LOGGING_DRIVER=json-file
CRON_DAILY_POSTGRESQL_BACKUP=False
CRON_WEEKLY_POSTGRESQL_BACKUP=False
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=False

2 changes: 2 additions & 0 deletions docker/docker-compose.yml
@@ -674,6 +674,7 @@ services:
      - DB_PASSWORD=password
      - CRON_DAILY_POSTGRESQL_BACKUP
      - CRON_WEEKLY_POSTGRESQL_BACKUP
      - CRON_15MIN_RUN_TESTS_BATCH
      - IPV4_IP_APP_INTERNAL
      - INTERNETNL_DOMAINNAME
      - INTERNETNL_CACHE_TTL
@@ -701,6 +702,7 @@ services:
      - postgres-backups:/var/lib/postgresql/backups
      - nginx-logs-exporter:/var/log/nginx/prometheus-nginxlog-exporter/
      - prometheus-textfile-directory:/prometheus-textfile-directory
      - ./cron/periodic:/etc/periodic

    healthcheck:
      test: ["CMD", "pgrep", "crond"]
1 change: 1 addition & 0 deletions docker/test.env
@@ -112,5 +112,6 @@ CRON_WEEKLY_POSTGRESQL_BACKUP=True

# selftest runs against public domain (example.(nl|com)) which will never work in the test environment
CRON_15MIN_RUN_TESTS=False
CRON_15MIN_RUN_TESTS_BATCH=False

INTERNETNL_BRANDING=True
1 change: 1 addition & 0 deletions documentation/Docker-deployment-batch.md
@@ -111,6 +111,7 @@ For example:
cat >> docker/local.env <<EOF
ENABLE_BATCH=True
ENABLE_HOF=False
CRON_15MIN_RUN_TESTS_BATCH=True
# user/password(s) for access to /grafana monitoring
MONITORING_AUTH_RAW=user:<htpasswd hash>
# allowed IP's to visit web interface without password
