Showing 7 changed files with 290 additions and 0 deletions.
@@ -0,0 +1,281 @@
#!/usr/bin/env python3

# run tests on example domains and write metrics to prometheus textfile

# for iterative development:
# docker run -ti -e INTERNETNL_DOMAINNAME=internet.nl -v $PWD/docker/cron/periodic/15min/tests.py:/tests.py \
#   ghcr.io/internetstandards/cron:latest /tests.py --debug

import sys
import os
import time
import logging

import requests
import prometheus_client
from prometheus_client import REGISTRY, Gauge, generate_latest

log = logging.getLogger(__name__)

DEBUG = "--debug" in sys.argv

# file to write metrics to https://github.com/prometheus/node_exporter?tab=readme-ov-file#textfile-collector
OUTPUT_TEXTFILE = "/prometheus-textfile-directory/tests-batch.prom"
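
# for illustration: node_exporter picks this file up as standard Prometheus text
# exposition format, which generate_latest() below produces, e.g. (values are
# examples, not real measurements):
#
#   # HELP tests_batch_run_total Tests that have been run.
#   # TYPE tests_batch_run_total gauge
#   tests_batch_run_total{test="web"} 1.0
#   tests_batch_domain_score{test="web",domain="example.nl"} 85.0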

DEFAULT_TEST_TIMEOUT = 200
TEST_TIMEOUT = int(os.environ.get("INTERNETNL_CACHE_TTL", DEFAULT_TEST_TIMEOUT))
REQUEST_TIMEOUT = 30

TESTS = ["web", "mail"]

IPV4_IP_APP_INTERNAL = os.environ.get("IPV4_IP_APP_INTERNAL")
INTERNETNL_DOMAINNAME = os.environ.get("INTERNETNL_DOMAINNAME")
# talk directly to the internal app container as the webserver might
# have access restrictions in place
URL_BASE = f"http://{IPV4_IP_APP_INTERNAL}:8080"
HEADERS = {"Host": INTERNETNL_DOMAINNAME}

TEST_DOMAINS = {
    # domains to use in website tests
    "web": [
        "internet.nl",
        "example.nl",
        "example.com",
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        "forumstandaardisatie.nl",
        "minez.nl",
    ],
    # domains to use in mail tests
    "mail": [
        "internetsociety.org",
        "ripe.net",
        "surf.nl",
        "ecp.nl",
        # these are currently really slow and will probably improve when
        # we switch to sslyze, for now disable these in monitoring
        # "internet.nl",
        # "forumstandaardisatie.nl",
        # "minez.nl",
    ],
}

# the probe/test gauges below are referenced by run_tests_on_domain() but were
# not defined in this commit; these definitions are a reconstruction, with names
# and label sets inferred from the call sites
METRIC_PROBE_DONE = Gauge("tests_probe_done_total", "Whether the probe completed.", ["test", "domain", "probe"])
METRIC_PROBE_SUCCESS = Gauge("tests_probe_success_total", "Whether the probe succeeded.", ["test", "domain", "probe"])
METRIC_PROBE_RUNTIME = Gauge(
    "tests_probe_runtime_seconds", "Amount of time probe ran before done.", ["test", "domain", "probe"]
)
METRIC_PROBE_SCORE = Gauge("tests_probe_score", "Score of the probe.", ["test", "domain", "probe"])
METRIC_PROBE_PASSED = Gauge("tests_probe_pass", "Probe has passed.", ["test", "domain", "probe"])
METRIC_TEST_CACHE = Gauge("tests_test_cached_total", "Tests that were answered from cache.", ["test", "domain"])
METRIC_TEST_TIMEOUT = Gauge("tests_test_timeout_total", "Tests that ran into a timeout.", ["test", "domain"])
METRIC_TEST_SUCCESS = Gauge("tests_test_success_total", "Test runs that succeeded.", ["test", "domain"])
METRIC_TEST_RUNTIME = Gauge("tests_test_runtime_seconds", "Amount of time test ran before done.", ["test", "domain"])
METRIC_TEST_SCORE = Gauge("tests_test_score", "Total score of all probes in the test.", ["test", "domain"])

METRIC_BATCH_RUN = Gauge("tests_batch_run_total", "Tests that have been run.", ["test"])
METRIC_BATCH_FAILURE = Gauge("tests_batch_failure_total", "Test runs that failed.", ["test"])
METRIC_BATCH_SUCCESS = Gauge("tests_batch_success_total", "Test runs that succeeded.", ["test"])
METRIC_BATCH_TIMEOUT = Gauge("tests_batch_timeout_total", "Tests that ran into a timeout.", ["test"])
METRIC_BATCH_RUNTIME = Gauge("tests_batch_runtime_seconds", "Amount of time test ran before done.", ["test"])
METRIC_BATCH_STAGE_RUNTIME = Gauge(
    "tests_batch_stage_runtime_seconds", "Amount of time a test stage ran before done.", ["test", "stage"]
)

METRIC_BATCH_DOMAIN_OK = Gauge("tests_batch_domain_ok", "Whether all tests for the domain completed ok.", ["test", "domain"])
METRIC_BATCH_DOMAIN_SCORE = Gauge("tests_batch_domain_score", "Total score of all probes in the test.", ["test", "domain"])


def run_tests_on_domain(test, domain):
    # note: nothing in this script calls this function; run_batch_tests() below
    # drives the batch API instead
    test_start = int(time.time())

    # initiate the test
    r = requests.get(
        f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
        timeout=REQUEST_TIMEOUT,
        allow_redirects=False,
        headers=HEADERS,
    )
    r.raise_for_status()
    log.debug(r.text)

    # abort early if cached result
    probes = r.json()
    if not [p for p in probes if not p["done"]]:
        METRIC_TEST_CACHE.labels(test, domain).set(1)
        return

    # poll probes until done
    finished_probes = set()
    while int(time.time()) < test_start + TEST_TIMEOUT:
        # get probe status
        r = requests.get(
            f"{URL_BASE}/{test}/probes/{domain}/?{time.time()}",
            timeout=REQUEST_TIMEOUT,
            allow_redirects=False,
            headers=HEADERS,
        )
        r.raise_for_status()
        log.debug(r.text)

        # record probe statuses for probes that are finished
        probes = r.json()
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])
                METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
                finished_probes.add(probe["name"])

        # stop when all probes are finished
        if not [p for p in probes if not p["done"]]:
            # per-domain gauge; the batch gauges above only carry a "test" label
            METRIC_TEST_SUCCESS.labels(test, domain).set(1)
            break

        time.sleep(1)
    else:
        # test timed out because one or more of the probes was not done within time
        METRIC_TEST_TIMEOUT.labels(test, domain).set(1)
        for probe in probes:
            if probe["name"] in finished_probes:
                continue
            # record not finished probes as failed
            METRIC_PROBE_DONE.labels(test, domain, probe["name"]).set(probe["done"])
            METRIC_PROBE_RUNTIME.labels(test, domain, probe["name"]).set(int(time.time() - test_start))
            if probe["done"]:
                METRIC_PROBE_SUCCESS.labels(test, domain, probe["name"]).set(probe["success"])

    METRIC_TEST_RUNTIME.labels(test, domain).set(int(time.time() - test_start))

    # get additional metrics like score
    scores = list()
    for probe_name in finished_probes:
        try:
            r = requests.get(
                f"{URL_BASE}/{test}/{probe_name}/{domain}/?{time.time()}",
                timeout=REQUEST_TIMEOUT,
                allow_redirects=False,
                headers=HEADERS,
            )
            r.raise_for_status()
            if r.status_code == 200:
                probe_result = r.json()
                # only measure probe scores that count towards total score
                if probe_result["maxscore"]:
                    METRIC_PROBE_SCORE.labels(test, domain, probe_name).set(probe_result["totalscore"])
                    scores.append(probe_result["totalscore"])
                METRIC_PROBE_PASSED.labels(test, domain, probe_name).set(probe_result["verdict"] == "passed")
        except Exception:
            log.exception("failed to get probe score")

    if scores:
        # average of probe scores, clamped to 0..100
        METRIC_TEST_SCORE.labels(test, domain).set(max(min(int(sum(scores) / len(scores)), 100), 0))
    else:
        METRIC_TEST_SCORE.labels(test, domain).set(0)
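
# the probe endpoints polled above are assumed, from the key accesses in
# run_tests_on_domain(), to return JSON shaped roughly like this (probe names
# and values illustrative, not an exhaustive list):
#
#   /{test}/probes/{domain}/       -> [{"name": "ipv6", "done": true, "success": true}, ...]
#   /{test}/{probe}/{domain}/      -> {"maxscore": 100, "totalscore": 80, "verdict": "passed"}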


def wait_for_request_status(url: str, expected_status: str, timeout: int = 10, interval: int = 1, auth=None):
    """Poll url and parse JSON for request.status; return when the value matches
    the expected status or fail when the timeout expires."""
    max_tries = int(timeout / interval)

    tries = 0
    while tries < max_tries:
        status_response = requests.get(url, auth=auth, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        status_response.raise_for_status()

        log.debug(status_response.text)
        status_data = status_response.json()
        if status_data["request"]["status"] == expected_status:
            break
        time.sleep(interval)
        tries += 1
    else:
        raise TimeoutError(f"request status never reached '{expected_status}' state")


def run_test_batch(test: str, domains: list[str]):
    # use the test type ("web" or "mail") that was passed in, not a hardcoded one
    request_data = {"type": test, "domains": domains, "name": "periodic"}

    auth = ("periodic_tests", "periodic_tests")
    api_url = URL_BASE + "/api/batch/v2/"

    test_start = int(time.time())

    # start batch request
    register_response = requests.post(
        api_url + "requests", json=request_data, auth=auth, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    register_response.raise_for_status()
    log.debug(register_response.text)

    # get test_id from register data
    register_data = register_response.json()
    test_id = register_data["request"]["request_id"]

    # wait for batch test to be registered
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "registering").time():
        wait_for_request_status(api_url + "requests/" + test_id, "registering", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to start
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "running").time():
        wait_for_request_status(api_url + "requests/" + test_id, "running", timeout=TEST_TIMEOUT, auth=auth)

    # wait for batch tests to complete and report generation to start
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "generating").time():
        wait_for_request_status(api_url + "requests/" + test_id, "generating", timeout=TEST_TIMEOUT, auth=auth)

    # wait for report generation and batch to be done
    with METRIC_BATCH_STAGE_RUNTIME.labels(test, "done").time():
        wait_for_request_status(api_url + "requests/" + test_id, "done", timeout=TEST_TIMEOUT, auth=auth)

    # get batch results
    results_response = requests.get(
        api_url + "requests/" + test_id + "/results", auth=auth, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    results_response.raise_for_status()
    log.debug(results_response.text)

    results_response_data = results_response.json()

    METRIC_BATCH_SUCCESS.labels(test).set(1 if results_response_data["request"]["status"] == "done" else 0)
    METRIC_BATCH_FAILURE.labels(test).set(0 if results_response_data["request"]["status"] == "done" else 1)
    METRIC_BATCH_RUNTIME.labels(test).set(int(time.time() - test_start))

    for domain, results in results_response_data["domains"].items():
        METRIC_BATCH_DOMAIN_OK.labels(test, domain).set(1 if results["status"] == "ok" else 0)
        METRIC_BATCH_DOMAIN_SCORE.labels(test, domain).set(results["scoring"]["percentage"])
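
# the batch API interaction above assumes this lifecycle, inferred from the
# calls in run_test_batch(): POST /api/batch/v2/requests returns
# {"request": {"request_id": ..., "status": ...}}, the request then moves
# through "registering" -> "running" -> "generating" -> "done", and
# GET .../requests/{id}/results returns per-domain entries like
#
#   {"domains": {"example.nl": {"status": "ok", "scoring": {"percentage": 85}}}}
#
# (example values only)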


def run_batch_tests():
    for test in TESTS:
        domains = TEST_DOMAINS[test]
        log.info(f"testing: {test} {domains}")

        METRIC_BATCH_RUN.labels(test).set(1)
        METRIC_BATCH_FAILURE.labels(test).set(0)
        METRIC_BATCH_TIMEOUT.labels(test).set(0)
        METRIC_BATCH_SUCCESS.labels(test).set(0)
        try:
            run_test_batch(test, domains)
        except TimeoutError:
            # a stage never reached its expected status within TEST_TIMEOUT
            log.exception("Test batch timed out")
            METRIC_BATCH_TIMEOUT.labels(test).set(1)
            METRIC_BATCH_FAILURE.labels(test).set(1)
        except Exception:
            log.exception("Error during test")
            METRIC_BATCH_FAILURE.labels(test).set(1)


def main():
    logging.basicConfig(level=logging.DEBUG if DEBUG else logging.ERROR)

    # disable internal metrics
    REGISTRY.unregister(prometheus_client.GC_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PLATFORM_COLLECTOR)
    REGISTRY.unregister(prometheus_client.PROCESS_COLLECTOR)

    # run test probes against domains and collect metrics
    run_batch_tests()

    # write metrics to stdout or file in prometheus textfile format
    if DEBUG:
        print(generate_latest(REGISTRY).decode())
    else:
        with open(OUTPUT_TEXTFILE, "w") as f:
            f.write(generate_latest(REGISTRY).decode())


if __name__ == "__main__" and os.environ.get("CRON_15MIN_RUN_TESTS_BATCH", "False") == "True":
    main()
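
# this script only runs when explicitly enabled; in the cron container that is
# assumed to be done by setting (value illustrative):
#
#   CRON_15MIN_RUN_TESTS_BATCH=True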