diff --git a/tools/slo_report/lib/fetchers.py b/tools/slo_report/lib/fetchers.py
new file mode 100644
index 0000000..9622b5e
--- /dev/null
+++ b/tools/slo_report/lib/fetchers.py
@@ -0,0 +1,181 @@
+from .hnyapi import hnyapi_request, query_factory, craft_query_body
+import json
+import sys
+
+class HoneycombFetcher:
+    def __init__(self, api_key, debug=False):
+        self.api_key = api_key
+        self.debug = debug
+
+    def fetch_auth_info(self):
+        """
+        Fetch the auth info for the current user
+        """
+        response = hnyapi_request('auth', self.api_key)
+        return f"Current team: {response['team']['name']}, environment: {response['environment']['name']}"
+
+    def fetch_all_datasets(self):
+        """
+        Fetch all datasets in a team and return them as a list of dataset IDs
+        """
+        response = hnyapi_request('datasets', self.api_key)
+
+        all_datasets = [dataset['name'] for dataset in response]
+        return all_datasets
+
+    def fetch_all_slos_for_dataset(self, dataset):
+        """
+        Fetch all SLOs in a dataset and return them all as json
+        """
+        if self.debug:
+            print(f"fetching SLOs for dataset: {dataset}")
+        endpoint = f'slos/{dataset}'
+        response = hnyapi_request(endpoint, self.api_key)
+
+        all_slos = []
+        for slo in response:
+            if self.debug:
+                print(f"  slo {slo['name']} : {slo['id']}")
+            all_slos.append(slo)
+
+        return all_slos
+
+    def fetch_burn_alerts_for_slo(self, dataset, slo_id):
+        """
+        Fetch burn alerts for a specific SLO in a dataset
+        """
+        if self.debug:
+            print(f"fetching burn alerts for dataset: {dataset}, slo_id: {slo_id}")
+
+        endpoint = f'burn_alerts/{dataset}?slo_id={slo_id}'
+        response = hnyapi_request(endpoint, self.api_key)
+
+        return response
+
+    def fetch_all_slos(self):
+        all_datasets = self.fetch_all_datasets()
+        if all_datasets is None:
+            print('No datasets found')
+            sys.exit(1)
+
+        all_slos = []
+        for dataset in all_datasets:
+            slos_for_dataset = self.fetch_all_slos_for_dataset(dataset)
+            for slo in slos_for_dataset:
+                slo['dataset'] = dataset
+                slo['burn_alerts'] = self.fetch_burn_alerts_for_slo(dataset, slo['id'])
+                all_slos.append(slo)
+
+        return all_slos
+
+    def fetch_sli_data(self, sli, dataset):
+        """
+        Fetch SLI data for a SLO and return it as json
+        """
+        query = craft_query_body(time_range=3600, breakdowns=[sli, "service.name"], calculations=[{"op": "COUNT"}])
+        query_result = query_factory(dataset, query, self.api_key)
+
+        return query_result
+
+    def fetch_sli_service_values_counts(self, slos, dataset):
+        """
+        Fetch SLI data for a list of SLOs and return the counts of service values
+        """
+        sli_names = [slo['sli']['alias'] for slo in slos]
+
+        where_array = []
+        for sli in sli_names:
+            where_array.append({"column": sli, "op": "exists"})
+
+        breakdowns = sli_names + ["service.name"]
+
+        qb = craft_query_body(time_range=86400, filters=where_array, breakdowns=breakdowns, calculations=[{"op": "COUNT"}])
+        qr = query_factory(dataset, qb, self.api_key)
+
+        if self.debug:
+            print(json.dumps(qr, indent=2))
+        return self.agg_results(slos, qr['data']['results'])
+
+    def agg_results(self, slos, results):
+        """
+        Aggregate the results of a query:
+        - group by SLI
+        - SUM of COUNT of true and false values
+        - array of matching service.name values for each SLI
+        """
+
+        # iterate through sli list and aggregrate results
+        for slo in slos:
+            # initialize agg info to each slo
+            slo['sli_values'] = {"true": 0, "false": 0}
+            slo['sli_service_names'] = []
+
+            sli = slo['sli']['alias']
+
+            for result in results:
+                res = result['data']
+                # sum up all true counts
+                if res[sli] == True:
+                    slo['sli_values']["true"] += res['COUNT']
+                    # add deduped to service list
+                    if res['service.name'] not in slo['sli_service_names']:
+                        slo["sli_service_names"].append(res['service.name'])
+                    print(f"SLI: {sli}, true COUNT: {res['COUNT']}") if self.debug else None
+
+                # sum up all false counts
+                if res[sli] == False:
+                    slo['sli_values']["false"] += res['COUNT']
+                    # add deduped to service list
+                    if res['service.name'] not in slo['sli_service_names']:
+                        slo["sli_service_names"].append(res['service.name'])
+                    print(f"SLI: {sli}, false COUNT: {res['COUNT']}") if self.debug else None
+
+            slo['sli_event_count'] = slo['sli_values']["true"] + slo['sli_values']["false"]
+            slo['sli_service_count'] = len(slo['sli_service_names'])
+
+        print(json.dumps(slos, indent=2)) if self.debug else None
+        return slos
+
+
+# example result:
+# "data": {
+#     "series": [],
+#     "results": [
+#         {
+#             "data": {
+#                 "COUNT": 1023687,
+#                 "service.name": "frontend",
+#                 "sli.frontend-latency-3500": true,
+#                 "sli.frontend-root-latency-4000ms": true,
+#                 "zoc-doctest-availibility": true
+#             }
+#         },
+#         {
+#             "data": {
+#                 "COUNT": 6187,
+#                 "service.name": "frontend",
+#                 "sli.frontend-latency-3500": true,
+#                 "sli.frontend-root-latency-4000ms": true,
+#                 "zoc-doctest-availibility": false
+#             }
+#         },
+#         {
+#             "data": {
+#                 "COUNT": 221,
+#                 "service.name": "frontend",
+#                 "sli.frontend-latency-3500": false,
+#                 "sli.frontend-root-latency-4000ms": true,
+#                 "zoc-doctest-availibility": true
+#             }
+#         },
+#         {
+#             "data": {
+#                 "COUNT": 188,
+#                 "service.name": "frontend",
+#                 "sli.frontend-latency-3500": false,
+#                 "sli.frontend-root-latency-4000ms": false,
+#                 "zoc-doctest-availibility": true
+#             }
+#         }
+#     ]
+# },
diff --git a/tools/slo_report/lib/hnyapi.py b/tools/slo_report/lib/hnyapi.py
index 7ba3667..973df9f 100644
--- a/tools/slo_report/lib/hnyapi.py
+++ b/tools/slo_report/lib/hnyapi.py
@@ -15,7 +15,7 @@ def create_query(dataset, query, api_key):
     print(f"Creating query for dataset: {dataset}") if DEBUG else None
     url = HONEYCOMB_API + 'queries/' + dataset
     response = session.post(url, headers={"X-Honeycomb-Team": api_key, "Content-Type": "application/json"}, json=query)
-    print(response.text) if DEBUG else None
+    # print(response.text) if DEBUG else None
     response.raise_for_status()
     return response.json()
 
@@ -25,7 +25,7 @@ def create_query_result(dataset, query_id, api_key):
    print(f"Creating query result for query_id: {query_id}") if DEBUG else None
    url = HONEYCOMB_API + 'query_results/' + dataset
    qrjson = {"query_id": query_id, "disable_series": True, "limit": 10000}
    response = session.post(url, headers={"X-Honeycomb-Team": api_key, "Content-Type": "application/json"}, json=qrjson)
-    print(response.text) if DEBUG else None
+    # print(response.text) if DEBUG else None
    response.raise_for_status()
    return response.json()
@@ -34,7 +34,7 @@ def get_query_result(dataset, query_result_id, api_key):
     print(f"Getting query result for query_result_id: {query_result_id}") if DEBUG else None
     url = HONEYCOMB_API + 'query_results/' + dataset + '/' + query_result_id
     response = session.get(url, headers={"X-Honeycomb-Team": api_key})
-    print(response.text) if DEBUG else None
+    # print(response.text) if DEBUG else None
     response.raise_for_status()
     return response.json()
 
diff --git a/tools/slo_report/slo_report.py b/tools/slo_report/slo_report.py
index 7f7a3a8..d867c37 100755
--- a/tools/slo_report/slo_report.py
+++ b/tools/slo_report/slo_report.py
@@ -8,104 +8,44 @@
 # - A Honeycomb API key with the "Manage Queries and Columns" permission
 
 import argparse
-import requests
-import os
-import sys
-import signal
-import json
-from lib.hnyapi import hnyapi_request, query_factory, craft_query_body
+import os, sys, signal
+# NOTE: itertools.batched requires Python 3.12+
+from itertools import batched, groupby
+
+from lib.fetchers import HoneycombFetcher
+
+BATCH_SIZE = 10
 
 api_key = None
 DEBUG = True if os.environ.get('DEBUG') else False
 
-def fetch_auth_info():
-    """
-    Fetch the auth info for the current user
-    """
-    response = hnyapi_request('auth', api_key)
-    return f"Current team: {response['team']['name']}, environment: {response['environment']['name']}"
-
-def fetch_all_datasets():
-    """
-    Fetch all datasets in a team and return them as a list of dataset IDs
-    """
-    response = hnyapi_request('datasets', api_key)
-
-    all_datasets = []
-    for dataset in response:
-        all_datasets.append(dataset['name'])
-
-    return all_datasets
-
-# Use the get all SLOs API: https://docs.honeycomb.io/api/tag/SLOs#operation/listSlos
-def fetch_all_slos_for_dataset(dataset):
-    """
-    Fetch all SLOs in a dataset and return them all as json
-    """
-    print(f"fetching SLOs for dataset: {dataset}") if DEBUG else None
-    endpoint = 'slos/' + dataset
-    response = hnyapi_request(endpoint, api_key)
-
-    all_slos = []
-    for slo in response:
-        print(f"  slo {slo['name']} : {slo['id']}") if DEBUG else None
-        all_slos.append(slo)
-
-    return all_slos
-
-# Get the burn alert data: https://docs.honeycomb.io/api/tag/Burn-Alerts#operation/listBurnAlertsBySlo
-def fetch_burn_alerts_for_slo(dataset, slo_id):
-    """
-    Fetch all burn alerts for a SLO and return them all as json
-    """
-    endpoint = 'burn_alerts/' + dataset + '?slo_id=' + slo_id
-    response = hnyapi_request(endpoint, api_key)
-
-    all_burn_alerts = []
-    for burn_alert in response:
-        all_burn_alerts.append(burn_alert)
-
-    return all_burn_alerts
-
-def fetch_all_slos():
-    all_datasets = fetch_all_datasets()
-    if all_datasets is None:
-        print('No datasets found')
-        sys.exit(1)
-
-    all_slos = []
-    for dataset in all_datasets:
-        slos_for_dataset = fetch_all_slos_for_dataset(dataset)
-        for slo in slos_for_dataset:
-            slo['dataset'] = dataset
-            slo['burn_alerts'] = fetch_burn_alerts_for_slo(dataset, slo['id'])
-            all_slos.append(slo)
-
-    return all_slos
-
-# Run a query data api query where the SLI exists, once for a count and once for group by service name, and possibly one more time for where value = true?
-# using
-def fetch_sli_data(sli, dataset):
-    """
-    Fetch SLI data for a SLO and return it as json
-    """
-    query = craft_query_body(time_range=3600, breakdowns=[sli, "service.name"], calculations=[{"op": "COUNT"}])
-    query_result = query_factory(dataset, query, api_key)
-
-    return query_result
-
-
-# The output of this is a production readiness check that tells you Service & SLO quality:
-
-
-
-# Does a service have a SLO - x referenced from ServiceNow / cmdb?
-# Is the SLI scoped to too many events
-# Is the SLI scoped to too few events
-# Are all events succeeding
-# Are none of the events succeeding
-# Is the SLO burning uncontrollably
-# Is a burn alert configured
+
+
+
+# expected output:
+# {
+#     "slo_id": "sdfsdfs",
+#     "slo_name": "SLO Name",
+#     "sli_name": "SLI Name",
+#     "sli_expression": "IF(AND(CONTAINS($service.name, \"FOO\"), EQUALS($service. role_type, \"prod\*), EQUALS(Scloud. region, \"ap-southeast-2\"), EXISTS($http.status_code)), LT(Shttp. status_code, 500))",
+#     "dataset": "prod",
+#     "sli_event_count": 65659,
+#     "sli_service_count" : 2,
+#     "sli_values": {
+#         "true": 65659
+#     },
+#     "region": "prod",
+#     "count_date" : "2024-07-09"
+# }
+
+
+# simplified query specs from original:
+
+# COUNT, COUNT_DISTINCT(service.name) WHERE exists [24 hours] |
+# COUNT WHERE exists GROUP BY [24 hours] |
+# COUNT WHERE = true [7 days] |
+# COUNT WHERE exists GROUP BY [7 days]
+# COUNT WHERE exists GROUP BY [7 days]
 
 if __name__ == "__main__":
     try:
@@ -127,22 +67,24 @@ def fetch_sli_data(sli, dataset):
             print('You must provide an API key via the -k flag or the HONEYCOMB_API_KEY environment variable')
             sys.exit(1)
 
+        fetcher = HoneycombFetcher(api_key, debug=DEBUG)
+
         # fetch all SLOs
-        auth_info = fetch_auth_info()
+        auth_info = fetcher.fetch_auth_info()
         print('Fetching all SLOs for ' + auth_info + "\n\n")
 
-        all_slos = fetch_all_slos()
+        all_slos = fetcher.fetch_all_slos()
         if all_slos is None:
             sys.exit(1)
 
-        for slo in all_slos:
-            print(f"Dataset: {slo['dataset']}, SLO: {slo['name']}, ID: {slo['id']}, SLI: {slo['sli']['alias']}")
-            for burn_alert in slo['burn_alerts']:
-                print(f"  Burn alert: {burn_alert['alert_type']}, ID: {burn_alert['id']}")
-
-            print("Fetching SLI data ...")
-            fetch_sli_data(slo['sli']['alias'], slo['dataset'])
-
+        # group all SLOs by dataset
+        for dataset, slos_group in groupby(all_slos, key=lambda slo: slo['dataset']):
+            print(f"Running batches for Dataset: {dataset}")
+            # take batches of 10 SLOs and fetch SLI data for them
+            for slo_batch in batched(slos_group, BATCH_SIZE):
+                slo_names = [slo['name'] for slo in slo_batch]
+                print(slo_names)
+                sli_data = fetcher.fetch_sli_service_values_counts(slo_batch, dataset)
     except KeyboardInterrupt:
         # Suppress tracebacks on SIGINT
         print('\nExiting early, not done ...\n')