Skip to content

Commit

Permalink
implement batched queries, agg info, and refactored fetchers out to a…
Browse files Browse the repository at this point in the history
… lib

Signed-off-by: Irving Popovetsky <[email protected]>
  • Loading branch information
irvingpop committed Jul 25, 2024
1 parent 6a246a3 commit 7f196f4
Show file tree
Hide file tree
Showing 3 changed files with 228 additions and 107 deletions.
180 changes: 180 additions & 0 deletions tools/slo_report/lib/fetchers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import json
import sys

from .hnyapi import hnyapi_request, query_factory, craft_query_body

class HoneycombFetcher:
    """Fetches datasets, SLOs, burn alerts, and SLI query results from the Honeycomb API.

    All requests are authenticated with the API key supplied at construction
    time; when ``debug`` is true, diagnostic progress lines are printed.
    """

    def __init__(self, api_key, debug=False):
        self.api_key = api_key
        self.debug = debug

    def fetch_auth_info(self):
        """
        Fetch the auth info for the current user.

        Returns a human-readable "Current team: ..., environment: ..." string.
        """
        response = hnyapi_request('auth', self.api_key)
        return f"Current team: {response['team']['name']}, environment: {response['environment']['name']}"

    def fetch_all_datasets(self):
        """
        Fetch all datasets in a team and return them as a list of dataset names.
        """
        response = hnyapi_request('datasets', self.api_key)
        return [dataset['name'] for dataset in response]

    def fetch_all_slos_for_dataset(self, dataset):
        """
        Fetch all SLOs in a dataset and return them all as a list of dicts.
        """
        if self.debug:
            print(f"fetching SLOs for dataset: {dataset}")
        endpoint = f'slos/{dataset}'
        response = hnyapi_request(endpoint, self.api_key)

        all_slos = []
        for slo in response:
            if self.debug:
                print(f" slo {slo['name']} : {slo['id']}")
            all_slos.append(slo)

        return all_slos

    def fetch_burn_alerts_for_slo(self, dataset, slo_id):
        """
        Fetch burn alerts for a specific SLO in a dataset.
        """
        if self.debug:
            print(f"fetching burn alerts for dataset: {dataset}, slo_id: {slo_id}")

        endpoint = f'burn_alerts/{dataset}?slo_id={slo_id}'
        return hnyapi_request(endpoint, self.api_key)

    def fetch_all_slos(self):
        """
        Fetch every SLO across all datasets.

        Each returned SLO dict is annotated with its 'dataset' name and its
        'burn_alerts' list. Exits the process when no datasets exist.
        """
        all_datasets = self.fetch_all_datasets()
        if all_datasets is None:
            print('No datasets found')
            sys.exit(1)

        all_slos = []
        for dataset in all_datasets:
            for slo in self.fetch_all_slos_for_dataset(dataset):
                slo['dataset'] = dataset
                slo['burn_alerts'] = self.fetch_burn_alerts_for_slo(dataset, slo['id'])
                all_slos.append(slo)

        return all_slos

    def fetch_sli_data(self, sli, dataset):
        """
        Fetch SLI data for a single SLO (last hour) and return it as json.
        """
        query = craft_query_body(time_range=3600, breakdowns=[sli, "service.name"], calculations=[{"op": "COUNT"}])
        # BUG FIX: was the undefined global `api_key`; use the instance's key.
        return query_factory(dataset, query, self.api_key)

    def fetch_sli_service_values_counts(self, slos, dataset):
        """
        Fetch SLI data for a list of SLOs (last 24h) and return the SLOs
        annotated with aggregated counts of their SLI values.
        """
        sli_names = [slo['sli']['alias'] for slo in slos]

        # One WHERE clause per SLI column: only rows where the SLI exists.
        where_array = [{"column": sli, "op": "exists"} for sli in sli_names]
        breakdowns = sli_names + ["service.name"]

        qb = craft_query_body(time_range=86400, filters=where_array, breakdowns=breakdowns, calculations=[{"op": "COUNT"}])
        qr = query_factory(dataset, qb, self.api_key)

        if self.debug:
            print(json.dumps(qr, indent=2))
        return self.agg_results(slos, qr['data']['results'])

    def agg_results(self, slos, results):
        """
        Aggregate the results of a query onto each SLO:
        - group by SLI
        - SUM of COUNT of true and false values
        - deduped array of matching service.name values for each SLI

        Adds 'sli_values', 'sli_service_names', 'sli_event_count', and
        'sli_service_count' keys to each SLO dict and returns the list.
        """
        # iterate through sli list and aggregate results
        for slo in slos:
            # initialize agg info on each slo
            slo['sli_values'] = {"true": 0, "false": 0}
            slo['sli_service_names'] = []

            sli = slo['sli']['alias']

            for result in results:
                res = result['data']
                # .get: a result row may not contain this SLI's column at all
                value = res.get(sli)
                if value is True or value is False:
                    bucket = "true" if value else "false"
                    slo['sli_values'][bucket] += res['COUNT']
                    # add deduped to service list
                    if res['service.name'] not in slo['sli_service_names']:
                        slo['sli_service_names'].append(res['service.name'])
                    if self.debug:
                        print(f"SLI: {sli}, {bucket} COUNT: {res['COUNT']}")

            slo['sli_event_count'] = slo['sli_values']["true"] + slo['sli_values']["false"]
            slo['sli_service_count'] = len(slo['sli_service_names'])

        # BUG FIX: this dump of every SLO was unconditional (leftover debug
        # output); gate it behind the debug flag like all other diagnostics.
        if self.debug:
            print(json.dumps(slos, indent=2))
        return slos


# example result:
# "data": {
# "series": [],
# "results": [
# {
# "data": {
# "COUNT": 1023687,
# "service.name": "frontend",
# "sli.frontend-latency-3500": true,
# "sli.frontend-root-latency-4000ms": true,
# "zoc-doctest-availibility": true
# }
# },
# {
# "data": {
# "COUNT": 6187,
# "service.name": "frontend",
# "sli.frontend-latency-3500": true,
# "sli.frontend-root-latency-4000ms": true,
# "zoc-doctest-availibility": false
# }
# },
# {
# "data": {
# "COUNT": 221,
# "service.name": "frontend",
# "sli.frontend-latency-3500": false,
# "sli.frontend-root-latency-4000ms": true,
# "zoc-doctest-availibility": true
# }
# },
# {
# "data": {
# "COUNT": 188,
# "service.name": "frontend",
# "sli.frontend-latency-3500": false,
# "sli.frontend-root-latency-4000ms": false,
# "zoc-doctest-availibility": true
# }
# }
# ]
# },
6 changes: 3 additions & 3 deletions tools/slo_report/lib/hnyapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def create_query(dataset, query, api_key):
print(f"Creating query for dataset: {dataset}") if DEBUG else None
url = HONEYCOMB_API + 'queries/' + dataset
response = session.post(url, headers={"X-Honeycomb-Team": api_key, "Content-Type": "application/json"}, json=query)
print(response.text) if DEBUG else None
# print(response.text) if DEBUG else None
response.raise_for_status()
return response.json()

Expand All @@ -25,7 +25,7 @@ def create_query_result(dataset, query_id, api_key):
url = HONEYCOMB_API + 'query_results/' + dataset
qrjson = {"query_id": query_id, "disable_series": True, "limit": 10000}
response = session.post(url, headers={"X-Honeycomb-Team": api_key, "Content-Type": "application/json"}, json=qrjson)
print(response.text) if DEBUG else None
# print(response.text) if DEBUG else None
response.raise_for_status()
return response.json()

Expand All @@ -34,7 +34,7 @@ def get_query_result(dataset, query_result_id, api_key):
print(f"Getting query result for query_result_id: {query_result_id}") if DEBUG else None
url = HONEYCOMB_API + 'query_results/' + dataset + '/' + query_result_id
response = session.get(url, headers={"X-Honeycomb-Team": api_key})
print(response.text) if DEBUG else None
# print(response.text) if DEBUG else None
response.raise_for_status()
return response.json()

Expand Down
149 changes: 45 additions & 104 deletions tools/slo_report/slo_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,104 +8,43 @@
# - A Honeycomb API key with the "Manage Queries and Columns" permission

import argparse
import requests
import os
import sys
import signal
import json
from lib.hnyapi import hnyapi_request, query_factory, craft_query_body
import os, sys, signal
from itertools import batched, groupby

from lib.fetchers import HoneycombFetcher

BATCH_SIZE = 10

api_key = None
DEBUG = True if os.environ.get('DEBUG') else False

def fetch_auth_info():
"""
Fetch the auth info for the current user
"""
response = hnyapi_request('auth', api_key)
return f"Current team: {response['team']['name']}, environment: {response['environment']['name']}"

def fetch_all_datasets():
"""
Fetch all datasets in a team and return them as a list of dataset IDs
"""
response = hnyapi_request('datasets', api_key)

all_datasets = []
for dataset in response:
all_datasets.append(dataset['name'])

return all_datasets

# Use the get all SLOs API: https://docs.honeycomb.io/api/tag/SLOs#operation/listSlos [docs.honeycomb.io]
def fetch_all_slos_for_dataset(dataset):
"""
Fetch all SLOs in a dataset and return them all as json
"""
print(f"fetching SLOs for dataset: {dataset}") if DEBUG else None
endpoint = 'slos/' + dataset
response = hnyapi_request(endpoint, api_key)

all_slos = []
for slo in response:
print(f" slo {slo['name']} : {slo['id']}") if DEBUG else None
all_slos.append(slo)

return all_slos

# Get the burn alert data: https://docs.honeycomb.io/api/tag/Burn-Alerts#operation/listBurnAlertsBySlo [docs.honeycomb.io]
def fetch_burn_alerts_for_slo(dataset, slo_id):
"""
Fetch all burn alerts for a SLO and return them all as json
"""
endpoint = 'burn_alerts/' + dataset + '?slo_id=' + slo_id
response = hnyapi_request(endpoint, api_key)

all_burn_alerts = []
for burn_alert in response:
all_burn_alerts.append(burn_alert)

return all_burn_alerts

def fetch_all_slos():
all_datasets = fetch_all_datasets()
if all_datasets is None:
print('No datasets found')
sys.exit(1)

all_slos = []
for dataset in all_datasets:
slos_for_dataset = fetch_all_slos_for_dataset(dataset)
for slo in slos_for_dataset:
slo['dataset'] = dataset
slo['burn_alerts'] = fetch_burn_alerts_for_slo(dataset, slo['id'])
all_slos.append(slo)

return all_slos

# Run a query data api query where the SLI exists, once for a count and once for group by service name, and possibly one more time for where value = true?
# using
def fetch_sli_data(sli, dataset):
"""
Fetch SLI data for a SLO and return it as json
"""
query = craft_query_body(time_range=3600, breakdowns=[sli, "service.name"], calculations=[{"op": "COUNT"}])
query_result = query_factory(dataset, query, api_key)

return query_result


# The output of this is a production readiness check that tells you Service & SLO quality:



# Does a service have a SLO - x referenced from ServiceNow / cmdb?
# Is the SLI scoped to too many events
# Is the SLI scoped to too few events
# Are all events succeeding
# Are none of the events succeeding
# Is the SLO burning uncontrollably
# Is a burn alert configured



# expected output:
# {
# "slo_id": "sdfsdfs",
# "slo_name": "SLO Name",
# "sli_name": "SLI Name",
# "sli_expression": "IF(AND(CONTAINS($service.name, \"FOO\"), EQUALS($service.role_type, \"prod\"), EQUALS($cloud.region, \"ap-southeast-2\"), EXISTS($http.status_code)), LT($http.status_code, 500))",
# "dataset": "prod",
# "sli_event_count": 65659,
# "sli_service_count" : 2,
# "sli_values": {
# "true": 65659
# },
# "region": "prod",
# "count_date" : "2024-07-09"
# }


# simplified query specs from original:

# COUNT, COUNT_DISTINCT(service.name) WHERE <sli> exists [24 hours] |
# COUNT WHERE <sli> exists GROUP BY <sli> [24 hours] |
# COUNT WHERE <sli> = true [7 days] |
# COUNT WHERE <sli> exists GROUP BY <sli> [7 days]
# COUNT WHERE <sli> exists GROUP BY <sli> [7 days]

if __name__ == "__main__":
try:
Expand All @@ -127,22 +66,24 @@ def fetch_sli_data(sli, dataset):
print('You must provide an API key via the -k flag or the HONEYCOMB_API_KEY environment variable')
sys.exit(1)

fetcher = HoneycombFetcher(api_key, debug=DEBUG)

# fetch all SLOs
auth_info = fetch_auth_info()
auth_info = fetcher.fetch_auth_info()
print('Fetching all SLOs for ' + auth_info + "\n\n")

all_slos = fetch_all_slos()
all_slos = fetcher.fetch_all_slos()
if all_slos is None:
sys.exit(1)

for slo in all_slos:
print(f"Dataset: {slo['dataset']}, SLO: {slo['name']}, ID: {slo['id']}, SLI: {slo['sli']['alias']}")
for burn_alert in slo['burn_alerts']:
print(f" Burn alert: {burn_alert['alert_type']}, ID: {burn_alert['id']}")

print("Fetching SLI data ...")
fetch_sli_data(slo['sli']['alias'], slo['dataset'])

# group all SLOs by dataset
for dataset, slos_group in groupby(all_slos, key=lambda slo: slo['dataset']):
print (f"Running batches for Dataset: {dataset}")
# take batches of 10 SLOs and fetch SLI data for them
for slo_batch in batched(slos_group, BATCH_SIZE):
slo_names = [slo['name'] for slo in slo_batch]
print(slo_names)
sli_data = fetcher.fetch_sli_service_values_counts(slo_batch, dataset)

except KeyboardInterrupt: # Suppress tracebacks on SIGINT
print('\nExiting early, not done ...\n')
Expand Down

0 comments on commit 7f196f4

Please sign in to comment.