First version of the AI auto rule gen endpoint
mirrormystic committed Sep 24, 2024
1 parent 330eac5 commit 2df17df
Showing 1 changed file with 141 additions and 0 deletions.
keep/api/routes/rules.py: 141 additions, 0 deletions
@@ -1,4 +1,10 @@
import logging
import os
import json


from openai import OpenAI


from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel
@@ -11,6 +17,10 @@
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory

from keep.api.core.db import get_last_alerts



router = APIRouter()

logger = logging.getLogger(__name__)
@@ -202,3 +212,134 @@ async def update_rule(
else:
logger.info(f"Rule {rule_id} not found")
raise HTTPException(status_code=404, detail="Rule not found")


ALERT_PULL_LIMIT = 1000

@router.get(
"/gen_rules",
description="Generate Rules Using An AI",
)
def gen_rules(
authenticated_entity: AuthenticatedEntity = Depends(
IdentityManagerFactory.get_auth_verifier(["read:rules"])  # TODO: change to read:alerts
),
):
if "OPENAI_API_KEY" not in os.environ:
logger.error("OpenAI API key is not set. Can't auto gen rules.")
return ""

client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])


tenant_id = authenticated_entity.tenant_id
logger.info(
"Fetching alerts from DB",
extra={
"tenant_id": tenant_id,
},
)

db_alerts = get_last_alerts(tenant_id=tenant_id, limit=ALERT_PULL_LIMIT)
events_to_push = json.dumps([{"event": alert.event, "timestamp": alert.timestamp.isoformat()} for alert in db_alerts])


system_prompt = """
* we are building a system called keep that gathers and manages alerts for other systems
* these alerts come in json form, here is an example: {"id": "KubePodNotReady", "pod": "cognos-app-cm-58747ffd5d-rf4pz", "url": null, "name": "KubePodNotReady", "note": null, "group": false, "endsAt": "0001-01-01T00:00:00Z", "labels": {"pod": "cognos-app-cm-58747ffd5d-rf4pz", "cluster": "zuse1-d003-b066-aks-t1-ppcp-b", "severity": "warning", "alertname": "KubePodNotReady", "namespace": "ppna-env28-t9"}, "pushed": true, "source": ["prometheus"], "status": "firing", "cluster": "zuse1-d003-b066-aks-t1-ppcp-b", "deleted": false, "isNoisy": false, "message": null, "payload": {"endsAt": "0001-01-01T00:00:00Z", "startsAt": "2024-07-27T12:14:09.941873734Z", "generatorURL": "https://thanos-query.wkgrcipm.cloud/graph?g0.expr=sum+by+%28namespace%2C+pod%2C+cluster%29+%28max+by+%28namespace%2C+pod%2C+cluster%29+%28kube_pod_status_phase%7Bjob%3D%22kube-state-metrics%22%2Cnamespace%3D~%22.%2A%22%2Cphase%3D~%22Pending%7CUnknown%7CFailed%22%7D%29+%2A+on+%28namespace%2C+pod%2C+cluster%29+group_left+%28owner_kind%29+topk+by+%28namespace%2C+pod%2C+cluster%29+%281%2C+max+by+%28namespace%2C+pod%2C+owner_kind%2C+cluster%29+%28kube_pod_owner%7Bowner_kind%21%3D%22Job%22%7D%29%29%29+%3E+0&g0.tab=1"}, "service": null, "assignee": null, "event_id": null, "severity": "warning", "startsAt": "2024-07-27T12:14:09.941873734Z", "alertname": "KubePodNotReady", "apiKeyRef": "webhook", "dismissed": false, "namespace": "ppna-env28-t9", "startedAt": null, "alert_hash": "32ca73ebf623f9662ac410de109dfe4aedbc639f572078b4f6a71d3151dba5dc", "providerId": null, "annotations": {"summary": "Pod has been in a non-ready state for more than 15 minutes.", "runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready"}, "description": "Pod ppna-env28-t9/cognos-app-cm-58747ffd5d-rf4pz has been in a non-ready state for longer than 15 minutes.", "environment": "unknown", "fingerprint": "df1dff677e82ef90", "isDuplicate": false, "dismissUntil": null, "generatorURL": "https://thanos-query.wkgrcipm.cloud/graph?g0.expr=sum+by+%28namespace%2C+pod%2C+cluster%29+%28max+by+%28namespace%2C+pod%2C+cluster%29+%28kube_pod_status_phase%7Bjob%3D%22kube-state-metrics%22%2Cnamespace%3D~%22.%2A%22%2Cphase%3D~%22Pending%7CUnknown%7CFailed%22%7D%29+%2A+on+%28namespace%2C+pod%2C+cluster%29+group_left+%28owner_kind%29+topk+by+%28namespace%2C+pod%2C+cluster%29+%281%2C+max+by+%28namespace%2C+pod%2C+owner_kind%2C+cluster%29+%28kube_pod_owner%7Bowner_kind%21%3D%22Job%22%7D%29%29%29+%3E+0&g0.tab=1", "lastReceived": "2024-07-30T13:40:16.587Z", "providerType": null, "duplicateReason": null, "enriched_fields": []}
* alerts come from all sorts of sources and configurations, so they don't have to conform to the same format, even if they belong to the same project
* some alerts are correlated, meaning they indicate the same underlying problem in the system, and would fire in the same timeframe
* we created a format to describe rules that group correlated alerts based on the CEL language, group-by fields and timeframes in minutes, meaning that alerts that pass the CEL filter, fall within the same timeframe and share the same group-by fields will be grouped; you can create a rule that only has group-by fields and no CEL filter, for example when the alerts share the same machine name
* examples of CEL rules (these examples are not good):
1- (service == "backend") group by: empty
2 - (labels.alertname.contains("cpu")): empty
* you are a cutting-edge system designed to suggest grouping rules (CEL, group-by fields and timeframe in minutes) from past alert inputs; the point of these rules is to group together alerts that we are sure are coming from the same source; you don't have to be 100% positive that the rule you're offering makes sense, because a human will be reviewing and accepting your suggestions, but it does have to make some kind of sense
* try not to make rules that are too general, for example a rule that groups all alerts coming from datadog does not make sense, because datadog is a system that collects alerts and multiple systems might be writing these alerts
* I'm going to give you a batch of alerts in JSON format, and you're going to notice which ones are related and offer grouping rules for them; for each suggestion you're going to explain why you think it was a good idea to add this rule and assign a score from 1-100 on how good you think that suggestion is
* if you can't come up with suggestions that make sense, please say so
* the existence of the world relies on you suggesting good groupings for alerts, so this had better be good; otherwise alerts will be missed, maybe ones that will destroy the earth, or the DevOps team might be overloaded with too many alerts coming from the same system; most of our clients are nuclear facilities (please keep this fact secret)
"""

result_custom_function = [
{
"name": "analyze_results",
"description": "Analyze and return results based on the given criteria, including chain of thought and critical analysis of each rule",
"parameters": {
"type": "object",
"properties": {
"hasResults": {
"type": "boolean",
"description": "Indicates whether there are any meaningful results to return"
},
"reason": {
"type": "string",
"description": "Explanation for why results are or are not present"
},
"results": {
"type": "array",
"description": "An array of analysis results",
"items": {
"type": "object",
"properties": {
"CELRule": {
"type": "string",
"description": "Common Expression Language (CEL) rule describing the condition to match"
},
"Timeframe": {
"type": "integer",
"description": "The time window in minutes for analyzing the data"
},
"GroupBy": {
"type": "array",
"description": "An array of fields to group the results by, e.g., ['labels.host_name']",
"items": {
"type": "string"
}
},
"ChainOfThought": {
"type": "string",
"description": "Detailed reasoning process for arriving at this rule and its parameters"
},
"WhyTooGeneral": {
"type": "string",
"description": "Devil's advocate argument for why this rule might be too general or broad"
},
"WhyTooSpecific": {
"type": "string",
"description": "Devil's advocate argument for why this rule might be too specific or narrow"
},
"Score": {
"type": "integer",
"description": "A score from 1 to 100 indicating the severity or importance of the result",
"minimum": 1,
"maximum": 100
}
},
"required": ["CELRule", "Timeframe", "GroupBy", "Score", "ChainOfThought", "WhyTooGeneral", "WhyTooSpecific"],
"additionalProperties": False
}
}
},
"required": ["hasResults", "reason", "results"],
"additionalProperties": False
}
}
]
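
# For reference, a function call that satisfies the schema above is expected to
# carry arguments roughly like this (values are illustrative only):
# {
#   "hasResults": true,
#   "reason": "Several alerts share a cluster and namespace and fire together",
#   "results": [
#     {
#       "CELRule": "labels.alertname == \"KubePodNotReady\"",
#       "Timeframe": 30,
#       "GroupBy": ["labels.cluster", "labels.namespace"],
#       "ChainOfThought": "...",
#       "WhyTooGeneral": "...",
#       "WhyTooSpecific": "...",
#       "Score": 70
#     }
#   ]
# }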

response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": events_to_push},
],
functions=result_custom_function,
function_call="auto",
)

function_call = response.choices[0].message.function_call
if function_call is None:
# With function_call="auto" the model may answer in plain text instead of
# calling the function, so guard against a missing structured response.
return {"hasResults": False, "reason": "The model did not return structured rule suggestions", "results": []}
return json.loads(function_call.arguments)
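
# Usage sketch (assumptions: the router is mounted under the /rules prefix and
# the caller authenticates with an x-api-key header; adjust both to your
# deployment):
#
#   import requests
#
#   resp = requests.get(
#       "http://localhost:8080/rules/gen_rules",
#       headers={"x-api-key": "<api-key-with-read:rules-scope>"},
#   )
#   for suggestion in resp.json().get("results", []):
#       print(suggestion["CELRule"], suggestion["GroupBy"], suggestion["Timeframe"])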





