Skip to content

Commit

Permalink
feat: add datadog provider (keephq#136)
Browse files Browse the repository at this point in the history
  • Loading branch information
shahargl authored May 21, 2023
1 parent 3e6b8df commit 3f1a71f
Show file tree
Hide file tree
Showing 6 changed files with 300 additions and 5 deletions.
2 changes: 1 addition & 1 deletion keep/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def diff(iterable: iter) -> bool:
return not all(iterable)


def len(iterable):
def len(iterable=()):
    """Return the number of items in *iterable*; with no argument, return 0.

    BUG FIX: the default was the mutable literal ``[]``, which is shared
    across calls; an immutable empty tuple gives the same ``len() == 0``
    behavior with no shared-state hazard.
    """
    return _len(iterable)


Expand Down
21 changes: 17 additions & 4 deletions keep/iohandler/iohandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def parse(self, string):
string = self._render(string)

# Now, extract the token if exists -
pattern = r"\bkeep\.\w+\((?:[^()]|\((?:[^()]|)*\))*\)"
pattern = r"\bkeep\.\w+\((?:[^()]*|\((?:[^()]*|\([^()]*\))*\))*\)"
parsed_string = copy.copy(string)
matches = re.findall(pattern, parsed_string)
tokens = list(matches)
Expand Down Expand Up @@ -124,6 +124,10 @@ def _parse(tree):
try:
# TODO(shahargl): when Keep gonna be self hosted, this will be a security issue!!!
# because the user can run any python code need to find a way to limit the functions that can be used

# https://github.com/keephq/keep/issues/138
from dateutil.tz import tzutc

_arg = eval(_arg)
except ValueError:
pass
Expand All @@ -136,9 +140,18 @@ def _parse(tree):

try:
tree = ast.parse(token)
except SyntaxError:
# for strings such as "45%\n", we need to escape
tree = ast.parse(token.encode("unicode_escape"))
except SyntaxError as e:
if "unterminated string literal" in str(e):
# try to HTML escape the string
# this is happens when libraries such as datadog api client
# HTML escapes the string and then ast.parse fails ()
# https://github.com/keephq/keep/issues/137
import html

tree = ast.parse(html.unescape(token))
else:
# for strings such as "45%\n", we need to escape
tree = ast.parse(token.encode("unicode_escape"))
return _parse(tree)

def _render(self, key):
Expand Down
Empty file.
123 changes: 123 additions & 0 deletions keep/providers/datadog_provider/datadog_alert_format_description.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from typing import Literal

from pydantic import BaseModel, Field


class Thresholds(BaseModel):
    # Monitor alert thresholds: the metric levels at which the monitor
    # changes state (Datadog monitor "options.thresholds").
    # NOTE(review): all fields are required here, but the Datadog API treats
    # most of them as optional — confirm before enabling this model
    # (it is currently commented out in DatadogOptions).
    critical: float
    critical_recovery: float
    ok: float
    warning: float
    warning_recovery: float
    unknown: float


class EvaluationWindow(BaseModel):
    # Start offsets for cumulative evaluation windows
    # (Datadog monitor "scheduling_options.evaluation_window").
    day_starts: str  # time of day the daily window starts — presumably "HH:MM"; verify
    hour_starts: int  # minute of the hour the hourly window starts
    month_starts: int  # day of the month the monthly window starts


class SchedulingOptions(BaseModel):
    # Wrapper for monitor scheduling configuration; currently only the
    # cumulative evaluation window is modeled.
    evaluation_window: EvaluationWindow


class ThresholdWindows(BaseModel):
    # Datadog "options.threshold_windows" — used by anomaly monitors to
    # control how long values must be anomalous/normal before the monitor
    # triggers or recovers.
    recovery_window: str
    trigger_window: str


class DatadogOptions(BaseModel):
    # Mirrors the "options" object of the Datadog "create monitor" API.
    enable_logs_sample: bool
    enable_samples: bool
    escalation_message: str
    evaluation_delay: int
    group_retention_duration: str
    # BUG FIX: was misspelled "grouby_simple_monitor"; the Datadog monitor
    # API field is "groupby_simple_monitor".
    groupby_simple_monitor: bool
    include_tags: bool
    locked: bool
    min_failure_duration: int
    min_location_failed: int
    new_group_delay: int
    new_host_delay: int
    no_data_timeframe: int
    notification_preset_name: Literal[
        "show_all", "hide_query", "hide_handles", "hide_all"
    ]
    notify_audit: bool
    notify_by: list[str]
    notify_no_data: bool
    on_missing_data: Literal[
        "default", "show_no_data", "show_and_notify_no_data", "resolve"
    ]
    renotify_interval: int
    renotify_occurrences: int
    renotify_statuses: list[str]
    require_full_window: bool
    # BUG FIX: was misspelled "cheduling_options"; the Datadog monitor API
    # field is "scheduling_options".
    scheduling_options: SchedulingOptions
    silenced: dict
    threshold_windows: ThresholdWindows
    # thresholds: Thresholds
    timeout_h: int


class DatadogAlertFormatDescription(BaseModel):
    # Describes the JSON payload expected by the Datadog "create monitor" API;
    # exposed via DatadogProvider.get_alert_format_description().
    message: str = Field(
        ..., description="A message to include with notifications for this monitor."
    )
    name: str = Field(..., description="The name of the monitor.")
    options: DatadogOptions
    # BUG FIX: pydantic numeric constraints are spelled ge/le — the original
    # min=1/max=5 were stored as unvalidated schema extras, so out-of-range
    # priorities passed validation.
    priority: int = Field(..., description="The priority of the monitor.", ge=1, le=5)
    # `...` already marks the field required; the original required=True was a
    # redundant (and unvalidated) extra.
    query: str = Field(..., description="The query to monitor.")
    tags: list[str]
    type: Literal[
        "composite",
        "event alert",
        "log alert",
        "metric alert",
        "process alert",
        "query alert",
        "rum alert",
        "service check",
        "synthetics alert",
        "trace-analytics alert",
        "slo alert",
        "event-v2 alert",
        "audit alert",
        "ci-pipelines alert",
        "ci-tests alert",
        "error-tracking alert",
    ]

    class Config:
        # Example payload surfaced in the generated JSON schema.
        schema_extra = {
            "example": {
                "name": "Example-Monitor",
                "type": "rum alert",
                "query": 'formula("query2 / query1 * 100").last("15m") >= 0.8',
                "message": "some message Notify: @hipchat-channel",
                "tags": ["test:examplemonitor", "env:ci"],
                "priority": 3,
                "options": {
                    "thresholds": {"critical": 0.8},
                    "variables": [
                        {
                            "data_source": "rum",
                            "name": "query2",
                            "search": {"query": ""},
                            "indexes": ["*"],
                            "compute": {"aggregation": "count"},
                            "group_by": [],
                        },
                        {
                            "data_source": "rum",
                            "name": "query1",
                            "search": {"query": "status:error"},
                            "indexes": ["*"],
                            "compute": {"aggregation": "count"},
                            "group_by": [],
                        },
                    ],
                },
            }
        }
158 changes: 158 additions & 0 deletions keep/providers/datadog_provider/datadog_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""
Datadog Provider is a class that allows to ingest/digest data from Datadog.
"""
import datetime
import time

import pydantic
from datadog_api_client import ApiClient, Configuration
from datadog_api_client.v1.api.logs_api import LogsApi
from datadog_api_client.v1.api.metrics_api import MetricsApi
from datadog_api_client.v1.api.monitors_api import MonitorsApi
from datadog_api_client.v1.model.monitor import Monitor
from datadog_api_client.v1.model.monitor_type import MonitorType

from keep.providers.base.base_provider import BaseProvider
from keep.providers.datadog_provider.datadog_alert_format_description import (
DatadogAlertFormatDescription,
)
from keep.providers.models.provider_config import ProviderConfig
from keep.providers.providers_factory import ProvidersFactory


@pydantic.dataclasses.dataclass
class DatadogAuthConfig:
    """
    Datadog authentication configuration.
    """

    # Sent as the "apiKeyAuth" header by DatadogProvider.
    api_key: str
    # Sent as the "appKeyAuth" header by DatadogProvider.
    app_key: str


class DatadogProvider(BaseProvider):
    """
    Datadog provider class.

    Queries Datadog logs/metrics and lists/creates monitors through the
    official datadog-api-client (v1 APIs).
    """

    @staticmethod
    def convert_to_seconds(s):
        """Convert a timeframe string such as "30s", "15m", "4h", "2d", "1w"
        to a number of seconds.

        Raises:
            KeyError: if the unit suffix is not one of s/m/h/d/w.
            ValueError: if the leading part is not an integer.
        """
        seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
        return int(s[:-1]) * seconds_per_unit[s[-1]]

    def __init__(self, provider_id: str, config: ProviderConfig):
        super().__init__(provider_id, config)
        self.configuration = Configuration()
        # authentication_config is populated by validate_config(), which the
        # BaseProvider constructor is expected to invoke before we get here.
        self.configuration.api_key["apiKeyAuth"] = self.authentication_config.api_key
        self.configuration.api_key["appKeyAuth"] = self.authentication_config.app_key

    def dispose(self):
        """
        Dispose the provider. Nothing to release: API clients are created
        per call inside `with` blocks.
        """
        pass

    def validate_config(self):
        """
        Validates required configuration for Datadog provider.

        Raises:
            pydantic.ValidationError: if api_key or app_key is missing.
        """
        self.authentication_config = DatadogAuthConfig(**self.config.authentication)

    def query(self, **kwargs):
        """
        Run a logs or metrics query against Datadog.

        Keyword Args:
            query (str): the Datadog query string.
            timeframe (str): lookback window, e.g. "15m", "4h", "1w".
            query_type (str): either "logs" or "metrics".

        Returns:
            The raw Datadog API response for the chosen query type.

        Raises:
            ValueError: for an unsupported query_type.
        """
        query = kwargs.get("query")
        timeframe = kwargs.get("timeframe")
        timeframe_in_seconds = DatadogProvider.convert_to_seconds(timeframe)
        query_type = kwargs.get("query_type")
        if query_type == "logs":
            with ApiClient(self.configuration) as api_client:
                api = LogsApi(api_client)
                results = api.list_logs(
                    body={
                        "query": query,
                        "time": {
                            "_from": datetime.datetime.fromtimestamp(
                                time.time() - timeframe_in_seconds
                            ),
                            "to": datetime.datetime.fromtimestamp(time.time()),
                        },
                    }
                )
        elif query_type == "metrics":
            with ApiClient(self.configuration) as api_client:
                api = MetricsApi(api_client)
                results = api.query_metrics(
                    query=query,
                    # BUG FIX: query_metrics takes POSIX timestamps in seconds
                    # (exactly like the logs branch above). The original
                    # multiplied the timeframe by 1000, pushing "_from" back
                    # 1000x too far.
                    _from=time.time() - timeframe_in_seconds,
                    to=time.time(),
                )
        else:
            # Previously an unknown query_type fell through and crashed with
            # UnboundLocalError on `results`; fail with a clear message.
            raise ValueError(f"Unsupported query_type: {query_type}")
        return results

    def get_alerts(self, alert_id: str | None = None):
        """
        List Datadog monitors as dicts, optionally filtered to one monitor id.
        """
        with ApiClient(self.configuration) as api_client:
            api = MonitorsApi(api_client)
            monitors = api.list_monitors()
            monitors = [monitor.to_dict() for monitor in monitors]
            if alert_id:
                # BUG FIX: Datadog monitor ids are ints while alert_id arrives
                # as a str, so a direct == never matched; compare as strings.
                monitors = [
                    monitor for monitor in monitors if str(monitor["id"]) == str(alert_id)
                ]
        return monitors

    def deploy_alert(self, alert: dict, alert_id: str | None = None):
        """
        Create a Datadog monitor from `alert`
        (see DatadogAlertFormatDescription for the expected shape).
        """
        body = Monitor(**alert)
        with ApiClient(self.configuration) as api_client:
            api_instance = MonitorsApi(api_client)
            try:
                response = api_instance.create_monitor(body=body)
            except Exception as e:
                # Surface Datadog's first validation error when the exception
                # carries a response body; otherwise fall back to str(e)
                # (the original assumed e.body["errors"] always exists and
                # could itself crash with AttributeError/KeyError).
                body_errors = getattr(e, "body", None)
                if isinstance(body_errors, dict) and body_errors.get("errors"):
                    message = body_errors["errors"][0]
                else:
                    message = str(e)
                raise Exception({"message": message}) from e
        return response

    @staticmethod
    def get_alert_format_description():
        # JSON schema describing the monitor-creation payload.
        return DatadogAlertFormatDescription.schema()


if __name__ == "__main__":
    # Ad-hoc smoke test: query Datadog logs for keep-github-app errors over
    # the last 4 weeks. Requires DATADOG_API_KEY / DATADOG_APP_KEY env vars.
    # Output debug messages
    import logging

    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])

    # Load environment variables
    import os

    api_key = os.environ.get("DATADOG_API_KEY")
    app_key = os.environ.get("DATADOG_APP_KEY")

    config = {
        "authentication": {"api_key": api_key, "app_key": app_key},
    }
    provider = ProvidersFactory.get_provider(
        provider_id="datadog-keephq", provider_type="datadog", provider_config=config
    )
    results = provider.query(
        query="service:keep-github-app status:error", timeframe="4w", query_type="logs"
    )
    # Example monitor deployment, kept for reference but disabled.
    """
    alerts = provider.deploy_alert(
        {
            "name": "Error Rate Alert",
            "type": "metric alert",
            "query": "sum:myapp.server.errors{service:talboren/simple-crud-service}.as_count().rollup(sum, 600) > 5",
            "message": "The error rate for talboren/simple-crud-service has exceeded 5% in the last 10 minutes. Please investigate immediately",
            "tags": ["service:talboren/simple-crud-service", "severity:critical"],
            "options": {
                "thresholds": {"critical": 5},
                "notify_audit": False,
                "notify_no_data": False,
                "require_full_window": True,
                "timeout_h": 1,
                "silenced": {},
            },
            "restricted_roles": [],
            "priority": 2,
        }
    )
    """
    # BUG FIX: the original printed `alerts`, which is only assigned inside
    # the disabled triple-quoted block above and would raise NameError.
    print(results)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ opsgenie-sdk = "^2.1.5"
psycopg2-binary = "^2.9.5"
starlette-context = "^0.3.6"
nest-asyncio = "^1.5.6"
datadog-api-client = "^2.12.0"


[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 3f1a71f

Please sign in to comment.