From 3f1a71f70ec98f09a10068ed0b64b6c6061920be Mon Sep 17 00:00:00 2001
From: Shahar Glazner
Date: Sun, 21 May 2023 11:18:50 +0300
Subject: [PATCH] feat: add datadog provider (#136)

---
Review notes (text in this section is ignored by `git am`):

This copy of the patch has been amended relative to commit 3f1a71f7. Every
amendment replaces an added line in place, so the hunk sizes and the diffstat
below still hold; the blob ids on the `index` lines describe the original
content, not the amended one.

  * functions: `len()` defaults to an immutable `()` instead of a shared `[]`.
  * iohandler: the token regex alternates on single characters. It accepts
    the same strings as the committed pattern but avoids nested unbounded
    quantifiers, a known source of catastrophic backtracking when a
    parenthesis is left unclosed. Comment wording was corrected as well.
  * alert format: `grouby_simple_monitor` is now `groupby_simple_monitor`,
    `cheduling_options` is now `scheduling_options`, and `priority` uses
    pydantic's `ge`/`le` bounds rather than the unrecognised `min`/`max`.
  * provider: the metrics query passes integer epoch seconds (the window used
    to be multiplied by 1000), `get_alerts` compares ids as strings because
    `alert_id` is annotated `str` while Datadog monitor ids are numeric,
    `deploy_alert` chains the original exception, and the demo prints
    `results` because `alerts` is only assigned inside a string literal.

Not addressed here because the fix would change the hunk sizes: `query()`
leaves `results` unbound for an unknown `query_type`, and
`convert_to_seconds` is called through the class without being declared a
`@staticmethod`.

 keep/functions/__init__.py                    |   2 +-
 keep/iohandler/iohandler.py                   |  21 ++-
 keep/providers/datadog_provider/__init__.py   |   0
 .../datadog_alert_format_description.py       | 123 ++++++++++++++
 .../datadog_provider/datadog_provider.py      | 158 ++++++++++++++++++
 pyproject.toml                                |   1 +
 6 files changed, 300 insertions(+), 5 deletions(-)
 create mode 100644 keep/providers/datadog_provider/__init__.py
 create mode 100644 keep/providers/datadog_provider/datadog_alert_format_description.py
 create mode 100644 keep/providers/datadog_provider/datadog_provider.py

diff --git a/keep/functions/__init__.py b/keep/functions/__init__.py
index 37b0a1d44..d9702ca3c 100644
--- a/keep/functions/__init__.py
+++ b/keep/functions/__init__.py
@@ -19,7 +19,7 @@ def diff(iterable: iter) -> bool:
     return not all(iterable)
 
 
-def len(iterable):
+def len(iterable=()):
     return _len(iterable)
 
 
diff --git a/keep/iohandler/iohandler.py b/keep/iohandler/iohandler.py
index e01d6e5fa..111960f17 100644
--- a/keep/iohandler/iohandler.py
+++ b/keep/iohandler/iohandler.py
@@ -76,7 +76,7 @@ def parse(self, string):
         string = self._render(string)
 
         # Now, extract the token if exists -
-        pattern = r"\bkeep\.\w+\((?:[^()]|\((?:[^()]|)*\))*\)"
+        pattern = r"\bkeep\.\w+\((?:[^()]|\((?:[^()]|\([^()]*\))*\))*\)"
         parsed_string = copy.copy(string)
         matches = re.findall(pattern, parsed_string)
         tokens = list(matches)
@@ -124,6 +124,10 @@ def _parse(tree):
                             try:
                                 # TODO(shahargl): when Keep gonna be self hosted, this will be a security issue!!!
                                 # because the user can run any python code need to find a way to limit the functions that can be used
+
+                                # https://github.com/keephq/keep/issues/138
+                                from dateutil.tz import tzutc
+
                                 _arg = eval(_arg)
                             except ValueError:
                                 pass
@@ -136,9 +140,18 @@ def _parse(tree):
 
         try:
             tree = ast.parse(token)
-        except SyntaxError:
-            # for strings such as "45%\n", we need to escape
-            tree = ast.parse(token.encode("unicode_escape"))
+        except SyntaxError as e:
+            if "unterminated string literal" in str(e):
+                # try to HTML-unescape the string
+                # this happens when libraries such as the datadog api client
+                # HTML-escape the string and then ast.parse fails
+                # https://github.com/keephq/keep/issues/137
+                import html
+
+                tree = ast.parse(html.unescape(token))
+            else:
+                # for strings such as "45%\n", we need to escape
+                tree = ast.parse(token.encode("unicode_escape"))
         return _parse(tree)
 
     def _render(self, key):
diff --git a/keep/providers/datadog_provider/__init__.py b/keep/providers/datadog_provider/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/keep/providers/datadog_provider/datadog_alert_format_description.py b/keep/providers/datadog_provider/datadog_alert_format_description.py
new file mode 100644
index 000000000..c157c17ad
--- /dev/null
+++ b/keep/providers/datadog_provider/datadog_alert_format_description.py
@@ -0,0 +1,123 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class Thresholds(BaseModel):
+    critical: float
+    critical_recovery: float
+    ok: float
+    warning: float
+    warning_recovery: float
+    unknown: float
+
+
+class EvaluationWindow(BaseModel):
+    day_starts: str
+    hour_starts: int
+    month_starts: int
+
+
+class SchedulingOptions(BaseModel):
+    evaluation_window: EvaluationWindow
+
+
+class ThresholdWindows(BaseModel):
+    recovery_window: str
+    trigger_window: str
+
+
+class DatadogOptions(BaseModel):
+    enable_logs_sample: bool
+    enable_samples: bool
+    escalation_message: str
+    evaluation_delay: int
+    group_retention_duration: str
+    groupby_simple_monitor: bool
+    include_tags: bool
+    locked: bool
+    min_failure_duration: int
+    min_location_failed: int
+    new_group_delay: int
+    new_host_delay: int
+    no_data_timeframe: int
+    notification_preset_name: Literal[
+        "show_all", "hide_query", "hide_handles", "hide_all"
+    ]
+    notify_audit: bool
+    notify_by: list[str]
+    notify_no_data: bool
+    on_missing_data: Literal[
+        "default", "show_no_data", "show_and_notify_no_data", "resolve"
+    ]
+    renotify_interval: int
+    renotify_occurrences: int
+    renotify_statuses: list[str]
+    require_full_window: bool
+    scheduling_options: SchedulingOptions
+    silenced: dict
+    threshold_windows: ThresholdWindows
+    # thresholds: Thresholds
+    timeout_h: int
+
+
+class DatadogAlertFormatDescription(BaseModel):
+    message: str = Field(
+        ..., description="A message to include with notifications for this monitor."
+    )
+    name: str = Field(..., description="The name of the monitor.")
+    options: DatadogOptions
+    priority: int = Field(..., description="The priority of the monitor.", ge=1, le=5)
+    query: str = Field(..., description="The query to monitor.", required=True)
+    tags: list[str]
+    type: Literal[
+        "composite",
+        "event alert",
+        "log alert",
+        "metric alert",
+        "process alert",
+        "query alert",
+        "rum alert",
+        "service check",
+        "synthetics alert",
+        "trace-analytics alert",
+        "slo alert",
+        "event-v2 alert",
+        "audit alert",
+        "ci-pipelines alert",
+        "ci-tests alert",
+        "error-tracking alert",
+    ]
+
+    class Config:
+        schema_extra = {
+            "example": {
+                "name": "Example-Monitor",
+                "type": "rum alert",
+                "query": 'formula("query2 / query1 * 100").last("15m") >= 0.8',
+                "message": "some message Notify: @hipchat-channel",
+                "tags": ["test:examplemonitor", "env:ci"],
+                "priority": 3,
+                "options": {
+                    "thresholds": {"critical": 0.8},
+                    "variables": [
+                        {
+                            "data_source": "rum",
+                            "name": "query2",
+                            "search": {"query": ""},
+                            "indexes": ["*"],
+                            "compute": {"aggregation": "count"},
+                            "group_by": [],
+                        },
+                        {
+                            "data_source": "rum",
+                            "name": "query1",
+                            "search": {"query": "status:error"},
+                            "indexes": ["*"],
+                            "compute": {"aggregation": "count"},
+                            "group_by": [],
+                        },
+                    ],
+                },
+            }
+        }
diff --git a/keep/providers/datadog_provider/datadog_provider.py b/keep/providers/datadog_provider/datadog_provider.py
new file mode 100644
index 000000000..f7e2c2bc6
--- /dev/null
+++ b/keep/providers/datadog_provider/datadog_provider.py
@@ -0,0 +1,158 @@
+"""
+Datadog Provider is a class that allows to ingest/digest data from Datadog.
+"""
+import datetime
+import time
+
+import pydantic
+from datadog_api_client import ApiClient, Configuration
+from datadog_api_client.v1.api.logs_api import LogsApi
+from datadog_api_client.v1.api.metrics_api import MetricsApi
+from datadog_api_client.v1.api.monitors_api import MonitorsApi
+from datadog_api_client.v1.model.monitor import Monitor
+from datadog_api_client.v1.model.monitor_type import MonitorType
+
+from keep.providers.base.base_provider import BaseProvider
+from keep.providers.datadog_provider.datadog_alert_format_description import (
+    DatadogAlertFormatDescription,
+)
+from keep.providers.models.provider_config import ProviderConfig
+from keep.providers.providers_factory import ProvidersFactory
+
+
+@pydantic.dataclasses.dataclass
+class DatadogAuthConfig:
+    """
+    Datadog authentication configuration.
+    """
+
+    api_key: str
+    app_key: str
+
+
+class DatadogProvider(BaseProvider):
+    """
+    Datadog provider class.
+    """
+
+    def convert_to_seconds(s):
+        seconds_per_unit = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}
+        return int(s[:-1]) * seconds_per_unit[s[-1]]
+
+    def __init__(self, provider_id: str, config: ProviderConfig):
+        super().__init__(provider_id, config)
+        self.configuration = Configuration()
+        self.configuration.api_key["apiKeyAuth"] = self.authentication_config.api_key
+        self.configuration.api_key["appKeyAuth"] = self.authentication_config.app_key
+
+    def dispose(self):
+        """
+        Dispose the provider.
+        """
+        pass
+
+    def validate_config(self):
+        """
+        Validates required configuration for Datadog provider.
+
+        """
+        self.authentication_config = DatadogAuthConfig(**self.config.authentication)
+
+    def query(self, **kwargs: dict):
+        query = kwargs.get("query")
+        timeframe = kwargs.get("timeframe")
+        timeframe_in_seconds = DatadogProvider.convert_to_seconds(timeframe)
+        query_type = kwargs.get("query_type")
+        if query_type == "logs":
+            with ApiClient(self.configuration) as api_client:
+                api = LogsApi(api_client)
+                results = api.list_logs(
+                    body={
+                        "query": query,
+                        "time": {
+                            "_from": datetime.datetime.fromtimestamp(
+                                time.time() - (timeframe_in_seconds)
+                            ),
+                            "to": datetime.datetime.fromtimestamp(time.time()),
+                        },
+                    }
+                )
+        elif query_type == "metrics":
+            with ApiClient(self.configuration) as api_client:
+                api = MetricsApi(api_client)
+                results = api.query_metrics(
+                    query=query,
+                    _from=int(time.time()) - timeframe_in_seconds,
+                    to=int(time.time()),
+                )
+        return results
+
+    def get_alerts(self, alert_id: str | None = None):
+        with ApiClient(self.configuration) as api_client:
+            api = MonitorsApi(api_client)
+            monitors = api.list_monitors()
+            monitors = [monitor.to_dict() for monitor in monitors]
+            if alert_id:
+                monitors = [
+                    m for m in monitors if str(m["id"]) == str(alert_id)
+                ]
+        return monitors
+
+    def deploy_alert(self, alert: dict, alert_id: str | None = None):
+        body = Monitor(**alert)
+        with ApiClient(self.configuration) as api_client:
+            api_instance = MonitorsApi(api_client)
+            try:
+                response = api_instance.create_monitor(body=body)
+            except Exception as e:
+                raise Exception({"message": e.body["errors"][0]}) from e
+        return response
+
+    @staticmethod
+    def get_alert_format_description():
+        return DatadogAlertFormatDescription.schema()
+
+
+if __name__ == "__main__":
+    # Output debug messages
+    import logging
+
+    logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler()])
+
+    # Load environment variables
+    import os
+
+    api_key = os.environ.get("DATADOG_API_KEY")
+    app_key = os.environ.get("DATADOG_APP_KEY")
+
+    config = {
+        "authentication": {"api_key": api_key, "app_key": app_key},
+    }
+    provider = ProvidersFactory.get_provider(
+        provider_id="datadog-keephq", provider_type="datadog", provider_config=config
+    )
+    results = provider.query(
+        query="service:keep-github-app status:error", timeframe="4w", query_type="logs"
+    )
+    """
+    alerts = provider.deploy_alert(
+        {
+            "name": "Error Rate Alert",
+            "type": "metric alert",
+            "query": "sum:myapp.server.errors{service:talboren/simple-crud-service}.as_count().rollup(sum, 600) > 5",
+            "message": "The error rate for talboren/simple-crud-service has exceeded 5% in the last 10 minutes. Please investigate immediately",
+            "tags": ["service:talboren/simple-crud-service", "severity:critical"],
+            "options": {
+                "thresholds": {"critical": 5},
+                "notify_audit": False,
+                "notify_no_data": False,
+                "require_full_window": True,
+                "timeout_h": 1,
+                "silenced": {},
+            },
+            "restricted_roles": [],
+            "priority": 2,
+        }
+    )
+    """
+    print(results)
diff --git a/pyproject.toml b/pyproject.toml
index efcd9f08f..9cd04c2e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ opsgenie-sdk = "^2.1.5"
 psycopg2-binary = "^2.9.5"
 starlette-context = "^0.3.6"
 nest-asyncio = "^1.5.6"
+datadog-api-client = "^2.12.0"
 
 
 [tool.poetry.group.dev.dependencies]