Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test CI snapshotting #25688

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
cbb59f7
fix(alerts): alert monitoring + trendlines only for absolute
anirudhpillai Oct 17, 2024
646e1f3
order imports
anirudhpillai Oct 17, 2024
3547c7c
mypy
anirudhpillai Oct 17, 2024
4f0d390
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
948a565
Merge branch 'master' of github.com:PostHog/posthog into fix/alert-mo…
anirudhpillai Oct 17, 2024
8d0e89e
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
a82be31
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
f17c00a
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
9a5ff6c
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
0c7cb5b
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
4c281b0
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
26d85e5
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
5ea7b46
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
c0d6f2c
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
a64e448
Update UI snapshots for `chromium` (2)
github-actions[bot] Oct 17, 2024
b26466a
Update UI snapshots for `chromium` (2)
github-actions[bot] Oct 17, 2024
cab028e
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 17, 2024
32b1db7
disable sleep for tests and dev
anirudhpillai Oct 18, 2024
f636e7e
Merge branch 'fix/alert-monitoring-trendlines' of github.com:PostHog/…
anirudhpillai Oct 18, 2024
5e82536
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
fc810ff
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
eebf36c
truncate relative date for snoozing
anirudhpillai Oct 18, 2024
ccaa7ff
Merge branch 'fix/alert-monitoring-trendlines' of github.com:PostHog/…
anirudhpillai Oct 18, 2024
a550f14
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
ae16fff
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
a8c58da
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
42693cc
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
2df6780
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
4c46632
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
3951645
Merge branch 'master' into fix/alert-monitoring-trendlines
anirudhpillai Oct 18, 2024
85369cc
Update UI snapshots for `chromium` (1)
github-actions[bot] Oct 18, 2024
75998a2
improve alert breach message
anirudhpillai Oct 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion frontend/src/lib/components/Alerts/insightAlertsLogic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { loaders } from 'kea-loaders'
import api from 'lib/api'
import { insightVizDataLogic } from 'scenes/insights/insightVizDataLogic'

import { GoalLine, InsightThresholdType } from '~/queries/schema'
import { AlertConditionType, GoalLine, InsightThresholdType } from '~/queries/schema'
import { getBreakdown, isInsightVizNode, isTrendsQuery } from '~/queries/utils'
import { InsightLogicProps } from '~/types'

Expand Down Expand Up @@ -67,6 +67,7 @@ export const insightAlertsLogic = kea<insightAlertsLogicType>([
alerts.flatMap((alert) => {
if (
alert.threshold.configuration.type !== InsightThresholdType.ABSOLUTE ||
alert.condition.type !== AlertConditionType.ABSOLUTE_VALUE ||
!alert.threshold.configuration.bounds
) {
return []
Expand Down
4 changes: 3 additions & 1 deletion posthog/api/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ def update(self, instance, validated_data):
else:
# always store snoozed_until as UTC time
# as we look at current UTC time to check when to run alerts
snoozed_until = relative_date_parse(snoozed_until_param, ZoneInfo("UTC"), increase=True)
snoozed_until = relative_date_parse(
snoozed_until_param, ZoneInfo("UTC"), increase=True, always_truncate=True
)
instance.state = AlertState.SNOOZED
instance.snoozed_until = snoozed_until

Expand Down
39 changes: 36 additions & 3 deletions posthog/tasks/alerts/checks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import math
import time
import traceback

from datetime import datetime, timedelta, UTC
from typing import cast
from dateutil.relativedelta import relativedelta
import traceback

from celery import shared_task
from celery.canvas import chain
from django.conf import settings
from django.db import transaction
import structlog
from sentry_sdk import capture_exception
Expand Down Expand Up @@ -39,7 +43,15 @@
logger = structlog.get_logger(__name__)


class AlertCheckException(Exception): ...
class AlertCheckException(Exception):
"""
Required for custom exceptions to pass stack trace to sentry.
Subclassing through other ways doesn't transfer the traceback.
https://stackoverflow.com/a/69963663/5540417
"""

def __init__(self, err: Exception):
self.__traceback__ = err.__traceback__


HOURLY_ALERTS_BACKLOG_GAUGE = Gauge(
Expand Down Expand Up @@ -102,6 +114,9 @@ def alerts_backlog_task() -> None:

DAILY_ALERTS_BACKLOG_GAUGE.set(daily_alerts_breaching_sla)

# sleeping 30s for prometheus to pick up the metrics sent during task
time.sleep(30)


@shared_task(
ignore_result=True,
Expand Down Expand Up @@ -158,6 +173,8 @@ def check_alert_task(alert_id: str) -> None:


def check_alert(alert_id: str) -> None:
task_start_time = time.time()

try:
alert = AlertConfiguration.objects.get(id=alert_id, enabled=True)
except AlertConfiguration.DoesNotExist:
Expand Down Expand Up @@ -199,8 +216,15 @@ def check_alert(alert_id: str) -> None:
check_alert_and_notify_atomically(alert)
except Exception as err:
ALERT_CHECK_ERROR_COUNTER.inc()

logger.exception(AlertCheckException(err))
capture_exception(AlertCheckException(err))
capture_exception(
AlertCheckException(err),
tags={
"alert_configuration_id": alert_id,
},
)

# raise again so alert check is retried depending on error type
raise
finally:
Expand All @@ -209,6 +233,15 @@ def check_alert(alert_id: str) -> None:
alert.is_calculating = False
alert.save()

# only in PROD
if not settings.DEBUG and not settings.TEST:
task_duration = time.time() - task_start_time

# Ensure task runs at least 40s
# for prometheus to pick up the metrics sent during task
time_left_to_run = 40 - math.floor(task_duration)
time.sleep(time_left_to_run)


@transaction.atomic
def check_alert_and_notify_atomically(alert: AlertConfiguration) -> None:
Expand Down
61 changes: 13 additions & 48 deletions posthog/tasks/alerts/test/test_trends_absolute_alerts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import Optional, Any
from unittest.mock import MagicMock, patch
from unittest.mock import ANY, MagicMock, patch
import dateutil


from freezegun import freeze_time

from posthog.models.alert import AlertCheck
Expand All @@ -29,7 +28,7 @@
FROZEN_TIME = dateutil.parser.parse("2024-06-02T08:55:00.000Z")


@freeze_time("2024-06-02T08:55:00.000Z")
@freeze_time(FROZEN_TIME)
@patch("posthog.tasks.alerts.checks.send_notifications_for_errors")
@patch("posthog.tasks.alerts.checks.send_notifications_for_breaches")
class TestTimeSeriesTrendsAbsoluteAlerts(APIBaseTest, ClickhouseDestroyTablesMixin):
Expand Down Expand Up @@ -97,7 +96,7 @@ def create_time_series_trend_insight(self, breakdown: Optional[BreakdownFilter]

return insight

def test_alert_properties(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
def test_alert_lower_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=1)

Expand All @@ -119,11 +118,15 @@ def test_alert_properties(self, mock_send_breaches: MagicMock, mock_send_errors:
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week is (0) less than lower threshold (1.0)"]
)

def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, upper=1)

with freeze_time("2024-06-02T07:55:00.000Z"):
with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -149,11 +152,15 @@ def test_trend_high_threshold_breached(self, mock_send_breaches: MagicMock, mock
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week is (2) more than upper threshold (1.0)"]
)

def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock) -> None:
insight = self.create_time_series_trend_insight()
alert = self.create_alert(insight, series_index=0, lower=0, upper=2)

with freeze_time("2024-06-02T07:55:00.000Z"):
with freeze_time(FROZEN_TIME - dateutil.relativedelta.relativedelta(days=1)):
_create_event(
team=self.team,
event="signed_up",
Expand All @@ -172,45 +179,3 @@ def test_trend_no_threshold_breached(self, mock_send_breaches: MagicMock, mock_s
assert alert_check.calculated_value == 1
assert alert_check.state == AlertState.NOT_FIRING
assert alert_check.error is None

# TODO: support breakdowns
def test_trend_with_single_breakdown_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
insight = self.create_time_series_trend_insight(
breakdown=BreakdownFilter(breakdown_type="event", breakdown="$browser")
)
alert = self.create_alert(insight, series_index=0, lower=0, upper=1)

with freeze_time("2024-06-02T07:55:00.000Z"):
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Chrome"},
)
_create_event(
team=self.team,
event="signed_up",
distinct_id="2",
properties={"$browser": "Chrome"},
)
_create_event(
team=self.team,
event="signed_up",
distinct_id="1",
properties={"$browser": "Firefox"},
)
flush_persons_and_events()

check_alert(alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=alert["id"])
assert updated_alert.state == AlertState.FIRING
assert updated_alert.next_check_at == FROZEN_TIME + dateutil.relativedelta.relativedelta(days=1)

alert_check = AlertCheck.objects.filter(alert_configuration=alert["id"]).latest("created_at")
# calculated value should only be from browser = Chrome
assert alert_check.calculated_value == 2
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None
30 changes: 29 additions & 1 deletion posthog/tasks/alerts/test/test_trends_relative_alerts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Optional, Any
from unittest.mock import MagicMock, patch
from unittest.mock import ANY, MagicMock, patch
import dateutil


Expand Down Expand Up @@ -184,6 +184,10 @@ def test_relative_increase_absolute_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week increased (2) more than upper threshold (1.0)"]
)

def test_relative_increase_upper_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -341,6 +345,10 @@ def test_relative_increase_lower_threshold_breached_1(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week increased (-1) less than lower threshold (2.0)"]
)

# check percentage alert
check_alert(percentage_alert["id"])

Expand All @@ -354,6 +362,10 @@ def test_relative_increase_lower_threshold_breached_1(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week increased (-50.00%) less than lower threshold (50.00%)"]
)

def test_relative_increase_lower_threshold_breached_2(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -511,6 +523,10 @@ def test_relative_decrease_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week decreased (2) more than upper threshold (1.0)"]
)

check_alert(percentage_alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=percentage_alert["id"])
Expand All @@ -523,6 +539,10 @@ def test_relative_decrease_upper_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week decreased (66.67%) more than upper threshold (20.00%)"]
)

def test_relative_decrease_lower_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down Expand Up @@ -592,6 +612,10 @@ def test_relative_decrease_lower_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_once_with(
ANY, ["The insight value for previous week decreased (1) less than lower threshold (2.0)"]
)

check_alert(percentage_alert["id"])

updated_alert = AlertConfiguration.objects.get(pk=percentage_alert["id"])
Expand All @@ -604,6 +628,10 @@ def test_relative_decrease_lower_threshold_breached(
assert alert_check.state == AlertState.FIRING
assert alert_check.error is None

mock_send_breaches.assert_called_with(
ANY, ["The insight value for previous week decreased (50.00%) less than lower threshold (80.00%)"]
)

def test_relative_increase_no_threshold_breached(
self, mock_send_breaches: MagicMock, mock_send_errors: MagicMock
) -> None:
Expand Down
42 changes: 31 additions & 11 deletions posthog/tasks/alerts/trends.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
# want value for current interval (last hour, last day, last week, last month)
# depending on the alert calculation interval
if _is_non_time_series_trend(query):
filters_override = _date_range_override_for_intervals(query)
filters_override = _date_range_override_for_intervals(query, last_x_intervals=2)
else:
# for non time series, it's an aggregated value for full interval
# so we need to compute full insight
Expand All @@ -79,10 +79,12 @@
if not calculation_result.result:
raise RuntimeError(f"No results found for insight with alert id = {alert.id}")

current_interval_value = _pick_interval_value_from_trend_result(config, query, calculation_result)
breaches = _validate_bounds(threshold.bounds, current_interval_value)
prev_interval_value = _pick_interval_value_from_trend_result(config, query, calculation_result, -1)
breaches = _validate_bounds(
threshold.bounds, prev_interval_value, threshold.type, condition.type, query.interval

Check failure on line 84 in posthog/tasks/alerts/trends.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 5 to "_validate_bounds" has incompatible type "IntervalType | None"; expected "IntervalType"
)

return AlertEvaluationResult(value=current_interval_value, breaches=breaches)
return AlertEvaluationResult(value=prev_interval_value, breaches=breaches)

case AlertConditionType.RELATIVE_INCREASE:
if _is_non_time_series_trend(query):
Expand All @@ -107,10 +109,10 @@

if threshold.type == InsightThresholdType.ABSOLUTE:
increase = prev_interval_value - prev_prev_interval_value
breaches = _validate_bounds(threshold.bounds, increase)
breaches = _validate_bounds(threshold.bounds, increase, threshold.type, condition.type, query.interval)

Check failure on line 112 in posthog/tasks/alerts/trends.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 5 to "_validate_bounds" has incompatible type "IntervalType | None"; expected "IntervalType"
elif threshold.type == InsightThresholdType.PERCENTAGE:
increase = (prev_interval_value - prev_prev_interval_value) / prev_prev_interval_value
breaches = _validate_bounds(threshold.bounds, increase, is_percentage=True)
breaches = _validate_bounds(threshold.bounds, increase, threshold.type, condition.type, query.interval)

Check failure on line 115 in posthog/tasks/alerts/trends.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 5 to "_validate_bounds" has incompatible type "IntervalType | None"; expected "IntervalType"
else:
raise ValueError(
f"Neither relative nor absolute threshold configured for alert condition RELATIVE_INCREASE"
Expand Down Expand Up @@ -141,10 +143,10 @@

if threshold.type == InsightThresholdType.ABSOLUTE:
decrease = prev_prev_interval_value - prev_interval_value
breaches = _validate_bounds(threshold.bounds, decrease)
breaches = _validate_bounds(threshold.bounds, decrease, threshold.type, condition.type, query.interval)

Check failure on line 146 in posthog/tasks/alerts/trends.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 5 to "_validate_bounds" has incompatible type "IntervalType | None"; expected "IntervalType"
elif threshold.type == InsightThresholdType.PERCENTAGE:
decrease = (prev_prev_interval_value - prev_interval_value) / prev_prev_interval_value
breaches = _validate_bounds(threshold.bounds, decrease, is_percentage=True)
breaches = _validate_bounds(threshold.bounds, decrease, threshold.type, condition.type, query.interval)

Check failure on line 149 in posthog/tasks/alerts/trends.py

View workflow job for this annotation

GitHub Actions / Python code quality checks

Argument 5 to "_validate_bounds" has incompatible type "IntervalType | None"; expected "IntervalType"
else:
raise ValueError(
f"Neither relative nor absolute threshold configured for alert condition RELATIVE_INCREASE"
Expand Down Expand Up @@ -202,18 +204,36 @@


def _validate_bounds(
bounds: InsightsThresholdBounds | None, calculated_value: float, is_percentage: bool = False
bounds: InsightsThresholdBounds | None,
calculated_value: float,
threshold_type: InsightThresholdType,
condition_type: AlertConditionType,
interval_type: IntervalType,
) -> list[str]:
if not bounds:
return []

is_percentage = threshold_type == InsightThresholdType.PERCENTAGE

formatted_value = f"{calculated_value:.2%}" if is_percentage else calculated_value

match condition_type:
case AlertConditionType.ABSOLUTE_VALUE:
condition_text = "is"
case AlertConditionType.RELATIVE_INCREASE:
condition_text = "increased"
case AlertConditionType.RELATIVE_DECREASE:
condition_text = "decreased"

if bounds.lower is not None and calculated_value < bounds.lower:
lower_value = f"{bounds.lower:.2%}" if is_percentage else bounds.lower
return [f"The trend value ({formatted_value}) is below the lower threshold ({lower_value})"]
return [
f"The insight value for previous {interval_type} {condition_text} ({formatted_value}) less than lower threshold ({lower_value})"
]
if bounds.upper is not None and calculated_value > bounds.upper:
upper_value = f"{bounds.upper:.2%}" if is_percentage else bounds.upper
return [f"The trend value ({formatted_value}) is above the upper threshold ({upper_value})"]
return [
f"The insight value for previous {interval_type} {condition_text} ({formatted_value}) more than upper threshold ({upper_value})"
]

return []
Loading