ref(alerts): Update rule status on dynamic alert rule update #78143

Closed · wants to merge 7 commits
19 changes: 17 additions & 2 deletions src/sentry/incidents/logic.py
@@ -908,7 +908,7 @@ def update_alert_rule(
elif detection_type == AlertRuleDetectionType.DYNAMIC:
# NOTE: we set seasonality for EA
updated_query_fields["resolution"] = timedelta(
minutes=time_window if time_window is not None else snuba_query.time_window
seconds=time_window if time_window is not None else snuba_query.time_window
)
updated_fields["seasonality"] = AlertRuleSeasonality.AUTO
updated_fields["comparison_delta"] = None
@@ -936,22 +936,37 @@ def update_alert_rule(
raise ResourceDoesNotExist(
"Your organization does not have access to this feature."
)

if updated_fields.get("detection_type") == AlertRuleDetectionType.DYNAMIC and (
alert_rule.detection_type != AlertRuleDetectionType.DYNAMIC or query or aggregate
):
for k, v in updated_fields.items():
setattr(alert_rule, k, v)

for k, v in updated_query_fields.items():
Member Author: Any snuba query updates were previously not getting passed to the Seer code until this was added.

Contributor: What are the cases in which we'll execute this code?

    if updated_fields.get("detection_type") == AlertRuleDetectionType.DYNAMIC and (
        alert_rule.detection_type != AlertRuleDetectionType.DYNAMIC or query or aggregate
    ):

It feels a little confusing to me: if it's a dynamic alert, and it's not a dynamic alert, or query, or aggregate? Could we simplify this check at all, or maybe nest the conditions for clarity? If we can nest it, should we merge this code: https://github.com/getsentry/sentry/pull/78143/files#diff-54ef12aa467e9e032fcdeb5d98079424d01fb28db982938d9e8db849ae78dd5eR1012 into that check?

I'm trying to think about how this method is quite large and about ways we can decompose it into smaller methods. It seems like we could use the alert type as a switch statement and handle different chunks accordingly. (FWIW, I think decomposing this whole method is out of scope, but we should try to limit adding to it and slowly improve it over time.)

Member Author: That check is saying "if the rule type is changing to become a dynamic type, OR it's already dynamic and the update changes either the query or the aggregate", then we need to refetch the historical data and send it to Seer, because either it wasn't a dynamic type before (so Seer wouldn't have that data) or the snuba query has changed, so the historical data has changed.
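
As a rough illustration of the nesting suggestion above, here is a minimal sketch (not code from this PR; the intermediate variable names are invented) of an equivalent, decomposed form of that check:

    # Sketch only: an equivalent nested form of the condition discussed above.
    if updated_fields.get("detection_type") == AlertRuleDetectionType.DYNAMIC:
        # The rule was not dynamic before, so Seer has no historical data for it yet.
        becoming_dynamic = alert_rule.detection_type != AlertRuleDetectionType.DYNAMIC
        # The snuba query is changing, so any data Seer already has is stale.
        query_changed = bool(query) or bool(aggregate)
        if becoming_dynamic or query_changed:
            ...  # refetch the historical data and send it to Seer before saving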

if k == "dataset":
v = v.value
elif k == "time_window":
v = time_window if time_window else snuba_query.time_window
elif k == "event_types":
continue
setattr(alert_rule.snuba_query, k, v)

try:
# NOTE: if adding a new metric alert type, take care to check that it's handled here
rule_status = send_historical_data_to_seer(
alert_rule=alert_rule,
project=projects[0] if projects else alert_rule.projects.get(),
snuba_query=alert_rule.snuba_query,
Member Author: Since we fetch the snuba query from the DB in send_historical_data_to_seer, it was missing the updates unless I passed it in like this.

event_types=updated_query_fields.get("event_types"),
Member Author: event_types is in another table and is updated slightly differently (see here), so I had to pass it in directly for updates.

)
if rule_status == AlertRuleStatus.NOT_ENOUGH_DATA:
# if we don't have at least seven days worth of data, then the dynamic alert won't fire
alert_rule.update(status=AlertRuleStatus.NOT_ENOUGH_DATA.value)
elif (
rule_status == AlertRuleStatus.PENDING
and alert_rule.status != AlertRuleStatus.PENDING
):
alert_rule.update(status=AlertRuleStatus.PENDING.value)
except (TimeoutError, MaxRetryError):
raise TimeoutError("Failed to send data to Seer - cannot update alert rule.")
except ParseError:
27 changes: 19 additions & 8 deletions src/sentry/seer/anomaly_detection/store_data.py
@@ -21,11 +21,11 @@
from sentry.seer.anomaly_detection.utils import (
fetch_historical_data,
format_historical_data,
get_dataset_from_label,
translate_direction,
)
from sentry.seer.signed_seer_api import make_signed_seer_api_request
from sentry.snuba.models import SnubaQuery
from sentry.snuba.utils import get_dataset
from sentry.snuba.models import SnubaQuery, SnubaQueryEventType
from sentry.utils import json
from sentry.utils.json import JSONDecodeError

@@ -35,7 +35,7 @@
settings.SEER_ANOMALY_DETECTION_URL,
timeout=settings.SEER_ANOMALY_DETECTION_TIMEOUT,
)
NUM_DAYS = 28
MIN_DAYS = 7


def _get_start_and_end_indices(data: list[TimeSeriesPoint]) -> tuple[int, int]:
@@ -57,16 +57,28 @@ def _get_start_and_end_indices(data: list[TimeSeriesPoint]) -> tuple[int, int]:
return start, end


def send_historical_data_to_seer(alert_rule: AlertRule, project: Project) -> AlertRuleStatus:
def send_historical_data_to_seer(
alert_rule: AlertRule,
project: Project,
snuba_query=None,
event_types: list[SnubaQueryEventType.EventType] | None = None,
) -> AlertRuleStatus:
"""
Get 28 days of historical data and pass it to Seer to be used for prediction anomalies on the alert.
"""
snuba_query = SnubaQuery.objects.get(id=alert_rule.snuba_query_id)
if not snuba_query:
snuba_query = SnubaQuery.objects.get(id=alert_rule.snuba_query_id)
window_min = int(snuba_query.time_window / 60)
dataset = get_dataset(snuba_query.dataset)
dataset = get_dataset_from_label(snuba_query.dataset)
query_columns = get_query_columns([snuba_query.aggregate], snuba_query.time_window)
if not event_types:
event_types = snuba_query.event_types
Contributor: Is snuba_query.event_types always set? (We may want to provide a default if this is required, or throw an exception if it's a required field.)

Member Author: Yes, a snuba query will always have event types on it, but I changed this function to accept both snuba_query and event_types because this is happening during an alert rule update. To avoid a bad state, we don't want to save the changes until we've successfully sent the data to Seer, so I can't pull the snuba query off of the alert rule (it may have changed), and the event types on the snuba query get updated separately.
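
To make that ordering concrete, here is a simplified sketch (condensed from the hunks above, not verbatim PR code) of the flow in update_alert_rule: the rule and its snuba query are mutated in memory only, Seer is called with that unsaved state, and the changes are persisted afterwards.

    # Simplified sketch of the update flow; error handling and the per-field
    # special-casing (dataset, time_window, event_types) are omitted.
    for k, v in updated_fields.items():
        setattr(alert_rule, k, v)  # in memory only, nothing saved yet
    for k, v in updated_query_fields.items():
        setattr(alert_rule.snuba_query, k, v)  # in memory only

    rule_status = send_historical_data_to_seer(
        alert_rule=alert_rule,
        project=projects[0] if projects else alert_rule.projects.get(),
        # Pass the unsaved query and the new event types explicitly, since
        # neither has been written to the database yet.
        snuba_query=alert_rule.snuba_query,
        event_types=updated_query_fields.get("event_types"),
    )
    # Only if the Seer call succeeds does the rest of update_alert_rule go on to
    # save the alert rule and its snuba query, avoiding a half-updated state.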

historical_data = fetch_historical_data(
alert_rule=alert_rule, snuba_query=snuba_query, query_columns=query_columns, project=project
alert_rule=alert_rule,
snuba_query=snuba_query,
query_columns=query_columns,
project=project,
event_types=event_types,
)

if not historical_data:
@@ -182,7 +194,6 @@ def send_historical_data_to_seer(alert_rule: AlertRule, project: Project) -> AlertRuleStatus:
)
raise Exception(message)

MIN_DAYS = 7
data_start_index, data_end_index = _get_start_and_end_indices(formatted_data)
if data_start_index == -1:
return AlertRuleStatus.NOT_ENOUGH_DATA
57 changes: 36 additions & 21 deletions src/sentry/seer/anomaly_detection/utils.py
@@ -3,6 +3,7 @@

from django.utils import timezone
from django.utils.datastructures import MultiValueDict
from rest_framework.exceptions import ParseError

from sentry import release_health
from sentry.api.bases.organization_events import resolve_axis_column
@@ -17,11 +18,17 @@
from sentry.snuba.models import SnubaQuery, SnubaQueryEventType
from sentry.snuba.referrer import Referrer
from sentry.snuba.sessions_v2 import QueryDefinition
from sentry.snuba.utils import get_dataset
from sentry.snuba.utils import DATASET_OPTIONS, get_dataset
from sentry.utils.snuba import SnubaTSResult

NUM_DAYS = 28

SNUBA_QUERY_EVENT_TYPE_TO_STRING = {
SnubaQueryEventType.EventType.ERROR: "error",
SnubaQueryEventType.EventType.DEFAULT: "default",
SnubaQueryEventType.EventType.TRANSACTION: "transaction",
}


def translate_direction(direction: int) -> str:
"""
@@ -35,27 +42,26 @@ def translate_direction(direction: int) -> str:
return direction_map[AlertRuleThresholdType(direction)]


def get_snuba_query_string(snuba_query: SnubaQuery) -> str:
def get_snuba_query_string(
snuba_query: SnubaQuery, event_types: list[SnubaQueryEventType.EventType] | None = None
) -> str:
"""
Generate a query string that matches what the OrganizationEventsStatsEndpoint does
"""
SNUBA_QUERY_EVENT_TYPE_TO_STRING = {
SnubaQueryEventType.EventType.ERROR: "error",
SnubaQueryEventType.EventType.DEFAULT: "default",
SnubaQueryEventType.EventType.TRANSACTION: "transaction",
}
if not event_types:
event_types = snuba_query.event_types or []

if len(snuba_query.event_types) > 1:
# e.g. (is:unresolved) AND (event.type:[error, default])
if len(event_types) > 1:
# e.g. '(is:unresolved) AND (event.type:[error, default])'
event_types_list = [
SNUBA_QUERY_EVENT_TYPE_TO_STRING[event_type] for event_type in snuba_query.event_types
SNUBA_QUERY_EVENT_TYPE_TO_STRING[event_type] for event_type in event_types
]
event_types_string = "(event.type:["
event_types_string += ", ".join(event_types_list)
event_types_string += "])"
else:
# e.g. (is:unresolved) AND (event.type:error)
snuba_query_event_type_string = SNUBA_QUERY_EVENT_TYPE_TO_STRING[snuba_query.event_types[0]]
# e.g. '(is:unresolved) AND (event.type:error)'
snuba_query_event_type_string = SNUBA_QUERY_EVENT_TYPE_TO_STRING[event_types[0]]
event_types_string = f"(event.type:{snuba_query_event_type_string})"
if snuba_query.query:
snuba_query_string = f"({snuba_query.query}) AND {event_types_string}"
@@ -160,13 +166,27 @@ def format_historical_data(
return format_snuba_ts_data(data, query_columns, organization)


def get_dataset_from_label(dataset_label: str):
if dataset_label == "events":
# DATASET_OPTIONS expects the name 'errors'
dataset_label = "errors"
elif dataset_label in ["generic_metrics", "transactions"]:
# XXX: performance alerts dataset differs locally vs in prod
dataset_label = "discover"
dataset = get_dataset(dataset_label)
if dataset is None:
raise ParseError(detail=f"dataset must be one of: {', '.join(DATASET_OPTIONS.keys())}")
return dataset


def fetch_historical_data(
alert_rule: AlertRule,
snuba_query: SnubaQuery,
query_columns: list[str],
project: Project,
start: datetime | None = None,
end: datetime | None = None,
event_types: list[SnubaQueryEventType.EventType] | None = None,
) -> SnubaTSResult | None:
"""
Fetch 28 days of historical data from Snuba to pass to Seer to build the anomaly detection model
@@ -183,14 +203,7 @@ def fetch_historical_data(
granularity = snuba_query.time_window

dataset_label = snuba_query.dataset

if dataset_label == "events":
# DATASET_OPTIONS expects the name 'errors'
dataset_label = "errors"
elif dataset_label in ["generic_metrics", "transactions"]:
# XXX: performance alerts dataset differs locally vs in prod
dataset_label = "discover"
dataset = get_dataset(dataset_label)
dataset = get_dataset_from_label(dataset_label)

if not project or not dataset or not alert_rule.organization:
return None
@@ -213,7 +226,9 @@
start, end, project, alert_rule.organization, granularity
)
else:
snuba_query_string = get_snuba_query_string(snuba_query)
if not event_types:
event_types = snuba_query.event_types
snuba_query_string = get_snuba_query_string(snuba_query, event_types)
historical_data = dataset.timeseries_query(
selected_columns=query_columns,
query=snuba_query_string,
124 changes: 123 additions & 1 deletion tests/sentry/incidents/test_logic.py
@@ -101,7 +101,7 @@
from sentry.snuba.dataset import Dataset
from sentry.snuba.models import QuerySubscription, SnubaQuery, SnubaQueryEventType
from sentry.testutils.cases import BaseIncidentsTest, BaseMetricsTestCase, TestCase
from sentry.testutils.helpers.datetime import before_now, freeze_time
from sentry.testutils.helpers.datetime import before_now, freeze_time, iso_format
from sentry.testutils.helpers.features import with_feature
from sentry.testutils.helpers.options import override_options
from sentry.testutils.silo import assume_test_silo_mode, assume_test_silo_mode_of
@@ -1751,6 +1751,119 @@ def test_update_alert_rule_static_to_dynamic_enough_data(self, mock_seer_request):
assert mock_seer_request.call_count == 1
assert alert_rule.status == AlertRuleStatus.PENDING.value

@with_feature("organizations:anomaly-detection-alerts")
@patch(
"sentry.seer.anomaly_detection.store_data.seer_anomaly_detection_connection_pool.urlopen"
)
def test_update_dynamic_alert_not_enough_to_pending(self, mock_seer_request):
Contributor: Note to self: this test is freezing time; it's just pretty hidden in this diff.

"""
Update a dynamic rule's aggregate so the rule's status changes from not enough data to enough/pending
"""
seer_return_value: StoreDataResponse = {"success": True}
mock_seer_request.return_value = HTTPResponse(orjson.dumps(seer_return_value), status=200)

dynamic_rule = self.create_alert_rule(
sensitivity=AlertRuleSensitivity.HIGH,
seasonality=AlertRuleSeasonality.AUTO,
time_window=60,
detection_type=AlertRuleDetectionType.DYNAMIC,
)
assert mock_seer_request.call_count == 1
assert dynamic_rule.status == AlertRuleStatus.NOT_ENOUGH_DATA.value
mock_seer_request.reset_mock()

two_weeks_ago = before_now(days=14).replace(hour=10, minute=0, second=0, microsecond=0)
with self.options({"issues.group_attributes.send_kafka": True}):
self.store_event(
data={
"event_id": "a" * 32,
"message": "super bad",
"timestamp": iso_format(two_weeks_ago + timedelta(minutes=1)),
"tags": {"sentry:user": self.user.email},
"exception": [{"value": "BadError"}],
},
project_id=self.project.id,
)
self.store_event(
data={
"event_id": "a" * 32,
"message": "super bad",
"timestamp": iso_format(two_weeks_ago + timedelta(days=10)), # 4 days ago
"tags": {"sentry:user": self.user.email},
"exception": [{"value": "BadError"}],
},
project_id=self.project.id,
)
# update aggregate
update_alert_rule(
dynamic_rule,
aggregate="count_unique(user)",
time_window=60,
detection_type=AlertRuleDetectionType.DYNAMIC,
)
assert mock_seer_request.call_count == 1
assert dynamic_rule.status == AlertRuleStatus.PENDING.value

@with_feature("organizations:anomaly-detection-alerts")
@patch(
"sentry.seer.anomaly_detection.store_data.seer_anomaly_detection_connection_pool.urlopen"
)
def test_update_dynamic_alert_pending_to_not_enough(self, mock_seer_request):
"""
Update a dynamic rule's aggregate so the rule's status changes from enough/pending to not enough data
"""
seer_return_value: StoreDataResponse = {"success": True}
mock_seer_request.return_value = HTTPResponse(orjson.dumps(seer_return_value), status=200)

two_weeks_ago = before_now(days=14).replace(hour=10, minute=0, second=0, microsecond=0)
with self.options({"issues.group_attributes.send_kafka": True}):
self.store_event(
data={
"event_id": "b" * 32,
"message": "super bad",
"timestamp": iso_format(two_weeks_ago + timedelta(minutes=1)),
"fingerprint": ["group2"],
"tags": {"sentry:user": self.user.email},
"exception": [{"value": "BadError"}],
},
project_id=self.project.id,
)
self.store_event(
data={
"event_id": "b" * 32,
"message": "super bad",
"timestamp": iso_format(two_weeks_ago + timedelta(days=10)), # 4 days ago
"fingerprint": ["group2"],
"tags": {"sentry:user": self.user.email},
"exception": [{"value": "BadError"}],
},
project_id=self.project.id,
)

dynamic_rule = self.create_alert_rule(
sensitivity=AlertRuleSensitivity.HIGH,
seasonality=AlertRuleSeasonality.AUTO,
time_window=60,
detection_type=AlertRuleDetectionType.DYNAMIC,
)
assert mock_seer_request.call_count == 1
assert dynamic_rule.status == AlertRuleStatus.PENDING.value

mock_seer_request.reset_mock()

# update aggregate
update_alert_rule(
dynamic_rule,
aggregate="p95(measurements.fid)", # first input delay data we don't have stored
dataset=Dataset.Transactions,
event_types=[SnubaQueryEventType.EventType.TRANSACTION],
query="",
# time_window=60,
detection_type=AlertRuleDetectionType.DYNAMIC,
)
assert mock_seer_request.call_count == 1
assert dynamic_rule.status == AlertRuleStatus.NOT_ENOUGH_DATA.value

@with_feature("organizations:anomaly-detection-alerts")
@patch(
"sentry.seer.anomaly_detection.store_data.seer_anomaly_detection_connection_pool.urlopen"
@@ -1830,6 +1943,7 @@ def test_update_alert_rule_anomaly_detection_seer_timeout_max_retry(
seasonality=AlertRuleSeasonality.AUTO,
time_window=60,
detection_type=AlertRuleDetectionType.DYNAMIC,
name="my rule",
)
assert mock_seer_request.call_count == 1
mock_seer_request.reset_mock()
@@ -1845,6 +1959,7 @@
detection_type=AlertRuleDetectionType.DYNAMIC,
sensitivity=AlertRuleSensitivity.HIGH,
seasonality=AlertRuleSeasonality.AUTO,
name="your rule",
)

assert mock_logger.warning.call_count == 1
@@ -1865,10 +1980,17 @@
detection_type=AlertRuleDetectionType.DYNAMIC,
sensitivity=AlertRuleSensitivity.HIGH,
seasonality=AlertRuleSeasonality.AUTO,
name="hellboy's rule",
)

assert mock_logger.warning.call_count == 1
assert mock_seer_request.call_count == 1
# make sure the rule wasn't updated - need to refetch
fresh_dynamic_rule = AlertRule.objects.get(id=dynamic_rule.id)
assert fresh_dynamic_rule.name == "my rule"
assert fresh_dynamic_rule.snuba_query
assert fresh_dynamic_rule.snuba_query.time_window == 60 * 60
assert fresh_dynamic_rule.snuba_query.query == "level:error"

@with_feature("organizations:anomaly-detection-alerts")
@patch(