Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/sentry/tasks/seer/night_shift/agentic_triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
fixability_score_strategy,
priority_label,
)
from sentry.tasks.seer.night_shift.skip_cache import mark_skipped
from sentry.tasks.seer.night_shift.triage_tools import (
get_event_details_agentic_triage,
get_issue_details_agentic_triage,
Expand Down Expand Up @@ -191,6 +192,10 @@ def _triage_candidates(
},
)

for v in triage_response.verdicts:
if v.group_id in groups_by_id and v.action == TriageAction.SKIP:
mark_skipped(v.group_id)
Comment on lines +195 to +197
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: A Redis connection failure in mark_skipped after the main agent logic will cause an unhandled exception, discarding all previously computed triage results.
Severity: MEDIUM

Suggested Fix

Wrap the mark_skipped call in its own try/except block to catch potential Redis connection errors. Log the error for observability but do not re-raise it, allowing the function to return the successfully computed triage results. This ensures that failures in the caching optimization do not cause the loss of primary results.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent. Verify if this is a real issue. If it is, propose a fix; if not, explain why it's
not valid.

Location: src/sentry/tasks/seer/night_shift/agentic_triage.py#L195-L197

Potential issue: The `mark_skipped` function is called outside the `try/except` block
that wraps the expensive Seer agent interactions. If a Redis connection error occurs
during this call, the exception is not handled locally. It propagates up to the
`run_night_shift_execution` function, which then marks the entire run as failed and
discards all the triage results (e.g., `AUTOFIX`, `ROOT_CAUSE_ONLY`) that were
successfully generated by the agent. This wastes significant LLM computation due to a
failure in a non-critical optimization step.

Did we get this right? 👍 / 👎 to inform future reviews.


return [
TriageResult(group=groups_by_id[v.group_id], action=v.action, reason=v.reason)
for v in triage_response.verdicts
Expand Down
7 changes: 7 additions & 0 deletions src/sentry/tasks/seer/night_shift/simple_triage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sentry.seer.autofix.utils import is_issue_category_eligible
from sentry.snuba.referrer import Referrer
from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult
from sentry.tasks.seer.night_shift.skip_cache import recently_skipped
from sentry.types.group import PriorityLevel

logger = logging.getLogger("sentry.tasks.seer.night_shift")
Expand Down Expand Up @@ -65,16 +66,22 @@ def fixability_score_strategy(
referrer=Referrer.SEER_NIGHT_SHIFT_FIXABILITY_SCORE_STRATEGY.value,
)

skipped_ids = recently_skipped(g.id for g in result.results)

logger.info(
"night_shift.search_results",
extra={
"num_projects": len(projects),
"num_results": len(result.results),
"num_skip_filtered": len(skipped_ids),
"num_kept_after_skip_filter": len(result.results) - len(skipped_ids),
},
)

candidates: list[ScoredCandidate] = []
for group in result.results:
if group.id in skipped_ids:
continue
if not is_issue_category_eligible(group):
continue

Expand Down
38 changes: 38 additions & 0 deletions src/sentry/tasks/seer/night_shift/skip_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from __future__ import annotations

from collections.abc import Iterable
from datetime import timedelta

from redis.client import StrictRedis
from rediscluster import RedisCluster

from sentry.utils.redis import redis_clusters

# Padded past 3 days so nightly-run jitter can't expire a key right at the
# 3-day boundary; guarantees the next 3 nightly runs suppress the issue.
SKIP_TTL_SECONDS = int(timedelta(days=3, hours=12).total_seconds())
KEY_PREFIX = "seer:night-shift:skip:"


def mark_skipped(group_id: int) -> None:
_client().set(key(group_id), "1", ex=SKIP_TTL_SECONDS)


def recently_skipped(group_ids: Iterable[int]) -> set[int]:
ids = list(group_ids)
if not ids:
return set()

pipeline = _client().pipeline()
for gid in ids:
pipeline.get(key(gid))
values = pipeline.execute()
return {gid for gid, val in zip(ids, values) if val is not None}


def key(group_id: int) -> str:
return f"{KEY_PREFIX}{group_id}"


def _client() -> RedisCluster[str] | StrictRedis[str]:
return redis_clusters.get("default")
48 changes: 48 additions & 0 deletions tests/sentry/tasks/seer/night_shift/test_skip_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from datetime import timedelta

from sentry.tasks.seer.night_shift.skip_cache import (
SKIP_TTL_SECONDS,
key,
mark_skipped,
recently_skipped,
)
from sentry.utils.redis import redis_clusters


def _delete(group_id: int) -> None:
redis_clusters.get("default").delete(key(group_id))


def test_mark_then_recently_skipped_returns_id() -> None:
try:
mark_skipped(101)
assert recently_skipped([101]) == {101}
finally:
_delete(101)


def test_recently_skipped_returns_only_marked_ids() -> None:
try:
mark_skipped(202)
assert recently_skipped([202, 203, 204]) == {202}
finally:
_delete(202)


def test_recently_skipped_empty_input() -> None:
assert recently_skipped([]) == set()


def test_ttl_padded_past_three_days() -> None:
try:
mark_skipped(305)
ttl = redis_clusters.get("default").ttl(key(305))
assert int(timedelta(days=3).total_seconds()) < ttl <= SKIP_TTL_SECONDS
finally:
_delete(305)


def test_deleted_key_no_longer_recently_skipped() -> None:
mark_skipped(406)
_delete(406)
assert recently_skipped([406]) == set()
31 changes: 30 additions & 1 deletion tests/sentry/tasks/seer/test_night_shift.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
)
from sentry.tasks.seer.night_shift.models import TriageAction
from sentry.tasks.seer.night_shift.simple_triage import fixability_score_strategy
from sentry.tasks.seer.night_shift.skip_cache import key as skip_cache_key
from sentry.tasks.seer.night_shift.skip_cache import mark_skipped
from sentry.testutils.cases import SnubaTestCase, TestCase
from sentry.testutils.helpers.datetime import before_now
from sentry.testutils.pytest.fixtures import django_db_all
from sentry.utils.redis import redis_clusters


class FakeExplorerClient:
Expand Down Expand Up @@ -500,16 +503,42 @@ def test_skips_autofix_for_skip_candidates(self) -> None:
project, "skip-me", seer_fixability_score=0.9, times_seen=5
)

with self._patched_night_shift([(group.id, "skip")]) as (mock_trigger, mock_logger):
with (
self._patched_night_shift([(group.id, "skip")]) as (mock_trigger, mock_logger),
patch("sentry.tasks.seer.night_shift.agentic_triage.mark_skipped") as mock_mark_skipped,
):
run_night_shift_for_org(org.id)

mock_trigger.assert_not_called()
log_calls = [call.args[0] for call in mock_logger.info.call_args_list]
assert "night_shift.no_fixable_candidates" in log_calls
mock_mark_skipped.assert_called_once_with(group.id)

run = SeerNightShiftRun.objects.get(organization=org)
assert not SeerNightShiftRunResult.objects.filter(run=run).exists()

def test_filters_recently_skipped_groups(self) -> None:
org = self.create_organization()
project = self.create_project(organization=org)
self._make_eligible(project)

skipped_group = self._store_event_and_update_group(
project, "already-skipped", seer_fixability_score=0.9, times_seen=5
)
other_group = self._store_event_and_update_group(
project, "fresh", seer_fixability_score=0.9, times_seen=5
)

mark_skipped(skipped_group.id)
try:
with self._patched_night_shift([(other_group.id, "autofix")]) as (mock_trigger, _):
run_night_shift_for_org(org.id)
finally:
redis_clusters.get("default").delete(skip_cache_key(skipped_group.id))

mock_trigger.assert_called_once()
assert mock_trigger.call_args.kwargs["group"].id == other_group.id

def test_skips_autofix_when_no_seer_quota(self) -> None:
org = self.create_organization()
project = self.create_project(organization=org)
Expand Down
Loading