Skip to content

Commit 63fa406

Browse files
authored
Merge pull request #757 from matysek/mzibrick-ccxdev15098-insights_core-workaround
[CCXDEV-16284] Implement memleak workaround from insights-core
2 parents 06ca29f + 72d4f0a commit 63fa406

3 files changed

Lines changed: 338 additions & 3 deletions

File tree

ccx_messaging/monitored_broker.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Utility broker to improve Ssentry exception handling."""
1+
"""Utility broker to improve Sentry exception handling."""
22

33
import logging
44

@@ -22,4 +22,40 @@ def add_exception(self, component, ex, tb=None):
2222
# prevent MissingRequirements and ContentException from being sent to sentry
2323
if not isinstance(ex, (MissingRequirements, ContentException)):
2424
self.logger.debug("Sending exception to Sentry: %s", type(ex))
25-
capture_exception(ex)
25+
# Sentry failure must not prevent traceback cleanup below;
26+
# otherwise the memory leak silently returns in production.
27+
try:
28+
capture_exception(ex)
29+
except Exception:
30+
self.logger.exception("Failed to send exception to Sentry")
31+
32+
# Break circular reference chain to prevent memory leak.
33+
# TODO: remove when this is merged: https://github.com/RedHatInsights/insights-core/pull/4763
34+
# Python attaches a live traceback to ex.__traceback__ which
35+
# references the stack frame (including the broker), creating:
36+
# broker.exceptions -> ex -> __traceback__ -> frame -> broker
37+
# The formatted traceback string is already stored in
38+
# broker.tracebacks via super().add_exception(), and
39+
# capture_exception() above receives the live traceback for
40+
# error-tracking systems (Sentry/GlitchTip), so no debugging
41+
# information is lost.
42+
if isinstance(ex, BaseException):
43+
# Defensive check: add_exception() is public API that could receive
44+
# invalid input from plugin code (e.g., strings, None). Only process
45+
# actual exceptions to prevent crashes downstream.
46+
# Walk the full exception chain (explicit __cause__ from
47+
# `raise X from Y` and implicit __context__) clearing
48+
# __traceback__ at every level. Uses iterative depth-first
49+
# traversal (not recursion) to handle arbitrarily deep chains
50+
# without stack overflow. A `seen` set guards against cycles
51+
# so the loop always terminates.
52+
seen = set()
53+
stack = [ex]
54+
while stack:
55+
cur = stack.pop()
56+
if not isinstance(cur, BaseException) or id(cur) in seen:
57+
continue
58+
seen.add(id(cur))
59+
cur.__traceback__ = None
60+
stack.append(getattr(cur, "__cause__", None))
61+
stack.append(getattr(cur, "__context__", None))

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ dependencies = [
2828
"boto3>=1.42.91,<1.42.92", # limited due to incompatibility between aiobotocore and newer versions of boto
2929
"confluent-kafka>=2.0.0",
3030
"insights-core>=3.1.2",
31-
"insights-core-messaging>=1.2.15",
31+
"insights-core-messaging @ git+https://github.com/RedHatInsights/insights-core-messaging@1.2.21",
3232
"jsonschema>=4.0.0",
3333
"prometheus-client>=0.16.0",
3434
"python-json-logger>=2.0.7,<5",

test/test_traceback_leak.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
"""Tests for memory leak fix in SentryMonitoredBroker traceback handling."""
2+
3+
import gc
4+
import traceback
5+
import weakref
6+
from unittest.mock import patch
7+
8+
from insights.core.dr import MissingRequirements
9+
from insights.core.spec_factory import ContentException
10+
11+
from ccx_messaging.monitored_broker import SentryMonitoredBroker
12+
13+
14+
def _make_exception_with_traceback(exc_type=Exception, msg="test exception"):
15+
"""Raise and catch an exception so it has a real __traceback__."""
16+
try:
17+
raise exc_type(msg)
18+
except exc_type as ex:
19+
tb_string = traceback.format_exc()
20+
return ex, tb_string
21+
22+
23+
def test_traceback_cleared_keyboard_interrupt():
24+
"""Verify __traceback__ is cleared for BaseException subclasses.
25+
26+
The implementation uses `isinstance(ex, BaseException)` (not Exception),
27+
which includes system exceptions like KeyboardInterrupt, SystemExit, and
28+
GeneratorExit. These can occur during graceful shutdown, signal handling,
29+
or async task cancellation. This test verifies the full BaseException
30+
hierarchy is handled correctly, not just regular exceptions.
31+
32+
KeyboardInterrupt exception is in some frameworks mapped to signal SIGTERM
33+
for graceful shutdown.
34+
"""
35+
broker = SentryMonitoredBroker()
36+
ex, tb = _make_exception_with_traceback(KeyboardInterrupt, "interrupt")
37+
assert ex.__traceback__ is not None
38+
39+
with patch("ccx_messaging.monitored_broker.capture_exception"):
40+
broker.add_exception("component", ex, tb)
41+
42+
assert ex.__traceback__ is None
43+
44+
45+
def test_formatted_traceback_preserved():
46+
"""The formatted traceback string must still be in broker.tracebacks."""
47+
broker = SentryMonitoredBroker()
48+
ex, tb = _make_exception_with_traceback(Exception, "preserved test")
49+
50+
with patch("ccx_messaging.monitored_broker.capture_exception"):
51+
broker.add_exception("component", ex, tb)
52+
53+
assert broker.tracebacks[ex] == tb
54+
assert "preserved test" in broker.tracebacks[ex]
55+
56+
57+
def test_sentry_called_with_traceback():
58+
"""capture_exception must be called while __traceback__ is still set."""
59+
broker = SentryMonitoredBroker()
60+
ex, tb = _make_exception_with_traceback(Exception, "sentry test")
61+
62+
traceback_was_set = []
63+
64+
def mock_capture(captured_ex):
65+
traceback_was_set.append(captured_ex.__traceback__ is not None)
66+
67+
with patch(
68+
"ccx_messaging.monitored_broker.capture_exception",
69+
side_effect=mock_capture,
70+
):
71+
broker.add_exception("component", ex, tb)
72+
73+
assert traceback_was_set == [True], (
74+
"capture_exception must be called before __traceback__ is cleared"
75+
)
76+
assert ex.__traceback__ is None
77+
78+
79+
def test_sentry_not_called_for_excluded_types():
80+
"""MissingRequirements and ContentException skip Sentry but still clear traceback."""
81+
broker = SentryMonitoredBroker()
82+
83+
# Don't use _make_exception_with_traceback helper here because these
84+
# insights-core exception types have different constructor signatures:
85+
# MissingRequirements expects a list, ContentException expects a string.
86+
# The inline approach is clearer than abstracting a helper with complex
87+
# parameter handling for just two test cases.
88+
for exc_type, msg in [
89+
(MissingRequirements, ["req"]),
90+
(ContentException, "content"),
91+
]:
92+
ex = exc_type(msg)
93+
try:
94+
raise ex
95+
except type(ex):
96+
tb = traceback.format_exc()
97+
98+
with patch("ccx_messaging.monitored_broker.capture_exception") as mock:
99+
broker.add_exception("component", ex, tb)
100+
assert not mock.called
101+
assert ex.__traceback__ is None
102+
103+
104+
def test_traceback_cleared_even_when_sentry_fails():
105+
"""Traceback must be cleared even if capture_exception raises."""
106+
broker = SentryMonitoredBroker()
107+
ex, tb = _make_exception_with_traceback(Exception, "sentry failure")
108+
109+
with patch(
110+
"ccx_messaging.monitored_broker.capture_exception",
111+
side_effect=RuntimeError("sentry unavailable"),
112+
):
113+
broker.add_exception("component", ex, tb)
114+
115+
assert ex.__traceback__ is None
116+
117+
118+
def test_chained_exception_tracebacks():
119+
"""Chained exception tracebacks (__cause__, __context__) should also be cleared."""
120+
broker = SentryMonitoredBroker()
121+
caught = None
122+
try:
123+
try:
124+
raise ValueError("original")
125+
except ValueError as orig:
126+
raise RuntimeError("wrapper") from orig
127+
except RuntimeError as ex:
128+
tb = traceback.format_exc()
129+
caught = ex
130+
131+
assert caught.__traceback__ is not None
132+
assert caught.__cause__.__traceback__ is not None
133+
134+
with patch("ccx_messaging.monitored_broker.capture_exception"):
135+
broker.add_exception("component", caught, tb)
136+
137+
assert caught.__traceback__ is None
138+
assert caught.__cause__.__traceback__ is None
139+
140+
141+
def test_implicit_chained_exception_context():
142+
"""Implicit chaining (__context__) should also have traceback cleared."""
143+
broker = SentryMonitoredBroker()
144+
caught = None
145+
try:
146+
try:
147+
raise ValueError("original")
148+
except ValueError:
149+
raise RuntimeError("during handling") # noqa: B904
150+
except RuntimeError as ex:
151+
tb = traceback.format_exc()
152+
caught = ex
153+
154+
assert caught.__traceback__ is not None
155+
assert caught.__context__.__traceback__ is not None
156+
157+
with patch("ccx_messaging.monitored_broker.capture_exception"):
158+
broker.add_exception("component", caught, tb)
159+
160+
assert caught.__traceback__ is None
161+
assert caught.__context__.__traceback__ is None
162+
163+
164+
def test_deep_chained_exception_tracebacks():
165+
"""Multi-level exception chains must have all tracebacks cleared."""
166+
broker = SentryMonitoredBroker()
167+
caught = None
168+
try:
169+
try:
170+
try:
171+
raise ValueError("root cause")
172+
except ValueError as e1:
173+
raise TypeError("middle") from e1
174+
except TypeError as e2:
175+
raise RuntimeError("outer") from e2
176+
except RuntimeError as ex:
177+
tb = traceback.format_exc()
178+
caught = ex
179+
180+
assert caught.__traceback__ is not None
181+
assert caught.__cause__.__traceback__ is not None
182+
assert caught.__cause__.__cause__.__traceback__ is not None
183+
184+
with patch("ccx_messaging.monitored_broker.capture_exception"):
185+
broker.add_exception("component", caught, tb)
186+
187+
assert caught.__traceback__ is None
188+
assert caught.__cause__.__traceback__ is None
189+
assert caught.__cause__.__cause__.__traceback__ is None
190+
191+
192+
def test_multiple_exceptions_all_tracebacks_cleared():
193+
"""All tracebacks must be cleared when multiple exceptions are added to one broker."""
194+
broker = SentryMonitoredBroker()
195+
exceptions = []
196+
197+
for i in range(5):
198+
ex, tb = _make_exception_with_traceback(Exception, f"error {i}")
199+
with patch("ccx_messaging.monitored_broker.capture_exception"):
200+
broker.add_exception(f"component_{i}", ex, tb)
201+
exceptions.append(ex)
202+
203+
for ex in exceptions:
204+
assert ex.__traceback__ is None, f"Traceback not cleared for: {ex}"
205+
206+
207+
def test_exception_without_traceback():
208+
"""Exceptions that were never raised (no __traceback__) should not crash."""
209+
broker = SentryMonitoredBroker()
210+
ex = Exception("never raised")
211+
assert ex.__traceback__ is None
212+
213+
with patch("ccx_messaging.monitored_broker.capture_exception"):
214+
broker.add_exception("component", ex, None)
215+
216+
assert ex.__traceback__ is None
217+
218+
219+
def test_broker_collected_after_add_exception():
220+
"""Verify that brokers are collected by reference counting alone.
221+
222+
The memory leak manifests as circular references that prevent garbage
223+
collection: broker.exceptions -> ex -> __traceback__ -> frame -> broker.
224+
225+
This test disables Python's cyclic garbage collector to verify that
226+
reference counting ALONE (immediate collection) successfully reclaims
227+
brokers after exceptions are added. This proves that clearing
228+
__traceback__ breaks the circular reference chain.
229+
230+
Without the fix, brokers would remain alive until the next GC cycle,
231+
causing memory to accumulate over time in production (where thousands
232+
of exceptions are processed before GC runs).
233+
"""
234+
n_brokers = 5
235+
refs = []
236+
237+
gc.collect()
238+
was_enabled = gc.isenabled()
239+
gc.disable()
240+
241+
try:
242+
for _ in range(n_brokers):
243+
broker = SentryMonitoredBroker()
244+
ex, tb = _make_exception_with_traceback(Exception, "gc test")
245+
246+
with patch("ccx_messaging.monitored_broker.capture_exception"):
247+
broker.add_exception("component", ex, tb)
248+
249+
refs.append(weakref.ref(broker))
250+
del broker, ex, tb
251+
252+
collected = sum(1 for ref in refs if ref() is None)
253+
assert collected == n_brokers, (
254+
f"Only {collected}/{n_brokers} brokers were collected "
255+
"by reference counting alone. Remaining brokers are held "
256+
"by circular references from ex.__traceback__ -> frame -> broker."
257+
)
258+
finally:
259+
if was_enabled:
260+
gc.enable()
261+
gc.collect()
262+
263+
264+
def test_broker_collected_with_chained_exceptions():
265+
"""Brokers must be collectable even when exceptions have __cause__ chains."""
266+
n_brokers = 5
267+
refs = []
268+
269+
gc.collect()
270+
was_enabled = gc.isenabled()
271+
gc.disable()
272+
273+
try:
274+
for _ in range(n_brokers):
275+
broker = SentryMonitoredBroker()
276+
caught = None
277+
try:
278+
try:
279+
raise ValueError("cause")
280+
except ValueError as orig:
281+
raise RuntimeError("effect") from orig
282+
except RuntimeError as ex:
283+
tb = traceback.format_exc()
284+
caught = ex
285+
286+
with patch("ccx_messaging.monitored_broker.capture_exception"):
287+
broker.add_exception("component", caught, tb)
288+
289+
refs.append(weakref.ref(broker))
290+
del broker, caught, tb
291+
292+
collected = sum(1 for ref in refs if ref() is None)
293+
assert collected == n_brokers, (
294+
f"Only {collected}/{n_brokers} brokers collected with chained exceptions"
295+
)
296+
finally:
297+
if was_enabled:
298+
gc.enable()
299+
gc.collect()

0 commit comments

Comments
 (0)