Skip to content

Commit a00e997

Browse files
authored
Fix committer lag of one record when NoSqlTarget is used (#548)
`NoSqlTarget` must not keep a reference to the last event it processed, or else the GC can't collect it, and its offset won't be committed. [ML-4421](https://iguazio.atlassian.net/browse/ML-4421)
1 parent 77e57d8 commit a00e997

File tree

2 files changed

+50
-0
lines changed

2 files changed

+50
-0
lines changed

storey/table.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ async def _persist_worker(self):
432432
for done_job in self._pending_by_key[job.key].in_flight:
433433
if done_job.callback:
434434
await done_job.callback(done_job.extra_data, completed)
435+
del done_job # Allow job to be garbage collected
435436
self._pending_by_key[job.key].in_flight = []
436437

437438
# If we got more pending events for the same key process them
@@ -444,8 +445,14 @@ async def _persist_worker(self):
444445
jobs_at_tail = self_sent_jobs.get(tail_position, [])
445446
jobs_at_tail.append((job, asyncio.get_running_loop().create_task(future_task)))
446447
self_sent_jobs[tail_position] = jobs_at_tail
448+
# Allow these jobs to be garbage collected
449+
del jobs_at_tail
447450
else:
448451
del self._pending_by_key[job.key]
452+
453+
# Allow job to be garbage collected
454+
del job
455+
task = None
449456
except BaseException as ex:
450457
if task and task is not _termination_obj:
451458
if task[0].extra_data and task[0].extra_data._awaitable_result:

tests/test_flow.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,49 @@ def test_async_offset_commit_before_termination():
319319
asyncio.run(async_offset_commit_before_termination())
320320

321321

322+
async def async_offset_commit_before_termination_with_nosqltarget():
323+
platform = Committer()
324+
context = CommitterContext(platform)
325+
326+
max_wait_before_commit = 1
327+
328+
controller = build_flow(
329+
[
330+
AsyncEmitSource(context=context, explicit_ack=True, max_wait_before_commit=max_wait_before_commit),
331+
Map(lambda x: x + 1),
332+
Filter(lambda x: x < 3),
333+
FlatMap(lambda x: [x, x * 10]),
334+
NoSqlTarget(Table("/", NoopDriver(), flush_interval_secs=None)),
335+
]
336+
).run()
337+
338+
num_shards = 10
339+
num_records_per_shard = 10
340+
341+
for offset in range(1, num_records_per_shard + 1):
342+
for shard in range(num_shards):
343+
event = Event(shard, "abc")
344+
event.shard_id = shard
345+
event.offset = offset
346+
await controller.emit(event)
347+
348+
del event
349+
350+
await asyncio.sleep(max_wait_before_commit + 1)
351+
352+
try:
353+
offsets = copy.copy(platform.offsets)
354+
assert offsets == {("/", i): num_records_per_shard for i in range(num_shards)}
355+
finally:
356+
await controller.terminate()
357+
await controller.await_termination()
358+
359+
360+
# ML-4421
361+
def test_async_offset_commit_before_termination_with_nosqltarget():
362+
asyncio.run(async_offset_commit_before_termination_with_nosqltarget())
363+
364+
322365
def test_offset_not_committed_prematurely():
323366
platform = Committer()
324367
context = CommitterContext(platform)

0 commit comments

Comments
 (0)