PostHog API client functions.
"""
44
5- import functools
65from collections .abc import AsyncGenerator
76from datetime import UTC , datetime , timedelta
87from logging import Logger
3029)
3130
3231HOGQL_PAGE_SIZE = 50_000
32+ BACKFILL_TIMEOUT_PERIOD = timedelta (minutes = 5 )
3333
3434
3535# Cache for project IDs per organization (avoids re-fetching on retry).
@@ -160,9 +160,21 @@ async def _query_hogql[T: HogQLEntity[str] | HogQLEntity[int]](
160160 http : HTTPSession ,
161161 log : Logger ,
162162) -> AsyncGenerator [T ]:
163+ log .debug (
164+ "Querying HogQL" ,
165+ {
166+ "table" : model .table_name ,
167+ "project_id" : project_id ,
168+ "start" : start_date ,
169+ "end" : end_date ,
170+ },
171+ )
172+
163173 url = model .get_api_endpoint_url (base_url , project_id )
164174 column_names = await _get_hogql_columns (model , base_url , project_id , http , log )
165175
176+ coalesced_cursor_fields = f'COALESCE({ "," .join (model .cursor_columns )} )'
177+
166178 serialized_start_date = start_date .astimezone (UTC ).replace (tzinfo = None ).isoformat ()
167179 serialized_end_date = (
168180 end_date .astimezone (UTC ).replace (tzinfo = None ).isoformat ()
@@ -171,11 +183,11 @@ async def _query_hogql[T: HogQLEntity[str] | HogQLEntity[int]](
171183 )
172184
173185 start_date_clause = (
174- f"WHERE { model . cursor_column } > "
186+ f"WHERE { coalesced_cursor_fields } > "
175187 + f"toDateTime64('{ serialized_start_date } ', 6, 'UTC') "
176188 )
177189 end_date_clause = (
178- f"AND { model . cursor_column } <= toDateTime64('{ serialized_end_date } ', 6, 'UTC') "
190+ f"AND { coalesced_cursor_fields } <= toDateTime64('{ serialized_end_date } ', 6, 'UTC') "
179191 if end_date is not None
180192 else ""
181193 )
@@ -187,7 +199,7 @@ async def _query_hogql[T: HogQLEntity[str] | HogQLEntity[int]](
187199 + f"FROM { model .table_name } "
188200 + start_date_clause
189201 + end_date_clause
190- + f"ORDER BY { model . cursor_column } DESC "
202+ + f"ORDER BY { coalesced_cursor_fields } ASC "
191203 + f"LIMIT { HOGQL_PAGE_SIZE } " ,
192204 },
193205 }
@@ -299,7 +311,6 @@ async def backfill_project_events(
299311 return
300312
301313 base_url = config .advanced .base_url
302- ctx = ProjectIdValidationContext (project_id = project_id )
303314 new_cursor = start_date
304315 doc_count = 0
305316
@@ -343,7 +354,6 @@ async def fetch_project_events(
343354 assert isinstance (cursor , datetime )
344355
345356 base_url = config .advanced .base_url
346- ctx = ProjectIdValidationContext (project_id = project_id )
347357 now = datetime .now (tz = UTC )
348358 upper_bound = now - horizon if horizon else None
349359
@@ -380,34 +390,98 @@ async def fetch_project_events(
380390 yield new_cursor
381391
382392
383- async def snapshot_persons (
async def backfill_persons(
    http: HTTPSession,
    config: EndpointConfig,
    project_id: int,
    log: Logger,
    page: PageCursor | None,
    cutoff: LogCursor,
) -> AsyncGenerator[Person | PageCursor, None]:
    """Backfill persons for one project up to ``cutoff``.

    Pages through HogQL results starting from the checkpointed ``page``
    cursor (or ``config.start_date`` on the first invocation), yielding
    each ``Person`` followed by an ISO-formatted page cursor when progress
    was made. Stops early after ``BACKFILL_TIMEOUT_PERIOD`` so a
    checkpoint is emitted on long backfills.
    """
    assert isinstance(page, str | None)
    assert isinstance(cutoff, datetime)

    # Resume from the checkpoint when one exists, otherwise start from the
    # configured start date.
    start_date = config.start_date if page is None else datetime.fromisoformat(page)

    if start_date >= cutoff:
        return

    base_url = config.advanced.base_url
    checkpoint = start_date
    emitted = 0
    deadline = datetime.now(tz=UTC) + BACKFILL_TIMEOUT_PERIOD

    paging = True
    while paging:
        rows_in_page = 0

        async for person in _query_hogql(
            Person, checkpoint, cutoff, base_url, project_id, http, log
        ):
            rows_in_page += 1
            emitted += 1
            person_ts = person.get_cursor()
            # Advance the checkpoint monotonically.
            if person_ts > checkpoint:
                checkpoint = person_ts

            yield person

        if rows_in_page < HOGQL_PAGE_SIZE:
            # A short page means there is nothing left before the cutoff.
            paging = False
        elif datetime.now(tz=UTC) > deadline:
            log.info(
                f"{BACKFILL_TIMEOUT_PERIOD.total_seconds() / 60} "
                + "minutes have elapsed, emitting a checkpoint"
            )
            paging = False

    log.info(f"Backfilled {emitted} persons from project {project_id}")

    if checkpoint > start_date:
        yield checkpoint.isoformat()
448+
async def fetch_persons(
    http: HTTPSession,
    config: EndpointConfig,
    project_id: int,
    log: Logger,
    cursor: LogCursor,
) -> AsyncGenerator[Person | LogCursor, None]:
    """Incrementally fetch persons updated after ``cursor`` for one project.

    Drains HogQL pages with no upper bound, yielding each ``Person`` and,
    once the final short page is seen, the advanced cursor (only when
    progress was actually made).
    """
    assert isinstance(cursor, datetime)

    base_url = config.advanced.base_url
    checkpoint = cursor
    emitted = 0

    while True:
        rows_in_page = 0

        async for person in _query_hogql(
            Person, checkpoint, None, base_url, project_id, http, log
        ):
            rows_in_page += 1
            emitted += 1
            person_ts = person.get_cursor()
            # Advance the checkpoint monotonically.
            if person_ts > checkpoint:
                checkpoint = person_ts

            yield person

        # A short page means we've drained everything currently available.
        if rows_in_page < HOGQL_PAGE_SIZE:
            break

    log.info(f"Fetched {emitted} persons from project {project_id}")

    if checkpoint > cursor:
        yield checkpoint
0 commit comments