-
Notifications
You must be signed in to change notification settings - Fork 1.4k
fix: dedupe research import retries after timeout #257
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,7 @@ | |
| import logging | ||
| import os | ||
| import time | ||
| from collections.abc import Iterable | ||
| from functools import wraps | ||
| from typing import TYPE_CHECKING | ||
|
|
||
|
|
@@ -76,6 +77,42 @@ def run_async(coro): | |
| return asyncio.run(coro) | ||
|
|
||
|
|
||
| def _source_identity(source: dict) -> tuple[str, str] | None: | ||
| """Return a stable dedupe key for research import sources. | ||
|
|
||
| URL-backed sources dedupe by URL. Deep-research report entries do not have a | ||
| URL, so we fall back to the report body and title to avoid resubmitting the | ||
| same synthesized report after a timeout. | ||
| """ | ||
| url = source.get("url") | ||
| if isinstance(url, str) and url: | ||
| return ("url", url) | ||
|
|
||
| if source.get("result_type") == 5: | ||
| report_markdown = source.get("report_markdown") | ||
| title = source.get("title") | ||
| if isinstance(report_markdown, str) and report_markdown: | ||
| return ("report", f"{title or ''}\n{report_markdown}") | ||
|
|
||
| return None | ||
|
|
||
|
|
||
| def _existing_source_identities(existing_sources: Iterable[object]) -> set[tuple[str, str]]: | ||
| """Build dedupe keys from notebook sources returned by the API.""" | ||
| identities: set[tuple[str, str]] = set() | ||
| for source in existing_sources: | ||
| url = getattr(source, "url", None) | ||
| if isinstance(url, str) and url: | ||
| identities.add(("url", url)) | ||
|
|
||
| result_type = getattr(source, "result_type", None) | ||
| report_markdown = getattr(source, "report_markdown", None) | ||
| if result_type == 5 and isinstance(report_markdown, str) and report_markdown: | ||
| identities.add(("report", f"{getattr(source, 'title', '') or ''}\n{report_markdown}")) | ||
|
|
||
| return identities | ||
|
|
||
|
|
||
| async def import_with_retry( | ||
| client, | ||
| notebook_id: str, | ||
|
|
@@ -96,16 +133,42 @@ async def import_with_retry( | |
| started_at = time.monotonic() | ||
| delay = initial_delay | ||
| attempt = 1 | ||
| pending_sources = list(sources) | ||
|
|
||
| while True: | ||
| try: | ||
| return await client.research.import_sources(notebook_id, task_id, sources) | ||
| return await client.research.import_sources(notebook_id, task_id, pending_sources) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When a retry occurs with a subset of the original sources, the return value of […truncated]. References
|
||
| except RPCTimeoutError: | ||
| elapsed = time.monotonic() - started_at | ||
| remaining = max_elapsed - elapsed | ||
| if remaining <= 0: | ||
| raise | ||
|
|
||
| try: | ||
| existing_sources = await client.sources.list(notebook_id) | ||
| existing_identities = _existing_source_identities(existing_sources) | ||
| pending_sources = [ | ||
| source | ||
| for source in pending_sources | ||
| if (identity := _source_identity(source)) is None | ||
| or identity not in existing_identities | ||
| ] | ||
| if not pending_sources: | ||
| logger.info( | ||
| "IMPORT_RESEARCH timeout for notebook %s but all sources are already present; stopping retries", | ||
| notebook_id, | ||
| ) | ||
| # Preserve the existing CLI contract: if every source already | ||
| # landed during a timed-out attempt, return an empty list | ||
| # rather than fabricating imported records we do not have. | ||
| return [] | ||
| except Exception as e: # pragma: no cover - defensive: retry original pending batch | ||
| logger.debug( | ||
| "Failed to list existing sources before retrying research import for %s: %s", | ||
| notebook_id, | ||
| e, | ||
| ) | ||
|
|
||
| sleep_for = min(delay, max_delay, remaining) | ||
| logger.warning( | ||
| "IMPORT_RESEARCH timed out for notebook %s; retrying in %.1fs (attempt %d, %.1fs elapsed)", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: teng-lin/notebooklm-py
Length of output: 9685
Report deduplication logic will never execute due to missing attributes.
The `Source` dataclass in `src/notebooklm/types.py:484-507` defines the attributes `id`, `title`, `url`, `_type_code`, `created_at`, and `status`, but not `result_type` or `report_markdown`. Lines 108–111 attempt to access these non-existent attributes: `getattr(source, "result_type", None)` always returns `None`, so `result_type == 5` never evaluates to `True`.
URL-based deduplication (lines 104–106) works correctly, but report-type sources will not be deduplicated against existing sources, risking duplicates on retry.
🤖 Prompt for AI Agents