-
Notifications
You must be signed in to change notification settings - Fork 1.4k
fix: deduplicate imports on retry and clean up error sources after import #242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -76,6 +76,51 @@ def run_async(coro): | |||||
| return asyncio.run(coro) | ||||||
|
|
||||||
|
|
||||||
def filter_unsupported_sources(sources: list[dict], *, json_output: bool = False) -> list[dict]:
    """Pre-filter sources that NotebookLM cannot import as web pages.

    Removes direct PDF/document URLs and known bot-protected download
    locations that consistently result in error status after import.

    Args:
        sources: Source dicts; each may carry a "url" key (possibly None).
        json_output: When True, suppress the human-readable skip notice.

    Returns:
        The sources whose URLs look importable, in their original order.
    """
    # Local import keeps the fix self-contained for this helper.
    from urllib.parse import urlsplit

    document_extensions = (".pdf", ".docx", ".xlsx", ".pptx", ".zip")
    # All patterns lowercase: they are compared against a lowercased path.
    # (The previous mixed-case "/SharedDocs/Downloads/" could never match.)
    blocked_patterns = (
        "/fileadmin/",
        "/download/",
        "/sites/default/files/",
        "/shareddocs/downloads/",
    )

    filtered: list[dict] = []
    skipped: list[dict] = []
    for source in sources:
        # Match on the URL *path* so query strings/fragments cannot hide a
        # document extension (e.g. https://example.com/file.pdf?dl=1).
        path = urlsplit(source.get("url") or "").path.lower()
        if path.endswith(document_extensions) or any(p in path for p in blocked_patterns):
            skipped.append(source)
        else:
            filtered.append(source)

    if skipped and not json_output:
        console.print(
            f"[dim]Skipping {len(skipped)} unsupported source(s) (PDFs/downloads) before import[/dim]"
        )
    return filtered
|
|
||||||
|
|
||||||
async def cleanup_error_sources(client, notebook_id: str, *, json_output: bool = False) -> int:
    """Delete all sources with error status from a notebook.

    Best-effort cleanup: failures to list or delete are logged at debug
    level and swallowed, so a post-import cleanup never breaks the main
    flow (previously these were silent ``except: pass`` blocks).

    Args:
        client: API client exposing ``sources.list`` and ``sources.delete``.
        notebook_id: Notebook to clean up.
        json_output: When True, suppress the human-readable removal notice.

    Returns:
        The number of error sources for which deletion was attempted
        (0 if listing the sources failed).
    """
    # Narrow try: only the listing call is guarded, not the whole body.
    try:
        sources = await client.sources.list(notebook_id)
    except Exception:
        logger.debug(
            "Could not list sources for notebook %s during cleanup", notebook_id, exc_info=True
        )
        return 0

    error_ids = [s.get("id") for s in sources if s.get("status") == "error" and s.get("id")]
    for source_id in error_ids:
        try:
            await client.sources.delete(notebook_id, source_id)
        except Exception:
            # One failed delete must not abort the remaining deletions.
            logger.debug(
                "Failed to delete source %s from notebook %s", source_id, notebook_id, exc_info=True
            )
    if error_ids and not json_output:
        console.print(f"[dim]Removed {len(error_ids)} failed source(s) after import[/dim]")
    return len(error_ids)
|
Comment on lines
+109
to
+121
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Elsewhere in this module the same API is consumed via 🧰 Tools🪛 Ruff (0.15.7)[error] 115-116: (S110) [warning] 115-115: Do not catch blind exception: (BLE001) [warning] 120-120: Do not catch blind exception: (BLE001) 🤖 Prompt for AI Agents |
||||||
|
|
||||||
|
|
||||||
| async def import_with_retry( | ||||||
| client, | ||||||
| notebook_id: str, | ||||||
|
|
@@ -96,17 +141,25 @@ async def import_with_retry( | |||||
| started_at = time.monotonic() | ||||||
| delay = initial_delay | ||||||
| attempt = 1 | ||||||
| pending_sources = list(sources) | ||||||
|
|
||||||
| while True: | ||||||
| try: | ||||||
| return await client.research.import_sources(notebook_id, task_id, sources) | ||||||
| return await client.research.import_sources(notebook_id, task_id, pending_sources) | ||||||
|
Comment on lines
+144
to
+148
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't return only the last
Also applies to: 156-160 🤖 Prompt for AI Agents |
||||||
| except RPCTimeoutError: | ||||||
| elapsed = time.monotonic() - started_at | ||||||
| remaining = max_elapsed - elapsed | ||||||
| if remaining <= 0: | ||||||
| raise | ||||||
|
|
||||||
| sleep_for = min(delay, max_delay, remaining) | ||||||
| # Filter out sources already imported to avoid duplicates on retry | ||||||
| try: | ||||||
| existing = await client.sources.list(notebook_id) | ||||||
| existing_urls = {s.get("url") for s in existing if s.get("url")} | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar to the issue in
Suggested change
|
||||||
| pending_sources = [s for s in pending_sources if s.get("url") not in existing_urls] | ||||||
| except Exception: | ||||||
| pass # If listing fails, retry with original list | ||||||
|
Comment on lines
+156
to
+162
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Retry dedup currently never removes anything.
🧰 Tools🪛 Ruff (0.15.7)[error] 161-162: (S110) [warning] 161-161: Do not catch blind exception: (BLE001) 🤖 Prompt for AI Agents |
||||||
| logger.warning( | ||||||
| "IMPORT_RESEARCH timed out for notebook %s; retrying in %.1fs (attempt %d, %.1fs elapsed)", | ||||||
| notebook_id, | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The check
`url.endswith(PDF_EXTENSIONS)` will fail to identify files if the URL contains query parameters (e.g., `https://example.com/file.pdf?dl=1`). For robust file matching, parse the URL and check the filename directly (e.g., using `pathlib.Path(path).name`) instead of checking for a substring or suffix in the full path.