Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion src/notebooklm/cli/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,51 @@ def run_async(coro):
return asyncio.run(coro)


def filter_unsupported_sources(sources: list[dict], *, json_output: bool = False) -> list[dict]:
    """Pre-filter sources that NotebookLM cannot import as web pages.

    Removes direct PDF/document URLs and known bot-protected download paths
    that consistently result in error status after import.

    Args:
        sources: Source dicts, each expected to carry a ``"url"`` key.
        json_output: When True, suppress the console notice about skipped
            sources (JSON mode must not print human-readable text).

    Returns:
        The sources safe to import, in their original order.
    """
    from urllib.parse import urlsplit  # local import: only needed here

    document_extensions = (".pdf", ".docx", ".xlsx", ".pptx", ".zip")
    # Patterns are lowercase; the URL *path* is lowercased before matching so
    # mixed-case paths such as "/SharedDocs/Downloads/" are caught as well.
    blocked_patterns = (
        "/fileadmin/",
        "/download/",
        "/sites/default/files/",
        "/shareddocs/downloads/",
    )

    filtered: list[dict] = []
    skipped: list[dict] = []
    for s in sources:
        # Match against the parsed path only, so query strings and fragments
        # (e.g. "file.pdf?dl=1") cannot defeat the extension check.
        path = urlsplit(s.get("url") or "").path.lower()
        if path.endswith(document_extensions) or any(p in path for p in blocked_patterns):
            skipped.append(s)
        else:
            filtered.append(s)

    if skipped and not json_output:
        console.print(
            f"[dim]Skipping {len(skipped)} unsupported source(s) (PDFs/downloads) before import[/dim]"
        )
    return filtered


async def cleanup_error_sources(client, notebook_id: str, *, json_output: bool = False) -> int:
    """Delete all sources with error status from a notebook.

    Args:
        client: NotebookLM client exposing ``sources.list`` / ``sources.delete``.
        notebook_id: Notebook whose failed sources should be removed.
        json_output: When True, suppress the human-readable console notice.

    Returns:
        The number of sources actually deleted; failed deletions are not
        counted. Returns 0 if the source listing itself fails.
    """

    def _field(source, name):
        # client.sources.list() items may be Source objects (attribute
        # access) or plain dicts depending on the client version — support
        # both so a representation change cannot silently no-op the cleanup.
        if isinstance(source, dict):
            return source.get(name)
        return getattr(source, name, None)

    try:
        sources = await client.sources.list(notebook_id)
    except Exception:
        # Best-effort cleanup: do not fail the import flow, but make the
        # failure visible instead of swallowing it silently.
        logger.warning("Could not list sources for cleanup in notebook %s", notebook_id, exc_info=True)
        return 0

    error_ids = [
        _field(s, "id")
        for s in sources
        if _field(s, "status") == "error" and _field(s, "id")
    ]

    removed = 0
    for source_id in error_ids:
        try:
            await client.sources.delete(notebook_id, source_id)
        except Exception:
            logger.warning("Failed to delete error source %s in notebook %s", source_id, notebook_id, exc_info=True)
        else:
            removed += 1  # count only deletions that actually succeeded

    if removed and not json_output:
        console.print(f"[dim]Removed {removed} failed source(s) after import[/dim]")
    return removed
Comment on lines +109 to +121
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

client.sources.list() isn't returning dicts here.

Elsewhere in this module the same API is consumed via item.id / item.title. These .get("status") / .get("id") calls will raise AttributeError, and the blanket except then turns cleanup into a silent no-op. After fixing that access, please also scope deletion to the current import batch and count only successful deletes; otherwise error_sources_removed can include older notebook errors and failed deletions.

🧰 Tools
🪛 Ruff (0.15.7)

[error] 115-116: try-except-pass detected, consider logging the exception

(S110)


[warning] 115-115: Do not catch blind exception: Exception

(BLE001)


[warning] 120-120: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 109 - 121, client.sources.list()
yields objects with attributes, not dicts, so change the filtering to iterate
sources and use attribute access (e.g., source.status == "error" and source.id)
instead of .get(...) and when calling client.sources.delete only delete sources
that belong to the current import batch by comparing the source's batch
identifier property (e.g., source.import_batch_id or source.batch_id) to the
current batch id passed into the function; perform deletion inside a try/except
per-source, increment the removed count only on successful deletes, and return
that successful-deletes count (use client.sources.list, source.status,
source.id, source.<batch_id_field>, and client.sources.delete to locate
changes).



async def import_with_retry(
client,
notebook_id: str,
Expand All @@ -96,17 +141,25 @@ async def import_with_retry(
started_at = time.monotonic()
delay = initial_delay
attempt = 1
pending_sources = list(sources)

while True:
try:
return await client.research.import_sources(notebook_id, task_id, sources)
return await client.research.import_sources(notebook_id, task_id, pending_sources)
Comment on lines +144 to +148
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't return only the last import_sources() response.

src/notebooklm/_research.py:297-350 already documents that import_sources() may return fewer items than were actually imported. After a timeout, pending_sources only holds the leftovers, so this helper can under-report imports or even return [] when the first timed-out attempt already finished. Rebuild the final result from client.sources.list(notebook_id) for the original URL set, and short-circuit when pending_sources becomes empty; otherwise both the CLI count and JSON imported_sources stay wrong after retries.

Also applies to: 156-160

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 144 - 148, The loop currently
returns the raw result of client.research.import_sources(notebook_id, task_id,
pending_sources) which may omit items that were imported during a timed-out
attempt; instead, after each import_sources call fetch the canonical state with
client.sources.list(notebook_id) and reconcile that against the original URL set
to build the final imported_sources list, update pending_sources by removing
URLs now present, and short-circuit/return once pending_sources is empty; apply
the same reconciliation logic to the similar loop at lines 156-160 so the CLI
and JSON output reflect the true imported set rather than only the last
import_sources response.

except RPCTimeoutError:
elapsed = time.monotonic() - started_at
remaining = max_elapsed - elapsed
if remaining <= 0:
raise

sleep_for = min(delay, max_delay, remaining)
# Filter out sources already imported to avoid duplicates on retry
try:
existing = await client.sources.list(notebook_id)
existing_urls = {s.get("url") for s in existing if s.get("url")}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Similar to the issue in cleanup_error_sources, existing is a list of Source objects. Calling .get("url") will result in an AttributeError. Use the .url attribute instead.

Suggested change
existing_urls = {s.get("url") for s in existing if s.get("url")}
existing_urls = {s.url for s in existing if s.url}

pending_sources = [s for s in pending_sources if s.get("url") not in existing_urls]
except Exception:
pass # If listing fails, retry with original list
Comment on lines +156 to +162
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Retry dedup currently never removes anything.

client.sources.list() is used as an object list elsewhere in this file, so .get("url") throws here too. Because the exception is swallowed, pending_sources stays unchanged and every RPC timeout still retries the full batch, which recreates the duplicate-import bug this PR is trying to fix.

🧰 Tools
🪛 Ruff (0.15.7)

[error] 161-162: try-except-pass detected, consider logging the exception

(S110)


[warning] 161-161: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 156 - 162, client.sources.list()
returns objects, not dicts, so using s.get("url") raises and the except swallows
the error leaving pending_sources unchanged; change the existing set
construction to use attribute access (e.g., existing_urls = {getattr(s, "url",
None) for s in existing if getattr(s, "url", None)}) and then filter
pending_sources with the existing_urls (pending_sources = [s for s in
pending_sources if s.get("url") not in existing_urls]); also avoid silently
swallowing errors from client.sources.list() — at minimum log the exception in
the except block so failures are visible.

logger.warning(
"IMPORT_RESEARCH timed out for notebook %s; retrying in %.1fs (attempt %d, %.1fs elapsed)",
notebook_id,
Expand Down
11 changes: 9 additions & 2 deletions src/notebooklm/cli/research.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

from ..client import NotebookLMClient
from .helpers import (
cleanup_error_sources,
console,
display_report,
display_research_sources,
filter_unsupported_sources,
import_with_retry,
json_output_response,
require_notebook,
Expand Down Expand Up @@ -188,16 +190,19 @@ async def _run():
"report": report,
}
if import_all and sources and task_id:
filtered_sources = filter_unsupported_sources(sources, json_output=True)
imported = await import_with_retry(
client,
nb_id_resolved,
task_id,
sources,
filtered_sources,
max_elapsed=timeout,
json_output=True,
)
removed = await cleanup_error_sources(client, nb_id_resolved, json_output=True)
result["imported"] = len(imported)
result["imported_sources"] = imported
result["error_sources_removed"] = removed
json_output_response(result)
else:
console.print(f"[green]✓ Research completed:[/green] {query}")
Expand All @@ -206,14 +211,16 @@ async def _run():
display_report(report)

if import_all and sources and task_id:
filtered_sources = filter_unsupported_sources(sources)
with console.status("Importing sources..."):
imported = await import_with_retry(
client,
nb_id_resolved,
task_id,
sources,
filtered_sources,
max_elapsed=timeout,
)
console.print(f"[green]Imported {len(imported)} sources[/green]")
await cleanup_error_sources(client, nb_id_resolved)

return _run()
Loading