Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 54 additions & 1 deletion src/notebooklm/cli/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,51 @@ def run_async(coro):
return asyncio.run(coro)


def filter_unsupported_sources(sources: list[dict], *, json_output: bool = False) -> list[dict]:
    """Pre-filter sources that NotebookLM cannot import as web pages.

    Removes direct PDF/document URLs and known bot-protected download paths
    that consistently result in error status after import.

    Args:
        sources: Source dicts, each expected to carry a ``"url"`` key.
        json_output: When True, suppress the console notice about skipped
            sources (JSON mode must not print human-readable text).

    Returns:
        The sources safe to import, in their original order.
    """
    from urllib.parse import urlsplit  # local import: only needed here

    document_extensions = (".pdf", ".docx", ".xlsx", ".pptx", ".zip")
    # Patterns are lowercase; the URL *path* is lowercased before matching so
    # mixed-case paths such as "/SharedDocs/Downloads/" are caught as well.
    blocked_patterns = (
        "/fileadmin/",
        "/download/",
        "/sites/default/files/",
        "/shareddocs/downloads/",
    )

    filtered: list[dict] = []
    skipped: list[dict] = []
    for s in sources:
        # Match against the parsed path only, so query strings and fragments
        # (e.g. "file.pdf?dl=1") cannot defeat the extension check.
        path = urlsplit(s.get("url") or "").path.lower()
        if path.endswith(document_extensions) or any(p in path for p in blocked_patterns):
            skipped.append(s)
        else:
            filtered.append(s)

    if skipped and not json_output:
        console.print(
            f"[dim]Skipping {len(skipped)} unsupported source(s) (PDFs/downloads) before import[/dim]"
        )
    return filtered


async def cleanup_error_sources(client, notebook_id: str, *, json_output: bool = False) -> int:
    """Delete all sources with error status from a notebook.

    Args:
        client: NotebookLM client exposing ``sources.list`` / ``sources.delete``.
        notebook_id: Notebook whose failed sources should be removed.
        json_output: When True, suppress the human-readable console notice.

    Returns:
        The number of sources actually deleted; failed deletions are not
        counted. Returns 0 if the source listing itself fails.
    """

    def _field(source, name):
        # client.sources.list() items may be Source objects (attribute
        # access) or plain dicts depending on the client version — support
        # both so a representation change cannot silently no-op the cleanup.
        if isinstance(source, dict):
            return source.get(name)
        return getattr(source, name, None)

    try:
        sources = await client.sources.list(notebook_id)
    except Exception:
        # Best-effort cleanup: do not fail the import flow, but make the
        # failure visible instead of swallowing it silently.
        logger.warning("Could not list sources for cleanup in notebook %s", notebook_id, exc_info=True)
        return 0

    error_ids = [
        _field(s, "id")
        for s in sources
        if _field(s, "status") == "error" and _field(s, "id")
    ]

    removed = 0
    for source_id in error_ids:
        try:
            await client.sources.delete(notebook_id, source_id)
        except Exception:
            logger.warning("Failed to delete error source %s in notebook %s", source_id, notebook_id, exc_info=True)
        else:
            removed += 1  # count only deletions that actually succeeded

    if removed and not json_output:
        console.print(f"[dim]Removed {removed} failed source(s) after import[/dim]")
    return removed
Comment on lines +109 to +121
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

client.sources.list() isn't returning dicts here.

Elsewhere in this module the same API is consumed via item.id / item.title. These .get("status") / .get("id") calls will raise AttributeError, and the blanket except then turns cleanup into a silent no-op. After fixing that access, please also scope deletion to the current import batch and count only successful deletes; otherwise error_sources_removed can include older notebook errors and failed deletions.

🧰 Tools
🪛 Ruff (0.15.7)

[error] 115-116: try-except-pass detected, consider logging the exception

(S110)


[warning] 115-115: Do not catch blind exception: Exception

(BLE001)


[warning] 120-120: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 109 - 121, client.sources.list()
yields objects with attributes, not dicts, so change the filtering to iterate
sources and use attribute access (e.g., source.status == "error" and source.id)
instead of .get(...) and when calling client.sources.delete only delete sources
that belong to the current import batch by comparing the source's batch
identifier property (e.g., source.import_batch_id or source.batch_id) to the
current batch id passed into the function; perform deletion inside a try/except
per-source, increment the removed count only on successful deletes, and return
that successful-deletes count (use client.sources.list, source.status,
source.id, source.<batch_id_field>, and client.sources.delete to locate
changes).



async def import_with_retry(
client,
notebook_id: str,
Expand All @@ -96,17 +141,25 @@ async def import_with_retry(
started_at = time.monotonic()
delay = initial_delay
attempt = 1
pending_sources = list(sources)

while True:
try:
return await client.research.import_sources(notebook_id, task_id, sources)
return await client.research.import_sources(notebook_id, task_id, pending_sources)
Comment on lines +144 to +148
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't return only the last import_sources() response.

src/notebooklm/_research.py:297-350 already documents that import_sources() may return fewer items than were actually imported. After a timeout, pending_sources only holds the leftovers, so this helper can under-report imports or even return [] when the first timed-out attempt already finished. Rebuild the final result from client.sources.list(notebook_id) for the original URL set, and short-circuit when pending_sources becomes empty; otherwise both the CLI count and JSON imported_sources stay wrong after retries.

Also applies to: 156-160

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 144 - 148, The loop currently
returns the raw result of client.research.import_sources(notebook_id, task_id,
pending_sources) which may omit items that were imported during a timed-out
attempt; instead, after each import_sources call fetch the canonical state with
client.sources.list(notebook_id) and reconcile that against the original URL set
to build the final imported_sources list, update pending_sources by removing
URLs now present, and short-circuit/return once pending_sources is empty; apply
the same reconciliation logic to the similar loop at lines 156-160 so the CLI
and JSON output reflect the true imported set rather than only the last
import_sources response.

except RPCTimeoutError:
elapsed = time.monotonic() - started_at
remaining = max_elapsed - elapsed
if remaining <= 0:
raise

sleep_for = min(delay, max_delay, remaining)
# Filter out sources already imported to avoid duplicates on retry
try:
existing = await client.sources.list(notebook_id)
existing_urls = {s.get("url") for s in existing if s.get("url")}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Similar to the issue in cleanup_error_sources, existing is a list of Source objects. Calling .get("url") will result in an AttributeError. Use the .url attribute instead.

Suggested change
existing_urls = {s.get("url") for s in existing if s.get("url")}
existing_urls = {s.url for s in existing if s.url}

pending_sources = [s for s in pending_sources if s.get("url") not in existing_urls]
except Exception:
pass # If listing fails, retry with original list
Comment on lines +156 to +162
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Retry dedup currently never removes anything.

client.sources.list() is used as an object list elsewhere in this file, so .get("url") throws here too. Because the exception is swallowed, pending_sources stays unchanged and every RPC timeout still retries the full batch, which recreates the duplicate-import bug this PR is trying to fix.

🧰 Tools
🪛 Ruff (0.15.7)

[error] 161-162: try-except-pass detected, consider logging the exception

(S110)


[warning] 161-161: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/cli/helpers.py` around lines 156 - 162, client.sources.list()
returns objects, not dicts, so using s.get("url") raises and the except swallows
the error leaving pending_sources unchanged; change the existing set
construction to use attribute access (e.g., existing_urls = {getattr(s, "url",
None) for s in existing if getattr(s, "url", None)}) and then filter
pending_sources with the existing_urls (pending_sources = [s for s in
pending_sources if s.get("url") not in existing_urls]); also avoid silently
swallowing errors from client.sources.list() — at minimum log the exception in
the except block so failures are visible.

logger.warning(
"IMPORT_RESEARCH timed out for notebook %s; retrying in %.1fs (attempt %d, %.1fs elapsed)",
notebook_id,
Expand Down
11 changes: 9 additions & 2 deletions src/notebooklm/cli/research.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

from ..client import NotebookLMClient
from .helpers import (
cleanup_error_sources,
console,
display_report,
display_research_sources,
filter_unsupported_sources,
import_with_retry,
json_output_response,
require_notebook,
Expand Down Expand Up @@ -188,16 +190,19 @@ async def _run():
"report": report,
}
if import_all and sources and task_id:
filtered_sources = filter_unsupported_sources(sources, json_output=True)
imported = await import_with_retry(
client,
nb_id_resolved,
task_id,
sources,
filtered_sources,
max_elapsed=timeout,
json_output=True,
)
removed = await cleanup_error_sources(client, nb_id_resolved, json_output=True)
result["imported"] = len(imported)
result["imported_sources"] = imported
result["error_sources_removed"] = removed
json_output_response(result)
else:
console.print(f"[green]✓ Research completed:[/green] {query}")
Expand All @@ -206,14 +211,16 @@ async def _run():
display_report(report)

if import_all and sources and task_id:
filtered_sources = filter_unsupported_sources(sources)
with console.status("Importing sources..."):
imported = await import_with_retry(
client,
nb_id_resolved,
task_id,
sources,
filtered_sources,
max_elapsed=timeout,
)
console.print(f"[green]Imported {len(imported)} sources[/green]")
await cleanup_error_sources(client, nb_id_resolved)

return _run()
Loading