Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions src/notebooklm/_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,24 @@ async def list(self, notebook_id: str) -> list[Source]:
src_id = src[0][0] if isinstance(src[0], list) else src[0]
title = src[1] if len(src) > 1 else None

# Extract URL if present (at src[2][7])
# Extract URL if present (at src[2][7] for web/PDF, src[2][5] for YouTube)
url = None
if len(src) > 2 and isinstance(src[2], list) and len(src[2]) > 7:
url_list = src[2][7]
if isinstance(url_list, list) and len(url_list) > 0:
url = url_list[0]
if len(src) > 2 and isinstance(src[2], list):
if len(src[2]) > 7:
url_list = src[2][7]
if isinstance(url_list, list) and len(url_list) > 0:
url = url_list[0]
Comment on lines +108 to +111
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Validate index-7 URL element type before assignment

At Line 110, url_list[0] is accepted without a string check. A malformed payload can set a non-string URL and bypass the fallback chain.

Proposed patch
-                    if len(src[2]) > 7:
-                        url_list = src[2][7]
-                        if isinstance(url_list, list) and len(url_list) > 0:
-                            url = url_list[0]
+                    if len(src[2]) > 7:
+                        url_list = src[2][7]
+                        if (
+                            isinstance(url_list, list)
+                            and len(url_list) > 0
+                            and isinstance(url_list[0], str)
+                        ):
+                            url = url_list[0]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/notebooklm/_sources.py` around lines 108 - 111, The code assigns url =
url_list[0] without verifying its type; update the block where src is inspected
(the branch that checks len(src[2]) > 7 and sets url_list = src[2][7]) to
validate that url_list is a non-empty list and that its first element is a
string (and optionally non-empty after strip) before assigning to url; if the
element is not a string, skip it and continue the fallback chain (or leave url
unset) so malformed payloads cannot inject non-string values.

if not url and len(src[2]) > 5:
yt_data = src[2][5]
if (
isinstance(yt_data, list)
and len(yt_data) > 0
and isinstance(yt_data[0], str)
):
url = yt_data[0]
if not url and len(src[2]) > 0:
if isinstance(src[2][0], str) and src[2][0].startswith("http"):
url = src[2][0]
Comment on lines +108 to +122
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The extraction logic for src[2][7] should include a type check for the first element (ensuring it is a string), consistent with the validation added for YouTube URLs at src[2][5]. This prevents potential type errors if the API returns non-string data. Additionally, per repository rules, please add a comment explaining this logic as it addresses varying API response formats.

Suggested change
if len(src[2]) > 7:
url_list = src[2][7]
if isinstance(url_list, list) and len(url_list) > 0:
url = url_list[0]
if not url and len(src[2]) > 5:
yt_data = src[2][5]
if isinstance(yt_data, list) and len(yt_data) > 0 and isinstance(yt_data[0], str):
url = yt_data[0]
if not url and len(src[2]) > 0:
if isinstance(src[2][0], str) and src[2][0].startswith("http"):
url = src[2][0]
# Handle varying API response formats for URL extraction.
if len(src[2]) > 7 and isinstance(src[2][7], list) and len(src[2][7]) > 0 and isinstance(src[2][7][0], str):
url = src[2][7][0]
if not url and len(src[2]) > 5 and isinstance(src[2][5], list) and len(src[2][5]) > 0 and isinstance(src[2][5][0], str):
url = src[2][5][0]
if not url and len(src[2]) > 0:
if isinstance(src[2][0], str) and src[2][0].startswith("http"):
url = src[2][0]
References
  1. Add comments to explain complex logic, such as recursive ID extraction, especially when it addresses varying API response formats.


# Extract timestamp from src[2][2] - [seconds, nanoseconds]
created_at = None
Expand Down
18 changes: 17 additions & 1 deletion src/notebooklm/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,11 +582,19 @@ def from_api_response(cls, data: list[Any], notebook_id: str | None = None) -> "
source_id = entry[0][0] if isinstance(entry[0], list) else entry[0]
title = entry[1] if len(entry) > 1 else None

# Try to extract URL if present
# Try to extract URL if present (web/PDF at [2][7], YouTube at [2][5])
url = None
if len(entry) > 2 and isinstance(entry[2], list):
if len(entry[2]) > 7 and isinstance(entry[2][7], list):
url = entry[2][7][0] if entry[2][7] else None
if not url and len(entry[2]) > 5:
yt_data = entry[2][5]
if (
isinstance(yt_data, list)
and len(yt_data) > 0
and isinstance(yt_data[0], str)
):
url = yt_data[0]
Comment on lines 588 to +597
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The medium-nested parsing path is missing the fallback check for entry[2][0] starting with "http", which is present in the deeply-nested path and in _sources.py. Additionally, the extraction of entry[2][7] should be made more robust by verifying that the first element is a string. Please include a comment explaining this logic to handle varying API response formats.

Suggested change
if len(entry[2]) > 7 and isinstance(entry[2][7], list):
url = entry[2][7][0] if entry[2][7] else None
if not url and len(entry[2]) > 5:
yt_data = entry[2][5]
if isinstance(yt_data, list) and len(yt_data) > 0 and isinstance(yt_data[0], str):
url = yt_data[0]
# Handle varying API response formats for URL extraction.
if len(entry[2]) > 7 and isinstance(entry[2][7], list) and len(entry[2][7]) > 0 and isinstance(entry[2][7][0], str):
url = entry[2][7][0]
if not url and len(entry[2]) > 5 and isinstance(entry[2][5], list) and len(entry[2][5]) > 0 and isinstance(entry[2][5][0], str):
url = entry[2][5][0]
if not url and len(entry[2]) > 0 and isinstance(entry[2][0], str) and entry[2][0].startswith("http"):
url = entry[2][0]
References
  1. Add comments to explain complex logic, such as recursive ID extraction, especially when it addresses varying API response formats.


return cls(id=str(source_id), title=title, url=url, _type_code=None)

Expand All @@ -598,6 +606,14 @@ def from_api_response(cls, data: list[Any], notebook_id: str | None = None) -> "
url_list = entry[2][7]
if isinstance(url_list, list) and len(url_list) > 0:
url = url_list[0]
if not url and len(entry[2]) > 5:
yt_data = entry[2][5]
if (
isinstance(yt_data, list)
and len(yt_data) > 0
and isinstance(yt_data[0], str)
):
url = yt_data[0]
if not url and len(entry[2]) > 0:
if isinstance(entry[2][0], str) and entry[2][0].startswith("http"):
url = entry[2][0]
Expand Down
56 changes: 56 additions & 0 deletions tests/integration/test_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,62 @@ async def test_list_sources(
assert sources[0].kind == "web_page"
assert sources[0].url == "https://example.com"
assert sources[2].kind == "youtube"
assert sources[2].url == "https://youtube.com/watch?v=abc"

@pytest.mark.asyncio
async def test_list_sources_youtube_url_at_index_5(
self,
auth_tokens,
httpx_mock: HTTPXMock,
build_rpc_response,
):
"""Test that YouTube source URLs stored at src[2][5] are extracted correctly.

The NotebookLM API stores YouTube URLs at index 5 of the source metadata
array as [url, video_id, channel_name], not at index 7 like web/PDF sources.
"""
response = build_rpc_response(
RPCMethod.GET_NOTEBOOK,
[
[
"Test Notebook",
[
[
["src_yt"],
"YouTube Video",
[
None,
11,
[1704240000, 0],
None,
9, # YOUTUBE type code
[
"https://www.youtube.com/watch?v=dcWU-qD8ISQ",
"dcWU-qD8ISQ",
"john newquist",
],
None,
None, # index 7 is None for YouTube sources
],
[None, 2],
],
],
"nb_123",
"📘",
None,
[None, None, None, None, None, [1704067200, 0]],
]
],
)
httpx_mock.add_response(content=response.encode())

async with NotebookLMClient(auth_tokens) as client:
sources = await client.sources.list("nb_123")

assert len(sources) == 1
assert sources[0].id == "src_yt"
assert sources[0].kind == "youtube"
assert sources[0].url == "https://www.youtube.com/watch?v=dcWU-qD8ISQ"

@pytest.mark.asyncio
async def test_list_sources_empty(
Expand Down
34 changes: 34 additions & 0 deletions tests/unit/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,40 @@ def test_from_api_response_youtube_source(self):
assert source.kind == SourceType.YOUTUBE
assert source.kind == "youtube" # str enum comparison

def test_from_api_response_youtube_url_at_index_5(self):
"""Test that YouTube sources store URL at src[2][5], not src[2][7].

The NotebookLM API stores YouTube URLs at index 5 of the metadata array
as [url, video_id, channel_name], while web/PDF sources use index 7.
"""
data = [
[
[
["src_yt2"],
"YouTube Video Title",
[
None,
None,
None,
None,
9,
[
"https://www.youtube.com/watch?v=dcWU-qD8ISQ",
"dcWU-qD8ISQ",
"john newquist",
],
None,
None,
],
]
]
]
source = Source.from_api_response(data)

assert source.id == "src_yt2"
assert source.url == "https://www.youtube.com/watch?v=dcWU-qD8ISQ"
assert source.kind == SourceType.YOUTUBE

def test_from_api_response_web_page_source(self):
"""Test that web page sources are parsed with type code 5."""
data = [
Expand Down
Loading