notebooklm-py/src/notebooklm/_research_task_parser.py at main · teng-lin/notebooklm-py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""POLL_RESEARCH wire-row parsing helpers.

The public typed models (:class:`ResearchSource`, :class:`ResearchTask`,
:class:`ResearchStatus`) live in ``_types/research.py`` (issue #1209); they are
re-exported here so the historical import path
``from ._research_task_parser import ResearchSource, ResearchTask`` keeps
working and this module stays the home of the wire-row parsing logic.
"""

from __future__ import annotations

import logging
from typing import Any

from ._types.research import (
    RESEARCH_RESULT_TYPE_REPORT,
    RESEARCH_RESULT_TYPE_WEB,
    ResearchResultType,
    ResearchSource,
    ResearchStatus,
    ResearchTask,
    parse_result_type,
)
from .rpc import RPCMethod, safe_index

# Public surface of this module: the typed models and result-type helpers
# imported from ``_types.research`` above (re-exported for the historical
# import path) plus the two parser entry points defined in this file.
__all__ = [
    "RESEARCH_RESULT_TYPE_REPORT",
    "RESEARCH_RESULT_TYPE_WEB",
    "ResearchResultType",
    "ResearchSource",
    "ResearchStatus",
    "ResearchTask",
    "parse_research_task_models",
    "parse_research_tasks",
    "parse_result_type",
]

# Module-level logger; the ``_extract_*`` helpers use it to warn when a wire
# row slot holds an unexpected type.
logger = logging.getLogger(__name__)

# Diagnostic context handed to ``safe_index`` (``source=`` / ``method_id=``)
# and echoed in every shape-mismatch warning logged by this module.
_POLL_SOURCE = "_research.poll"
_POLL_METHOD_ID = RPCMethod.POLL_RESEARCH.value


def extract_legacy_report_chunks(src: list[Any]) -> str:
    """Return the legacy deep-research report held in ``src[6]``.

    The seventh slot of a legacy source row is expected to be a list of
    markdown chunks. Non-empty string chunks are joined with a blank line
    between them; anything else in the list is skipped.

    Returns an empty string when the row is too short, when ``src[6]`` is not
    a list, or when it contains no non-empty strings.
    """
    if len(src) < 7:
        return ""

    candidate = src[6]
    if not isinstance(candidate, list):
        return ""

    pieces: list[str] = []
    for piece in candidate:
        # Keep only real text; ``None`` placeholders and empty strings are noise.
        if isinstance(piece, str) and piece:
            pieces.append(piece)
    return "\n\n".join(pieces)


def _extract_task_id(task_data: Any) -> str | None:
    """Read the task identifier from ``task_data[0]``.

    The slot is fetched through ``safe_index``. A string is returned as-is,
    ``None`` is passed through silently, and any other type is reported with
    a warning (type name only) and mapped to ``None``.
    """
    raw_id = safe_index(task_data, 0, method_id=_POLL_METHOD_ID, source=_POLL_SOURCE)
    if raw_id is None:
        return None
    if not isinstance(raw_id, str):
        logger.warning(
            "task_data[0] is not a string (method_id=%r, source=%r): %r",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
            type(raw_id).__name__,
        )
        return None
    return raw_id


def _extract_task_info(task_data: Any) -> list[Any] | None:
    """Read the task-info payload from ``task_data[1]``.

    The slot is fetched through ``safe_index``. A list is returned as-is,
    ``None`` is passed through silently, and any other type is reported with
    a warning (type name only) and mapped to ``None``.
    """
    raw_info = safe_index(task_data, 1, method_id=_POLL_METHOD_ID, source=_POLL_SOURCE)
    if raw_info is None:
        return None
    if not isinstance(raw_info, list):
        logger.warning(
            "task_data[1] is not a list (method_id=%r, source=%r): %r",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
            type(raw_info).__name__,
        )
        return None
    return raw_info


def _extract_query_text(task_info: Any) -> str | None:
    """Read the original query text from ``task_info[1][0]``.

    ``task_info[1]`` must be a list and its first element a string. A missing
    (``None``) value or an empty list yields ``None`` silently; a value of the
    wrong type at either level is logged as a warning and yields ``None``.
    """
    holder = safe_index(task_info, 1, method_id=_POLL_METHOD_ID, source=_POLL_SOURCE)
    if holder is None:
        return None
    if not isinstance(holder, list):
        logger.warning(
            "task_info[1] is not a list (method_id=%r, source=%r): %r",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
            type(holder).__name__,
        )
        return None

    # An empty wrapper list simply means "no query recorded".
    if not holder:
        return None

    candidate = holder[0]
    if candidate is None:
        return None
    if isinstance(candidate, str):
        return candidate

    logger.warning(
        "task_info[1][0] is not a string (method_id=%r, source=%r): %r",
        _POLL_METHOD_ID,
        _POLL_SOURCE,
        type(candidate).__name__,
    )
    return None


def _extract_status_code(task_info: Any) -> int | None:
    """Read the integer status code from ``task_info[4]``.

    Returns the value only when it is a genuine ``int``. ``None`` is passed
    through silently; a ``bool`` or any other type is logged as a warning and
    mapped to ``None``.
    """
    code = safe_index(task_info, 4, method_id=_POLL_METHOD_ID, source=_POLL_SOURCE)
    if code is None:
        return None

    if isinstance(code, bool):
        # ``bool`` subclasses ``int``: without this guard ``True`` would
        # compare equal to status code 1 and masquerade as a real status.
        logger.warning(
            "task_info[4] is bool, not int (method_id=%r, source=%r)",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
        )
        return None

    if not isinstance(code, int):
        logger.warning(
            "task_info[4] is not an int (method_id=%r, source=%r): %r",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
            type(code).__name__,
        )
        return None

    return code


def _extract_sources_and_summary(task_info: Any) -> tuple[list[Any], str | None]:
    """Split ``task_info[3]`` into ``(sources_data, summary)``.

    ``task_info[3]`` is expected to be a list whose first element is the list
    of raw source rows and whose optional second element is the summary
    string.

    Returns ``([], None)`` when the bundle is missing, empty, or not a list
    (the last case is logged). A non-list, non-``None`` first element is
    logged and replaced by an empty source list. A second element that is not
    a string is ignored without logging.
    """
    payload = safe_index(task_info, 3, method_id=_POLL_METHOD_ID, source=_POLL_SOURCE)
    if payload is None:
        return [], None
    if not isinstance(payload, list):
        logger.warning(
            "task_info[3] is not a list (method_id=%r, source=%r): %r",
            _POLL_METHOD_ID,
            _POLL_SOURCE,
            type(payload).__name__,
        )
        return [], None
    if not payload:
        return [], None

    head = payload[0]
    rows: list[Any]
    if isinstance(head, list):
        rows = head
    else:
        rows = []
        # ``None`` just means "no sources yet"; anything else is a shape drift.
        if head is not None:
            logger.warning(
                "task_info[3][0] is not a list (method_id=%r, source=%r): %r",
                _POLL_METHOD_ID,
                _POLL_SOURCE,
                type(head).__name__,
            )

    tail = payload[1] if len(payload) > 1 else None
    summary_text: str | None = tail if isinstance(tail, str) else None

    return rows, summary_text


def _status_from_code(status_code: int | None) -> ResearchStatus:
    """Translate a wire status code into a :class:`ResearchStatus`.

    Mapping: ``1`` or a missing code -> ``IN_PROGRESS``; ``2`` or ``6``
    (deep research) -> ``COMPLETED``; every other code -> ``FAILED``.
    """
    # A missing code is treated like "still running".
    if status_code is None or status_code == 1:
        return ResearchStatus.IN_PROGRESS
    if status_code == 2 or status_code == 6:
        return ResearchStatus.COMPLETED
    # Unknown non-null codes are terminal failures so wait loops do not spin
    # until timeout after the backend rejects a task.
    return ResearchStatus.FAILED


def _parse_source_row(
    src: Any, *, task_id: str, report_found: bool = False
) -> tuple[ResearchSource | None, str]:
    """Parse one raw source row into ``(source, report_markdown)``.

    Recognised row shapes:

    * Fast research: ``[url, title, desc, type, ...]``
    * Deep research (legacy): ``[None, title, None, type, ..., [report_markdown]]``
    * Deep research (current): ``[None, [title, report_markdown], None, type, ...]``

    Args:
        src: The raw row; anything that is not a list of at least two items
            yields ``(None, "")``.
        task_id: Identifier stored on the resulting source as
            ``research_task_id``.
        report_found: When true, the legacy ``src[6]`` chunk lookup is skipped
            because the caller already holds a report.

    Returns:
        A ``ResearchSource`` (or ``None`` when the row has neither title nor
        URL) and the report markdown found on this row (``""`` when none).
    """
    if not isinstance(src, list):
        return None, ""
    if len(src) < 2:
        return None, ""

    # ``src[3]`` is the authoritative result type when the row carries it.
    if len(src) > 3:
        kind = parse_result_type(src[3])
    else:
        kind = RESEARCH_RESULT_TYPE_WEB

    first, second = src[0], src[1]
    title = ""
    url = ""
    embedded_report = ""

    if first is None:
        # Deep-research row: no URL, title (and maybe the report) in slot 1.
        is_deep_row = False
        if (
            isinstance(second, list)
            and len(second) >= 2
            and isinstance(second[0], str)
            and isinstance(second[1], str)
        ):
            title, embedded_report = second[0], second[1]
            is_deep_row = True
        elif isinstance(second, str):
            title = second
            is_deep_row = True
        # A recognised deep-research row defaulting to "web" is really a report.
        if is_deep_row and kind == RESEARCH_RESULT_TYPE_WEB:
            kind = RESEARCH_RESULT_TYPE_REPORT
    elif isinstance(first, str) or len(src) >= 3:
        # Fast-research row: take whichever of url/title are real strings.
        if isinstance(first, str):
            url = first
        if isinstance(second, str):
            title = second

    entry: ResearchSource | None = None
    if title or url:
        entry = ResearchSource(
            url=url,
            title=title,
            result_type=kind,
            research_task_id=task_id,
        )

    # Prefer the report embedded in slot 1; otherwise fall back to the legacy
    # chunk list, but only while the caller has not found a report yet.
    report_text = embedded_report
    if not report_text and not report_found:
        report_text = extract_legacy_report_chunks(src)

    if entry is not None and report_text:
        entry = entry.with_report_markdown(report_text)

    return entry, report_text


def _unwrap_poll_result(result: Any) -> list[Any]:
    """Return the list of task rows contained in a raw poll result.

    Falsy or non-list input yields ``[]``. When the first element is itself a
    non-empty list whose first element is a list, the result is treated as
    wrapped one level deep and ``result[0]`` is returned; otherwise ``result``
    is returned unchanged.
    """
    if not result:
        return []
    if not isinstance(result, list):
        return []

    outer = result[0]
    if isinstance(outer, list) and len(outer) > 0:
        # A list-of-lists in slot 0 means the task rows sit one level down.
        if isinstance(outer[0], list):
            return outer
    return result


def parse_research_task_models(result: Any) -> list[ResearchTask]:
    """Convert a raw ``POLL_RESEARCH`` result into ``ResearchTask`` models.

    Rows that are not lists, or that lack a string task id or a list
    task-info payload, are skipped. For each remaining row the query text,
    sources, summary and status are extracted. The task-level report is the
    first non-empty report produced by any of its source rows.
    """
    models: list[ResearchTask] = []

    for row in _unwrap_poll_result(result):
        if not isinstance(row, list):
            continue

        # Both extractors run before the skip check so each can log its own
        # shape warning.
        task_id = _extract_task_id(row)
        task_info = _extract_task_info(row)
        if task_id is None or task_info is None:
            continue

        query_text = _extract_query_text(task_info) or ""
        raw_sources, summary_opt = _extract_sources_and_summary(task_info)
        status = _status_from_code(_extract_status_code(task_info))

        collected: list[ResearchSource] = []
        task_report = ""
        for raw_source in raw_sources:
            source_model, row_report = _parse_source_row(
                raw_source, task_id=task_id, report_found=task_report != ""
            )
            if source_model is not None:
                collected.append(source_model)
            # First report wins; later rows never overwrite it.
            if row_report and not task_report:
                task_report = row_report

        task_model = ResearchTask(
            task_id=task_id,
            status=status,
            query=query_text,
            sources=tuple(collected),
            summary=summary_opt or "",
            report=task_report,
        )
        models.append(task_model)

    return models


def parse_research_tasks(result: Any) -> list[dict[str, Any]]:
    """Convert a raw ``POLL_RESEARCH`` result into compatibility dictionaries.

    The result is parsed with :func:`parse_research_task_models` and each
    model is converted through its ``_to_task_dict()`` method. Every dict
    carries the historical per-task shape (``task_id`` / ``status`` /
    ``query`` / ``sources`` / ``summary`` / ``report``). The top-level
    ``tasks`` sibling key belongs to :meth:`ResearchAPI.poll`'s result, not to
    these individual task dicts.
    """
    task_dicts: list[dict[str, Any]] = []
    for model in parse_research_task_models(result):
        task_dicts.append(model._to_task_dict())
    return task_dicts