Skip to content

Commit f7efd02

Browse files
committed
Improve readability in full-text extraction paths.
Wrap long extraction logic for readability and return typed API responses so the on-demand extraction endpoint passes strict type checks.

Made-with: Cursor
1 parent 5742a8e commit f7efd02

3 files changed

Lines changed: 24 additions & 9 deletions

File tree

backend/apps/api/glean_api/routers/entries.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
from contextlib import suppress
8-
from typing import Annotated
8+
from typing import Annotated, cast
99

1010
from arq.connections import ArqRedis
1111
from fastapi import APIRouter, Depends, HTTPException, Query, status
@@ -191,15 +191,19 @@ async def extract_entry_fulltext(
191191

192192
try:
193193
job_result = await job.result(timeout=30)
194-
status_text = str(job_result.get("status", "unknown")) if isinstance(job_result, dict) else "unknown"
194+
if isinstance(job_result, dict):
195+
job_result_dict = cast(dict[str, object], job_result)
196+
status_text = str(job_result_dict.get("status", "unknown"))
197+
else:
198+
status_text = "unknown"
195199
except TimeoutError:
196200
status_text = "queued"
197201

198202
if status_text == "updated":
199203
updated_entry = await entry_service.get_entry(entry_id, current_user.id)
200-
return {"status": status_text, "entry": updated_entry}
204+
return FullTextExtractionResponse(status=status_text, entry=updated_entry)
201205

202-
return {"status": status_text, "entry": None}
206+
return FullTextExtractionResponse(status=status_text, entry=None)
203207

204208

205209
# M3: Preference signal endpoints

backend/apps/worker/glean_worker/tasks/entry_fulltext.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
logger = get_logger(__name__)
1515

1616

17-
async def extract_entry_fulltext(ctx: dict[str, Any], user_id: str, entry_id: str) -> dict[str, str]:
17+
async def extract_entry_fulltext(
18+
ctx: dict[str, Any], user_id: str, entry_id: str
19+
) -> dict[str, str]:
1820
"""
1921
Extract full article content for a specific entry and persist it.
2022
@@ -36,17 +38,24 @@ async def extract_entry_fulltext(ctx: dict[str, Any], user_id: str, entry_id: st
3638
entry = result.scalar_one_or_none()
3739

3840
if not entry:
39-
logger.warning("Entry not found or inaccessible", extra={"entry_id": entry_id, "user_id": user_id})
41+
logger.warning(
42+
"Entry not found or inaccessible", extra={"entry_id": entry_id, "user_id": user_id}
43+
)
4044
return {"status": "not_found"}
4145

4246
if not entry.url:
4347
logger.warning("Entry has no URL for extraction", extra={"entry_id": entry_id})
4448
return {"status": "no_url"}
4549

46-
logger.info("Starting on-demand full-text extraction", extra={"entry_id": entry_id, "url": entry.url})
50+
logger.info(
51+
"Starting on-demand full-text extraction",
52+
extra={"entry_id": entry_id, "url": entry.url},
53+
)
4754
extracted_content = await fetch_and_extract_fulltext(entry.url)
4855
if not extracted_content:
49-
logger.warning("Full-text extraction returned empty content", extra={"entry_id": entry_id})
56+
logger.warning(
57+
"Full-text extraction returned empty content", extra={"entry_id": entry_id}
58+
)
5059
return {"status": "empty"}
5160

5261
entry.readability_content = extracted_content

backend/packages/rss/glean_rss/extractor.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,9 @@ async def extract_fulltext(html: str, url: str | None = None) -> str | None:
194194
# pick the largest content-like container and sanitize it.
195195
try:
196196
soup = await asyncio.to_thread(BeautifulSoup, html, "html.parser")
197-
candidates = soup.select("article, main, [role='main'], .post-content, .entry-content, .article-body")
197+
candidates = soup.select(
198+
"article, main, [role='main'], .post-content, .entry-content, .article-body"
199+
)
198200
best_html = ""
199201
best_len = 0
200202
for candidate in candidates:

0 commit comments

Comments
 (0)