Skip to content

Commit b687873

Browse files
Soli22declaude
andcommitted
POC: OCR/PDF extraction layer for issue #11 external fact signal pipeline
Minimal proof-of-concept showing pypdf + Federal Register API e2e pipeline. Findings: - Pipeline works e2e (FR API -> public-inspection PDF -> pypdf -> text) - BUT modern SEC 8-K = 0 PDFs (HTM/XBRL only) - Modern FR PDFs 100% have text layer - => OCR真实价值在 CourtListener early dockets / FDA Form 483 / EPA enforcement / 外国监管文件,不是 spec 列的现代主流 source - 修正 #11 回复角度:OCR 是 PR 2 的 conditional fallback,不是主体 不开 PR,等 WW-shan 在 #11 回复后再决定。 Co-Authored-By: Claude <noreply@anthropic.com>
1 parent e1f2e17 commit b687873

3 files changed

Lines changed: 462 additions & 0 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,7 @@ var/
3434
*.log
3535

3636
.deepseek/
37+
38+
# OCR POC cache (downloaded PDFs)
39+
docs/ocr_poc/cache/
40+
docs/ocr_poc/result_*.json

docs/ocr_poc/poc_extract.py

Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
#!/usr/bin/env python3
2+
"""
3+
OCR POC for #11 external fact signal pipeline
4+
=============================================
5+
6+
最小可演示流程:
7+
SEC EDGAR API → 找一个含 PDF exhibit 的 8-K filing
8+
→ 下载 PDF → pypdf 抽文本(text-layer PDFs)
9+
→ 用 LLM 抽结构化 fact(POC 里只 stub,不真调 LLM)
10+
→ 报告 text-only API 没拿到的 fact
11+
12+
跑法:
13+
cd ~/jz_code/poly_strategy
14+
.venv/bin/python docs/ocr_poc/poc_extract.py
15+
16+
Output: docs/ocr_poc/result_<timestamp>.json, plus the raw PDFs under the cache/ directory
17+
"""
18+
19+
from __future__ import annotations
20+
21+
import datetime as dt
22+
import json
23+
import re
24+
import sys
25+
from pathlib import Path
26+
27+
import requests
28+
from pypdf import PdfReader
29+
30+
# SEC asks clients to identify themselves with a User-Agent that carries a
# contact e-mail address.
DOWNLOAD_HEADERS = {
    "User-Agent": "OCR-POC research (jingheng_zhang@outlook.com)",
    "Accept-Encoding": "gzip, deflate",
}

# Same identification headers, with the Host pinned to data.sec.gov.
SEC_HEADERS = {**DOWNLOAD_HEADERS, "Host": "data.sec.gov"}

# Downloaded PDFs are cached in a "cache" directory next to this script; the
# directory is created on import if it does not exist yet.
ROOT = Path(__file__).parent
CACHE = ROOT.joinpath("cache")
CACHE.mkdir(exist_ok=True)
45+
46+
47+
def fetch_recent_8k(cik: str, max_filings: int = 20) -> list[dict]:
    """Return metadata for the 8-K filings among a company's recent filings.

    Fetches the EDGAR submissions JSON for ``cik`` (zero-padded to 10 digits),
    looks at the first ``max_filings`` rows of its ``filings.recent`` table and
    keeps only the rows whose form is exactly ``"8-K"``.

    Args:
        cik: Company CIK, with or without leading zeros.
        max_filings: How many rows of the table to scan (not how many 8-Ks to
            return).

    Returns:
        One dict per matching row with the keys ``accession``, ``form``,
        ``filing_date``, ``report_date``, ``primary_doc`` and
        ``primary_doc_desc``, in table order.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    endpoint = f"https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json"
    response = requests.get(endpoint, headers=SEC_HEADERS, timeout=15)
    response.raise_for_status()
    table = response.json()["filings"]["recent"]

    # The table is column-oriented: one parallel list per field, so a filing
    # is addressed by the same index in every column.
    scan_count = min(max_filings, len(table["accessionNumber"]))
    return [
        {
            "accession": table["accessionNumber"][idx],
            "form": table["form"][idx],
            "filing_date": table["filingDate"][idx],
            "report_date": table["reportDate"][idx],
            "primary_doc": table["primaryDocument"][idx],
            "primary_doc_desc": table["primaryDocDescription"][idx],
        }
        for idx in range(scan_count)
        if table["form"][idx] == "8-K"
    ]
69+
70+
71+
def list_filing_attachments(cik: str, accession: str) -> list[dict]:
    """List the .pdf/.htm/.txt files linked from a filing's EDGAR folder index.

    Args:
        cik: Company CIK; leading zeros are stripped for the archive path.
        accession: Accession number; dashes are removed for the archive path.

    Returns:
        One ``{"name": ..., "url": ...}`` dict per distinct matching href,
        ordered by href. ``name`` is the last path segment of the href.

    Raises:
        ValueError: If ``cik`` is not an integer string.
        requests.RequestException: On network failure or a non-2xx response.
    """
    folder = accession.replace("-", "")
    cik_plain = str(int(cik))
    index_url = f"https://www.sec.gov/Archives/edgar/data/{cik_plain}/{folder}/"
    r = requests.get(index_url, headers=DOWNLOAD_HEADERS, timeout=15)
    r.raise_for_status()

    hrefs = re.findall(r'href="([^"]+\.(?:pdf|htm|txt))"', r.text, re.IGNORECASE)
    attachments = []
    for href in sorted(set(hrefs)):
        # Root-relative links hang off the site root; anything else is treated
        # as relative to the folder index itself.
        if href.startswith("/"):
            file_url = f"https://www.sec.gov{href}"
        else:
            file_url = index_url + href
        attachments.append({"name": href.split("/")[-1], "url": file_url})
    return attachments
91+
92+
93+
def download_pdf(url: str, dest: Path) -> Path:
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    An existing ``dest`` is treated as a cache hit and no request is made.

    Returns:
        ``dest`` in both cases.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """
    if not dest.exists():
        response = requests.get(url, headers=DOWNLOAD_HEADERS, timeout=30)
        response.raise_for_status()
        dest.write_bytes(response.content)
    return dest
100+
101+
102+
def extract_pdf_text(pdf_path: Path) -> dict:
    """Extract per-page text from a PDF and judge whether it has a text layer.

    Args:
        pdf_path: Location of the PDF on disk.

    Returns:
        A dict with:
          * ``pages``: number of pages.
          * ``text``: page texts joined by blank lines; a page whose
            extraction raised contributes a ``[page extract error: ...]``
            placeholder instead.
          * ``has_text_layer``: True when more than 100 characters of text
            were actually extracted (very little text most likely means a
            scanned document).
          * ``total_chars``: length of ``text``.
        If the file cannot be opened or parsed at all, the same keys are
        returned empty/zero together with an ``error`` message; the error is
        reported in the dict rather than raised.
    """
    try:
        reader = PdfReader(str(pdf_path))
        page_count = len(reader.pages)
        report_parts = []     # everything that ends up in "text"
        extracted_parts = []  # only text that really came out of the PDF
        for page in reader.pages:
            try:
                page_text = page.extract_text() or ""
            except Exception as e:
                # Best effort: one unreadable page must not sink the document.
                report_parts.append(f"[page extract error: {e}]")
                continue
            report_parts.append(page_text)
            extracted_parts.append(page_text)
        full = "\n\n".join(report_parts)
        # Error placeholders are not document text; counting them could make
        # a scan whose pages fail to extract look like it has a text layer.
        has_layer = len("\n\n".join(extracted_parts).strip()) > 100
        return {
            "pages": page_count,
            "text": full,
            "has_text_layer": has_layer,
            "total_chars": len(full),
        }
    except Exception as e:
        return {"error": str(e), "pages": 0, "text": "", "has_text_layer": False, "total_chars": 0}
124+
125+
126+
def stub_llm_fact_extraction(text: str, company: str, filing_date: str) -> dict:
    """Regex placeholder for the LLM fact-extraction step.

    The POC does not call an LLM (to avoid token cost). In production this
    step is meant to call OpenRouter (gemini-flash with a qwen fallback) and
    extract facts following the spec's structured_fact schema; here simple
    regex heuristics over the first 1500 characters only prove that the
    pipeline is wired end to end.

    Args:
        text: Extracted document text.
        company: Copied into the result unchanged.
        filing_date: Copied into the result unchanged.

    Returns:
        A dict with ``schema_version``, ``company``, ``filing_date``,
        ``_method`` (marks the result as a stub), ``money_mentions`` (up to 5
        dollar amounts, in order of appearance), ``event_keywords`` (up to 5
        distinct lower-cased keywords, in order of first appearance) and
        ``first_1500_chars``.
    """
    excerpt = text[:1500]
    # A unit is only captured when it is a whole word, so "$5 more" yields
    # "$5" rather than "$5 m"; trailing whitespace and commas are left out.
    money_mentions = re.findall(
        r"\$\s*\d+(?:,\d+)*(?:\.\d+)?(?:\s*(?:million|billion|thousand|M|B)\b)?",
        excerpt,
        re.IGNORECASE,
    )[:5]
    keyword_hits = re.findall(
        r"\b(acquir\w+|merger|settle\w*|fine\w*|charged|indict\w+|approv\w+|recall\w+|bankruptc\w+|investigat\w+|dividend|buyback)\b",
        excerpt, re.IGNORECASE,
    )
    # De-duplicate before capping, and keep first-seen order so that repeated
    # runs over the same text produce identical output.
    event_keywords = list(dict.fromkeys(hit.lower() for hit in keyword_hits))[:5]
    return {
        "schema_version": 1,
        "company": company,
        "filing_date": filing_date,
        "_method": "regex_stub_for_poc",  # flags this result as a placeholder
        "money_mentions": money_mentions,
        "event_keywords": event_keywords,
        "first_1500_chars": excerpt,
    }
148+
149+
150+
def test_known_pdf(url: str, label: str) -> dict:
    """Check the pypdf extraction path against one known PDF URL.

    The file is cached as ``known_<label>.pdf`` in the cache directory and is
    only downloaded when that file does not exist yet.

    Returns:
        On a download failure: ``label``, ``url`` and an ``error`` message.
        Otherwise: ``label``, ``url``, ``size_bytes``, ``pages``,
        ``total_chars``, ``has_text_layer`` and ``first_300_chars``.
    """
    identity = {"label": label, "url": url}
    target = CACHE / f"known_{label}.pdf"

    if not target.exists():
        try:
            response = requests.get(
                url, headers=DOWNLOAD_HEADERS, timeout=30, allow_redirects=True
            )
            response.raise_for_status()
            target.write_bytes(response.content)
        except Exception as exc:
            return {**identity, "error": f"download failed: {exc}"}

    extraction = extract_pdf_text(target)
    return {
        **identity,
        "size_bytes": target.stat().st_size,
        "pages": extraction["pages"],
        "total_chars": extraction["total_chars"],
        "has_text_layer": extraction["has_text_layer"],
        "first_300_chars": extraction["text"][:300],
    }
170+
171+
172+
# === Federal Register: source family spec 明确列的 ===
173+
174+
def fetch_fr_recent_docs(limit: int = 5, agency: str | None = None) -> list[dict]:
    """Fetch one page of Federal Register documents from the public API.

    The request uses the API's default listing, which the original note
    describes as the most recent documents (Federal Register is one of the
    three core sources named in the spec).

    Args:
        limit: Sent as ``per_page``.
        agency: Optional value for the ``conditions[agencies][]`` filter
            (presumably an agency slug; confirm against the API docs).

    Returns:
        One dict per result with the keys ``doc_num``, ``title``, ``type``,
        ``pub_date``, ``pdf_url``, ``pi_pdf_url`` (None when absent) and
        ``agencies`` (list of agency names). Empty when the response has no
        ``results``.

    Raises:
        requests.RequestException: On network failure or a non-2xx response.
    """

    def _agency_names(doc: dict) -> list[str]:
        """Collect a display name for each agency entry that has one."""
        names = []
        for entry in doc.get("agencies", []):
            # NOTE(review): some entries appear to carry only "raw_name";
            # fall back to it instead of raising KeyError.
            label = entry.get("name") or entry.get("raw_name")
            if label:
                names.append(label)
        return names

    # Let requests build the query string so that values are URL-encoded.
    query = {"per_page": limit}
    if agency:
        query["conditions[agencies][]"] = agency
    r = requests.get(
        "https://www.federalregister.gov/api/v1/documents.json",
        params=query,
        headers=DOWNLOAD_HEADERS,
        timeout=15,
    )
    r.raise_for_status()
    data = r.json()
    return [
        {
            "doc_num": d["document_number"],
            "title": d["title"],
            "type": d["type"],
            "pub_date": d["publication_date"],
            "pdf_url": d["pdf_url"],
            "pi_pdf_url": d.get("public_inspection_pdf_url"),
            "agencies": _agency_names(d),
        }
        # An empty result set is treated as "no documents", not as an error.
        for d in data.get("results", [])
    ]
195+
196+
197+
def fetch_fr_pdf_with_fallback(doc: dict) -> dict:
    """Download a Federal Register document's PDF, trying two URLs in order.

    The public-inspection URL (``pi_pdf_url``) is tried first and the regular
    ``pdf_url`` second. The file is cached as ``fr_<doc_num>.pdf`` in the
    cache directory; an existing file short-circuits the download.

    Args:
        doc: A document dict with ``doc_num`` and, optionally, ``pi_pdf_url``
            and ``pdf_url``.

    Returns:
        On success ``{"path": Path, "url_used": str}`` (``url_used`` is
        ``"(cached)"`` on a cache hit). When every candidate fails or none is
        set: ``{"error": ..., "tried": [...], "errors": [...]}``, where
        ``errors`` holds one ``{"url", "error"}`` entry per failed attempt.
    """
    label = doc["doc_num"]
    dest = CACHE / f"fr_{label}.pdf"
    if dest.exists():
        return {"path": dest, "url_used": "(cached)"}

    candidates = [doc.get("pi_pdf_url"), doc.get("pdf_url")]
    failures = []
    for url in candidates:
        if not url:
            continue
        try:
            r = requests.get(url, headers=DOWNLOAD_HEADERS, timeout=30, allow_redirects=True)
            r.raise_for_status()
            dest.write_bytes(r.content)
        except Exception as exc:
            # Deliberately best-effort: move on to the next candidate, but
            # keep the reason so the caller can report why it failed.
            failures.append({"url": url, "error": str(exc)})
            continue
        return {"path": dest, "url_used": url}
    return {"error": "all PDF URLs failed", "tried": candidates, "errors": failures}
216+
217+
218+
def _scan_company_8k(cik: str, name: str) -> dict | None:
    """Probe one company's first listed 8-K for a PDF exhibit; None if a step failed."""
    print(f"\n=== {name} (CIK {cik}) ===")
    try:
        filings = fetch_recent_8k(cik, max_filings=30)
    except Exception as e:
        print(f" ⚠️ 无法拉 filing list: {e}")
        return None
    if not filings:
        print(" 无 8-K")
        return None

    # Only the first 8-K in the list is examined.
    f = filings[0]
    print(f" 最近 8-K: {f['filing_date']} {f['primary_doc']}")
    try:
        attachments = list_filing_attachments(cik, f["accession"])
    except Exception as e:
        print(f" ⚠️ 无法列 attachments: {e}")
        return None

    pdfs = [a for a in attachments if a["name"].lower().endswith(".pdf")]
    non_pdfs = [a for a in attachments if not a["name"].lower().endswith(".pdf")]
    print(f" attachments: {len(attachments)} 个({len(pdfs)} PDF + {len(non_pdfs)} 其他)")

    if not pdfs:
        # Record the miss so the report shows which filings had no PDF.
        return {
            "company": name,
            "cik": cik,
            "filing": f,
            "pdf_count": 0,
            "skipped_reason": "no_pdf_attachment",
            "attachments": [a["name"] for a in attachments],
        }

    pdf = pdfs[0]
    print(f" 下载 PDF: {pdf['name']} from {pdf['url']}")
    try:
        dest = download_pdf(pdf["url"], CACHE / f"{cik}_{f['accession']}_{pdf['name']}")
    except Exception as e:
        print(f" ⚠️ 下载失败: {e}")
        return None
    print(f" 存到: {dest} ({dest.stat().st_size:,} bytes)")
    extracted = extract_pdf_text(dest)
    print(f" pages: {extracted['pages']}, chars: {extracted['total_chars']}, text_layer: {extracted['has_text_layer']}")

    fact = stub_llm_fact_extraction(extracted["text"], name, f["filing_date"])
    return {
        "company": name,
        "cik": cik,
        "filing": f,
        "attachments_summary": {
            "total": len(attachments),
            "pdf_count": len(pdfs),
            "other_count": len(non_pdfs),
            "first_pdf": pdf["name"],
        },
        "extraction": {
            "pages": extracted["pages"],
            "total_chars": extracted["total_chars"],
            "has_text_layer": extracted["has_text_layer"],
            "first_300_chars": extracted["text"][:300],
        },
        "stub_fact": fact,
    }


def _scan_federal_register(limit: int = 3) -> list[dict]:
    """Download and extract Federal Register PDFs; one result entry per document."""
    fr_results = []
    try:
        fr_docs = fetch_fr_recent_docs(limit=limit)
        for doc in fr_docs:
            print(f" --- [{doc['type']}] {doc['title'][:80]}")
            print(f" pub: {doc['pub_date']} agency: {(doc['agencies'] or ['?'])[0]}")
            dl = fetch_fr_pdf_with_fallback(doc)
            if "error" in dl:
                print(f" ⚠️ 下载失败: {dl}")
                fr_results.append({"doc": doc, "error": dl})
                continue
            ext = extract_pdf_text(dl["path"])
            print(f" ✅ pages={ext['pages']} chars={ext['total_chars']} text_layer={ext['has_text_layer']}")
            fact = stub_llm_fact_extraction(ext["text"], doc["agencies"][0] if doc["agencies"] else "FR", doc["pub_date"])
            fr_results.append({
                "doc": doc,
                "downloaded_from": dl["url_used"],
                "extraction": {
                    "pages": ext["pages"],
                    "total_chars": ext["total_chars"],
                    "has_text_layer": ext["has_text_layer"],
                    "first_300_chars": ext["text"][:300],
                },
                "stub_fact": fact,
            })
    except Exception as e:
        # Results gathered before the failure are kept; the error is appended.
        print(f" ⚠️ FR API 异常: {e}")
        fr_results.append({"error": str(e)})
    return fr_results


def main():
    """Run the POC end to end and write a timestamped JSON report.

    Scans the first listed 8-K of each hard-coded company for a PDF exhibit,
    then downloads and extracts three Federal Register PDFs, and writes both
    result lists to ``result_<YYYYmmdd_HHMMSS>.json`` next to this script.

    Returns:
        ``(results, fr_results)``: the SEC 8-K scan results and the Federal
        Register extraction results.
    """
    # Probe targets.
    # NOTE(review): the original comment expected these 8-Ks to carry PDF
    # exhibits frequently; confirm, since a filing may have none.
    targets = [
        ("0000320193", "Apple Inc"),
        ("0001318605", "Tesla Inc"),
    ]

    results = []
    for cik, name in targets:
        entry = _scan_company_8k(cik, name)
        if entry is not None:
            results.append(entry)

    # === Federal Register PDF path (a source family listed in the spec) ===
    print("\n=== Federal Register PDF 抽取 ===")
    fr_results = _scan_federal_register(limit=3)

    # Write the report. The JSON keeps non-ASCII characters as-is
    # (ensure_ascii=False), so the file encoding is pinned to UTF-8 instead of
    # depending on the platform's locale default.
    out = ROOT / f"result_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    out.write_text(
        json.dumps(
            {"sec_8k_scan": results, "federal_register_pdf_extraction": fr_results},
            ensure_ascii=False, indent=2,
        ),
        encoding="utf-8",
    )
    print(f"\n=== 写 {out} ===")
    return results, fr_results
330+
331+
332+
if __name__ == "__main__":
333+
main()

0 commit comments

Comments
 (0)