-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathretry_failed.py
More file actions
126 lines (108 loc) · 4.57 KB
/
retry_failed.py
File metadata and controls
126 lines (108 loc) · 4.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
"""Retry failed downloads with lower concurrency and retries.
Also expand pool to reach 100+ successful downloads."""
import json
import re
import requests
import time
import random
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
# Root folder: holds the paper metadata JSON and the result manifest.
OUT_DIR = Path("/home/juke/naturalistic_fmri_pdfs")
# Sub-folder the downloaded PDF files are stored in.
PDF_DIR = OUT_DIR.joinpath("pdfs")
# Substrings matched against the lower-cased journal name; any hit excludes
# the paper, so entries here must themselves be lower-case.
EXCLUDE = ["brain sciences"]
def is_excluded(journal):
    """Tell whether a paper from *journal* should be left out.

    A missing or empty journal name counts as excluded. Otherwise the name
    is lower-cased and excluded when it contains any substring listed in
    ``EXCLUDE``.
    """
    if journal:
        lowered = journal.lower()
        for needle in EXCLUDE:
            if needle in lowered:
                return True
        return False
    # No journal information at all: treat as excluded.
    return True
def sanitize_filename(name, maxlen=150):
    """Turn arbitrary text into a string that is safe to use in a filename.

    Steps: drop ``<...>`` markup tags, replace every character that is not a
    word character, hyphen, dot or space with ``_``, collapse whitespace runs
    into a single ``_``, truncate to *maxlen* characters, and strip leading
    and trailing ``_`` / ``.``.

    Args:
        name: Text to sanitise; may be ``None`` or empty.
        maxlen: Maximum length of the result before edge stripping.

    Returns:
        The sanitised string, or ``"untitled"`` when *name* is empty or
        nothing usable remains after sanitising (e.g. ``"???"``).
    """
    if not name:
        return "untitled"
    name = re.sub(r'<[^>]+>', '', name)
    name = re.sub(r'[^\w\-. ]', '_', name)
    name = re.sub(r'\s+', '_', name)
    cleaned = name[:maxlen].strip('_.')
    # Input made only of markup/punctuation strips down to "", which would
    # produce a nameless file component; use the same fallback as above.
    return cleaned or "untitled"
def make_filename(paper, idx):
    """Build the PDF filename for a paper record.

    The name has the form ``<idx>_<year>_<author>_<title>.pdf`` where *idx*
    is zero-padded to three digits, the author is the last whitespace-
    separated token of the first entry in ``paper["authors"]`` and the title
    is sanitised and cut to 80 characters.

    Args:
        paper: Mapping read for the keys ``"year"``, ``"authors"`` and
            ``"title"``; each may be missing or empty.
        idx: Integer index used as the filename prefix.

    Returns:
        The filename (no directory part).
    """
    year = paper.get("year") or "NA"
    first_author = "Unknown"
    authors = paper.get("authors")
    if authors:
        # An empty, whitespace-only or None first entry has no tokens;
        # keep "Unknown" instead of failing on [-1] of an empty list.
        name_tokens = (authors[0] or "").split()
        if name_tokens:
            first_author = name_tokens[-1]
    title = paper.get("title") or "untitled"
    title_short = sanitize_filename(title, maxlen=80)
    return f"{idx:03d}_{year}_{sanitize_filename(first_author, 30)}_{title_short}.pdf"
def download_one(paper, idx, max_retries=3):
    """Download the PDF for one paper and store it in ``PDF_DIR``.

    The PDF is requested from europepmc.org using the paper's PMC id. A
    response is accepted when it has status 200 and either declares an
    ``application/pdf`` content type or its body starts with ``%PDF``.
    Statuses 429/500/502/503/504 and raised exceptions are retried with a
    growing pause; any other response (including a 200 that is not a PDF,
    reported as ``HTTP_200``) stops immediately.

    Args:
        paper: Mapping read for ``"pmc_id"`` and ``"pmid"`` (plus whatever
            ``make_filename`` reads).
        idx: Integer index; used in the filename and echoed in the result.
        max_retries: Maximum number of HTTP attempts.

    Returns:
        Tuple ``(idx, pmid, status, path)``. *status* is one of
        ``"NO_PMC"``, ``"SKIP_EXISTS"``, ``"OK"``, ``"HTTP_<code>"``,
        ``"ERROR_<ExceptionName>"`` or ``"FAIL"`` (no attempt was made).
        *path* is the file path as ``str`` for ``"OK"``/``"SKIP_EXISTS"``
        and ``None`` otherwise.
    """
    pmid = paper.get("pmid")
    pmc_id = paper.get("pmc_id")
    if not pmc_id:
        return (idx, pmid, "NO_PMC", None)

    fpath = PDF_DIR / make_filename(paper, idx)
    # Files of 10000 bytes or less are not trusted as complete PDFs.
    if fpath.exists() and fpath.stat().st_size > 10000:
        return (idx, pmid, "SKIP_EXISTS", str(fpath))

    pmc = pmc_id.replace("PMC", "")
    url = f"https://europepmc.org/articles/PMC{pmc}?pdf=render"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
        "Accept": "application/pdf,text/html,*/*;q=0.9",
        "Accept-Language": "en-US,en;q=0.5",
    }
    retryable_statuses = (500, 502, 503, 504, 429)

    last_error = None
    for attempt in range(max_retries):
        # Pausing is only useful when another attempt will follow.
        attempts_remain = attempt < max_retries - 1
        try:
            # Small random delay so the worker threads do not hit the server in lockstep.
            time.sleep(0.2 + random.random() * 0.5)
            r = requests.get(url, headers=headers, timeout=90, allow_redirects=True)
            if r.status_code == 200 and (
                r.headers.get("Content-Type", "").startswith("application/pdf")
                or r.content[:4] == b"%PDF"
            ):
                fpath.parent.mkdir(parents=True, exist_ok=True)
                # Write to a sibling temp file and rename, so an interrupted
                # write never leaves a truncated "*.pdf" that the size check
                # above would later accept as already downloaded.
                tmp_path = fpath.with_name(fpath.name + ".part")
                with open(tmp_path, "wb") as f:
                    f.write(r.content)
                tmp_path.replace(fpath)
                return (idx, pmid, "OK", str(fpath))
            last_error = f"HTTP_{r.status_code}"
            if r.status_code not in retryable_statuses:
                break  # non-retryable
            if attempts_remain:
                time.sleep(2 + attempt * 3)  # backoff
        except Exception as e:
            # Deliberately broad: this runs in a worker thread and reports
            # every failure through the status field instead of raising.
            last_error = f"ERROR_{type(e).__name__}"
            if attempts_remain:
                time.sleep(2 + attempt)
    return (idx, pmid, last_error or "FAIL", None)
def main():
    """Run one download pass over the newest PMC-available papers.

    Reads ``papers_all.json`` from ``OUT_DIR``, keeps papers that are not
    excluded by journal and have ``has_pmc`` set, sorts them newest first,
    and submits the first 200 to ``download_one`` on three worker threads.
    Progress and a summary are printed, and every result tuple is written to
    ``retry_manifest.json`` in ``OUT_DIR``.
    """
    # Load all PMC-available papers from filtered set
    papers_path = OUT_DIR / "papers_all.json"
    all_papers = json.loads(papers_path.read_text(encoding="utf-8"))
    filtered = [
        p for p in all_papers
        if not is_excluded(p.get("journal")) and p.get("has_pmc")
    ]
    # Newest first. The resulting position becomes the filename prefix, so
    # the ordering (including the stable tie order) must not change.
    filtered.sort(key=lambda r: -int(r.get("year") or 0))
    print(f"Total PMC-available filtered papers: {len(filtered)}")

    # Check which are already downloaded
    existing_count = sum(
        1 for f in PDF_DIR.glob("*.pdf") if f.stat().st_size > 10000
    )
    print(f"Existing valid PDFs: {existing_count}")

    # Build list of candidates to try (all filtered papers up to 200)
    candidates = filtered[:200]
    print(f"Candidates to attempt: {len(candidates)}")

    # Download with low concurrency + retries
    success_statuses = ("OK", "SKIP_EXISTS")
    results = []
    with ThreadPoolExecutor(max_workers=3) as ex:
        futures = {
            ex.submit(download_one, p, i + 1): (i + 1, p)
            for i, p in enumerate(candidates)
        }
        for done, fut in enumerate(as_completed(futures), start=1):
            idx, p = futures[fut]
            try:
                res = fut.result()
            except Exception as e:
                # result() re-raises whatever the worker raised; record it as
                # a failed entry so one bad paper cannot abort the whole run
                # and lose the manifest.
                res = (idx, p.get("pmid"), f"ERROR_{type(e).__name__}", None)
            results.append(res)
            status = res[2]
            marker = "✓" if status in success_statuses else "✗"
            print(f" {marker} [{done:3d}/{len(candidates)}] [{res[0]:03d}] → {status}")

    ok = sum(1 for r in results if r[2] in success_statuses)
    failed = len(results) - ok
    print("\n=== Retry Summary ===")
    print(f"Successful: {ok}")
    print(f"Failed: {failed}")

    # Count actual PDFs
    valid_pdfs = [f for f in PDF_DIR.glob("*.pdf") if f.stat().st_size > 10000]
    print(f"Total valid PDFs in folder: {len(valid_pdfs)}")

    # Results are stored in completion order; tuples are serialised as lists.
    with open(OUT_DIR / "retry_manifest.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
# Start the download pass only when executed as a script, so importing this
# module has no side effects.
if __name__ == "__main__":
    main()