-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathselect_100.py
More file actions
66 lines (55 loc) · 2.22 KB
/
select_100.py
File metadata and controls
66 lines (55 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
"""Select top 100 naturalistic fMRI PDFs from downloaded set based on title relevance."""
import json
import re
from pathlib import Path
# Directory that main() scans for "*.pdf" files.
PDF_DIR = Path("/home/juke/naturalistic_fmri_pdfs/pdfs")

# Keyword lists used by score_title(). Each entry is matched as a plain,
# case-insensitive substring of the filename, so leading/trailing spaces
# inside an entry (e.g. "isc ", " film") are part of the match.
#
# NOTE(review): entries overlap on purpose or by accident -- "narrative" is in
# both STRONG and MODERATE (+3 and +1), and "naturalistic stimul" always
# co-occurs with "naturalistic" (+3 twice). Confirm the intended weighting
# before de-duplicating, since that would change the selection.

# Each hit adds 3 to the relevance score.
STRONG = [
    "naturalistic",
    "movie watching",
    "movie-watching",
    "narrative",
    "inter-subject",
    "intersubject",
    "isc ",
    "story listening",
    "story comprehension",
    "free viewing",
    "naturalistic stimul",
]

# Each hit adds 1 to the relevance score.
MODERATE = [
    "movie",
    " film",
    "story",
    "narrative",
    "listening",
    "audio",
    "natural viewing",
    "event segment",
    "real-world",
]

# Each hit subtracts 4 from the relevance score.
NEGATIVE = [
    "task-based",
    "task based",
    "preclinical",
    "rodent",
    "mouse model",
    "rtms for psychiatric",
]
# Compiled once: the first run of exactly four digits enclosed by underscores.
_YEAR_RE = re.compile(r'_(\d{4})_')


def _count_hits(text, keywords):
    """Return how many entries of *keywords* occur as substrings of *text*."""
    return sum(1 for keyword in keywords if keyword in text)


def score_title(filename, *, strong=None, moderate=None, negative=None,
                default_year=2020):
    """Score a PDF filename for relevance and extract its year.

    The whole filename (not just a title portion) is lower-cased and every
    keyword is tested as a plain substring. Each matching entry contributes
    independently, so a keyword present in two lists is counted in both.

    Args:
        filename: File name to score, e.g. ``path.name``.
        strong: Keywords worth +3 each. Defaults to the module-level STRONG.
        moderate: Keywords worth +1 each. Defaults to the module-level
            MODERATE.
        negative: Keywords worth -4 each. Defaults to the module-level
            NEGATIVE.
        default_year: Year reported when the filename contains no
            ``_DDDD_`` token.

    Returns:
        A ``(score, year)`` tuple of ints. ``year`` is the first four-digit
        number delimited by underscores in the original filename, or
        ``default_year`` when there is none.
    """
    # Resolve defaults lazily so callers can override any list per call.
    strong = STRONG if strong is None else strong
    moderate = MODERATE if moderate is None else moderate
    negative = NEGATIVE if negative is None else negative

    lowered = filename.lower()
    score = (
        3 * _count_hits(lowered, strong)
        + _count_hits(lowered, moderate)
        - 4 * _count_hits(lowered, negative)
    )

    # The year is only used by callers as a tie-breaker (newer first).
    # NOTE(review): any underscore-delimited 4-digit token matches, not only
    # plausible publication years -- confirm against the filename scheme.
    match = _YEAR_RE.search(filename)
    year = int(match.group(1)) if match else default_year
    return (score, year)
def main():
    """Rank the PDFs in PDF_DIR by title relevance and save the top 100.

    Steps:
      1. Score every ``*.pdf`` in PDF_DIR with score_title().
      2. Sort by score (descending), then year (descending), then filename
         so that the result does not depend on directory listing order.
      3. Print the year distribution of the selection, write the selected
         paths as a JSON list to ``top100_paths.json``, and print a short
         preview of excluded and selected papers.
    """
    from collections import Counter

    limit = 100
    year_pattern = re.compile(r'_(\d{4})_')

    pdfs = list(PDF_DIR.glob("*.pdf"))
    print(f"Total PDFs: {len(pdfs)}")

    # Score each file exactly once; the (score, year) pair is reused below
    # for both sorting and the preview output.
    scored = [(score_title(p.name), p) for p in pdfs]
    # Directory listing order is arbitrary, and sort() is stable, so without
    # the filename tie-breaker the papers that make the cut could differ
    # between runs or machines when scores and years are equal.
    scored.sort(key=lambda item: (-item[0][0], -item[0][1], item[1].name))

    # Select top 100 (fewer if the directory holds fewer PDFs).
    selected = scored[:limit]
    top100 = [p for (_, p) in selected]
    print(f"Selected top {len(top100)} by relevance score")

    # Print distribution. Files without a _YYYY_ token are left out here
    # (rather than counted under score_title's fallback year).
    year_matches = (year_pattern.search(p.name) for p in top100)
    yc = Counter(int(m.group(1)) for m in year_matches if m)
    for y in sorted(yc, reverse=True):
        print(f" {y}: {yc[y]}")

    # Save list
    with open("/home/juke/naturalistic_fmri_pdfs/top100_paths.json", "w") as f:
        json.dump([str(p) for p in top100], f, indent=2)

    # Show excluded ones (only the 10 best-ranked of them).
    excluded = scored[limit:]
    print(f"\nExcluded ({len(excluded)} papers):")
    for (s, y), p in excluded[:10]:
        print(f" score={s} year={y}: {p.name[:90]}")

    print("\nFirst 10 selected:")
    for (s, y), p in selected[:10]:
        print(f" score={s} year={y}: {p.name[:90]}")


if __name__ == "__main__":
    main()