Skip to content

Commit 28e6933

Browse files
compare search approaches
1 parent 9fbecff commit 28e6933

File tree

1 file changed

+207
-0
lines changed

1 file changed

+207
-0
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
"""Compare two metadata search approaches for Secretary-General reports."""
2+
import os
3+
import re
4+
import requests
5+
from dotenv import load_dotenv
6+
from joblib import Memory
7+
8+
# joblib cache object rooted at ./.cache; used below to memoize API calls.
memory = Memory(location=".cache", verbose=0)
# Called before reading AWS_API_URL so that values supplied via python-dotenv are visible.
load_dotenv()
# Base URL of the search API, without a trailing slash so paths can be appended with "/".
AWS_API_URL = (os.getenv("AWS_API_URL") or "").rstrip("/")
if not AWS_API_URL:
    # Fail early with a clear message instead of an AttributeError on None.rstrip().
    raise RuntimeError(
        "AWS_API_URL is not set; define it in the environment or in a .env file"
    )
11+
12+
@memory.cache
def search_api(query: str, tag: str, skip: int = 0, limit: int = 100) -> list:
    """Fetch one page of records from the ``/dev/list`` endpoint.

    Calls are memoized through ``memory.cache``, so repeating a call with the
    same arguments reuses the stored response instead of hitting the API.

    Args:
        query: Value sent as the ``query`` request parameter.
        tag: Metadata field to search, sent as the ``tag`` parameter
            (e.g. ``"989__c"``).
        skip: Number of records to skip (pagination offset).
        limit: Maximum number of records requested for this page.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    url = f"{AWS_API_URL}/dev/list"
    params = {"tag": tag, "query": query, "limit": limit, "skip": skip}
    # requests applies no timeout by default; without one a stalled
    # connection would block the script forever.
    res = requests.get(url, params=params, timeout=60)
    res.raise_for_status()
    return res.json()
18+
19+
def fetch_all(query: str, tag: str, start_year: int = 2020, page_size: int = 100) -> list:
    """Page through ``search_api`` and return all collected records.

    Paging stops when any of the following happens:

    * a page comes back empty,
    * a page is shorter than ``page_size`` (last page), or
    * three consecutive pages have their latest ``269__a`` value sorting
      before ``start_year``.

    One progress line is printed per page.

    Args:
        query: Query string forwarded to ``search_api``.
        tag: Metadata field forwarded to ``search_api``.
        start_year: Cut-off used for the "old pages" early exit. The check is
            a string comparison against ``str(start_year)``.
            NOTE(review): this assumes ``269__a`` values start with a
            four-digit year - confirm against the API.
        page_size: Records requested per page; also the pagination step.

    Returns:
        All records from every page fetched, in fetch order. Pages that
        triggered the early exit are included.

    Raises:
        ValueError: If ``page_size`` is smaller than 1 (the offset would
            never advance).
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    all_results, skip, old_streak = [], 0, 0
    while True:
        batch = search_api(query, tag, skip=skip, limit=page_size)
        if not batch:
            break
        # Distinct, non-empty first values of 269__a in this page, ascending.
        dates = sorted({d for r in batch if (d := (r.get("269__a") or [None])[0])})
        first, last = (dates[0], dates[-1]) if dates else ("?", "?")
        # Separator added: the two dates used to be printed run together.
        print(f" {len(batch)} results, dates: {first} → {last}")
        all_results.extend(batch)
        if len(batch) < page_size:
            break
        # Count consecutive pages lying entirely before start_year; any page
        # that is newer (or has no dates) resets the streak.
        old_streak = old_streak + 1 if dates and dates[-1] < str(start_year) else 0
        if old_streak >= 3:
            break
        skip += page_size
    return all_results
35+
36+
def get_symbols(results: list) -> set:
    """Collect the distinct ``191__a`` values found across all ``results``.

    Records whose ``191__a`` field is missing, ``None`` or empty contribute
    nothing.
    """
    symbols = set()
    for record in results:
        symbols.update(record.get("191__a") or [])
    return symbols
38+
39+
def filter_sg_reports(results: list) -> list:
    """Keep only records mentioning 'Secretary-General' in a title or subtitle.

    Titles are read from ``245__a`` and subtitles from ``245__b``; the match
    is a case-insensitive substring test. Input order is preserved.
    """
    needle = "secretary-general"

    def mentions_sg(record) -> bool:
        # Missing or None fields count as empty lists; None entries as "".
        titles = record.get("245__a") or []
        subtitles = record.get("245__b") or []
        for text in titles + subtitles:
            if needle in (text or "").lower():
                return True
        return False

    return [record for record in results if mentions_sg(record)]
45+
46+
def _first(record: dict, tag: str, default: str = "") -> str:
    """Return the first value of field ``tag``, or ``default`` if missing, empty or None."""
    values = record.get(tag) or []
    return (values[0] if values else None) or default


def _joined(record: dict, tag: str) -> str:
    """Join all values of field ``tag`` with single spaces (None entries become '')."""
    return " ".join(v or "" for v in (record.get(tag) or []))


def _index_by_symbol(results: list) -> dict:
    """Map each ``191__a`` symbol to the first record in ``results`` that carries it."""
    index = {}
    for record in results:
        for symbol in record.get("191__a") or []:
            index.setdefault(symbol, record)
    return index


def _top(counts: dict, limit=None) -> list:
    """Return ``(key, count)`` pairs by descending count, truncated to ``limit`` (None = all)."""
    return sorted(counts.items(), key=lambda kv: -kv[1])[:limit]


def _doc_type_counts(symbols, index: dict) -> dict:
    """Tally ``989__c`` values over the records indexed under ``symbols``.

    A record is counted once per matching symbol; records without ``989__c``
    are tallied under ``"(none)"``. Symbols are visited in sorted order so
    that ties are reported deterministically.
    """
    counts = {}
    for symbol in sorted(symbols):
        for doc_type in index[symbol].get("989__c") or ["(none)"]:
            counts[doc_type] = counts.get(doc_type, 0) + 1
    return counts


def year_dist(results, symbols_filter=None):
    """Count records per year, keyed by the first four characters of ``269__a``.

    Args:
        results: Records to count.
        symbols_filter: Optional set of symbols. When given, only records
            sharing at least one ``191__a`` value with it are counted. An
            empty set therefore matches nothing; ``None`` disables filtering.

    Returns:
        Dict mapping a four-digit year string to a record count. Records
        whose first ``269__a`` value does not start with four digits are
        skipped.
    """
    dist = {}
    for r in results:
        syms = set(r.get("191__a") or [])
        # "is not None" (not truthiness): an empty filter must exclude everything.
        if symbols_filter is not None and not syms & symbols_filter:
            continue
        date = _first(r, "269__a")[:4]
        if date.isdigit():
            dist[date] = dist.get(date, 0) + 1
    return dist


# Script entry point: fetch both result sets, then print a comparison report.
if __name__ == "__main__":
    print("\n=== APPROACH 1: doc_type = 'Secretary-General's Reports' (989__c) ===")
    results1 = fetch_all("'Secretary-General's Reports'", "989__c")
    symbols1 = get_symbols(results1)
    print(f"Total: {len(results1)} records, {len(symbols1)} unique symbols")

    # Analyze titles for noise patterns
    print("\n=== ANALYZING APPROACH 1 FOR NOISE ===")
    title_words = {}
    for r in results1:
        for word in _first(r, "245__a").lower().split()[:3]:  # first 3 words
            word = word.strip(":,")
            if len(word) > 3:
                title_words[word] = title_words.get(word, 0) + 1
    print("\nMost common title starting words:")
    for w, c in _top(title_words, 30):
        print(f" {c:4d} - {w}")

    # Group by title prefix patterns
    print("\n\nGrouped by title pattern (first 40 chars):")
    patterns = {}
    for r in results1:
        prefix = _first(r, "245__a")[:40]
        patterns[prefix] = patterns.get(prefix, 0) + 1
    for p, c in _top(patterns, 40):
        print(f" {c:4d} - {p}")

    # Look for potential noise keywords
    print("\n\nPotential noise (credentials, letters, notes, etc.):")
    noise_keywords = ["credential", "letter", "note by", "corrigendum", "addendum", "errata"]
    for kw in noise_keywords:
        matches = [r for r in results1 if kw in _first(r, "245__a").lower()]
        if matches:
            print(f"\n'{kw}' ({len(matches)} records):")
            for r in matches[:5]:
                sym = _first(r, "191__a", "?")
                title = _first(r, "245__a")[:70]
                print(f" {sym}: {title}")

    print("\n=== APPROACH 2: doc_type = 'Reports' (989__b) + title filter ===")
    results2_raw = fetch_all("'Reports'", "989__b")
    results2 = filter_sg_reports(results2_raw)
    symbols2 = get_symbols(results2)
    print(f"Total: {len(results2_raw)} raw → {len(results2)} filtered, {len(symbols2)} unique symbols")

    print("\n=== COMPARISON ===")
    only_in_1 = symbols1 - symbols2
    only_in_2 = symbols2 - symbols1
    common = symbols1 & symbols2

    print(f"Common symbols: {len(common)}")
    print(f"Only in approach 1: {len(only_in_1)}")
    print(f"Only in approach 2: {len(only_in_2)}")

    # Build symbol -> record lookups once instead of scanning the result
    # lists for every symbol. Every symbol in only_in_N comes from resultsN,
    # so indexing with it cannot miss.
    index1 = _index_by_symbol(results1)
    index2 = _index_by_symbol(results2)

    for label, only, index in (("1", only_in_1, index1), ("2", only_in_2, index2)):
        if only:
            print(f"\nSamples only in approach {label}:")
            for s in sorted(only)[:10]:
                title = _first(index[s], "245__a")[:60]
                print(f" {s}: {title}...")

    # Deeper analysis
    print("\n=== DEEPER ANALYSIS ===")

    # What doc types (989__c) do approach-2-only records have?
    print("\nDoc types (989__c) of records only in approach 2:")
    for t, c in _top(_doc_type_counts(only_in_2, index2)):
        print(f" {c:4d} - {t}")

    # What are title patterns in approach-1-only (why no "of the Secretary-General")?
    print("\nTitle patterns in approach-1-only (first 20):")
    for s in sorted(only_in_1)[:20]:
        r = index1[s]
        title = _first(r, "245__a")
        subtitle = _first(r, "245__b")
        full = f"{title} | {subtitle}" if subtitle else title
        print(f" {s}: {full[:80]}")

    # Check doc types of approach-1-only
    print("\nDoc types (989__c) of records only in approach 1:")
    for t, c in _top(_doc_type_counts(only_in_1, index1), 15):
        print(f" {c:4d} - {t}")

    # Check why approach-1-only weren't caught by title filter
    print(f"\nAnalyzing why {len(only_in_1)} in approach-1-only weren't found by title filter:")
    has_sg_in_title = has_sg_in_subtitle = has_sg_nowhere = 0
    sg_variants = {}
    no_sg_samples = []  # first 10 (symbol, title) pairs lacking the phrase, in symbol order
    # NOTE(review): the character class below lists the same quote twice;
    # possibly a typographic apostrophe was intended - confirm.
    sg_phrase = re.compile(r"(?:of the |the )?secretary.general['']?s?", re.I)
    for s in sorted(only_in_1):
        r = index1[s]
        title = _joined(r, "245__a")
        subtitle = _joined(r, "245__b")
        combined = f"{title} {subtitle}".lower()
        if "secretary-general" not in combined:
            has_sg_nowhere += 1
            if len(no_sg_samples) < 10:
                no_sg_samples.append((s, title))
            continue
        if "secretary-general" in title.lower():
            has_sg_in_title += 1
        else:
            has_sg_in_subtitle += 1
        # Find the actual phrase used
        for m in sg_phrase.findall(combined):
            variant = m.lower().strip()
            sg_variants[variant] = sg_variants.get(variant, 0) + 1
    print(f" In title: {has_sg_in_title}, In subtitle only: {has_sg_in_subtitle}, Nowhere: {has_sg_nowhere}")
    print(f" Variants found: {sg_variants}")

    # Sample the ones without SG in title at all
    if has_sg_nowhere:
        print("\nSamples with no 'Secretary-General' in title/subtitle:")
        for s, title in no_sg_samples:
            print(f" {s}: {title[:70]}")

    # Year distribution comparison
    print("\nYear distribution:")
    d1, d2 = year_dist(results1), year_dist(results2)
    d1_only = year_dist(results1, only_in_1)
    d2_only = year_dist(results2, only_in_2)
    years = sorted(set(d1) | set(d2))
    print(f"{'Year':>6} {'App1':>6} {'App2':>6} {'Only1':>6} {'Only2':>6}")
    for y in years[-8:]:  # last 8 years
        print(f"{y:>6} {d1.get(y,0):>6} {d2.get(y,0):>6} {d1_only.get(y,0):>6} {d2_only.get(y,0):>6}")

0 commit comments

Comments
 (0)