Skip to content

Commit 93c33bd

Browse files
use more sg queries, simplify sql
1 parent 28e6933 commit 93c33bd

File tree

4 files changed

+128
-73
lines changed

4 files changed

+128
-73
lines changed

python/get_reports.py

Lines changed: 48 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -50,11 +50,11 @@ def _search_document_symbols(
5050
return res.json()
5151

5252

53-
def get_reports_metadata(doc_type = "Reports", start_date=2024):
53+
def get_reports_metadata(doc_type="Reports", tag="989__c", start_date=2024):
5454
all_results, skip, limit, old_streak = [], 0, 100, 0
5555
while True:
5656
batch = _search_document_symbols(
57-
query=f"'{doc_type}'", tag="989__c", skip=skip, limit=limit
57+
query=f"'{doc_type}'", tag=tag, skip=skip, limit=limit
5858
)
5959
if not batch:
6060
break
@@ -186,23 +186,57 @@ def convert_value(val, col_name):
186186
conn.close()
187187

188188

189-
if __name__ == "__main__":
190-
# Fetch raw reports from API
191-
raw_reports = get_reports_metadata(doc_type="Secretary-General's Reports", start_date=2020)
192-
print(f"Fetched {len(raw_reports)} raw reports")
189+
def fetch_and_store(doc_type: str, tag: str, start_date: int, fetch_text: bool = True):
190+
"""Fetch reports of given type, clean, optionally fetch PDFs, and store in DB."""
191+
print(f"\n{'='*60}\nFetching: {doc_type} (tag: {tag})\n{'='*60}")
192+
raw_reports = get_reports_metadata(doc_type=doc_type, tag=tag, start_date=start_date)
193+
print(f"Fetched {len(raw_reports)} raw records")
193194

194-
# Create DataFrame with raw_json column (propagates through explode)
195-
df = pd.DataFrame(raw_reports)
196-
df["raw_json"] = raw_reports # Each row gets its original dict
195+
if not raw_reports:
196+
return 0
197197

198-
# Clean metadata (explodes symbols, so raw_json stays with each row)
198+
df = pd.DataFrame(raw_reports)
199+
df["raw_json"] = raw_reports
199200
df = clean_metadata(df)
200201
print(f"After cleaning: {len(df)} reports")
201202

202-
# Fetch full text for each report
203-
df["text"] = [get_fulltext_or_none(symbol) for symbol in tqdm(df["symbol"], desc="Fetching PDFs")]
203+
if fetch_text:
204+
df["text"] = [get_fulltext_or_none(s) for s in tqdm(df["symbol"], desc="Fetching PDFs")]
205+
else:
206+
df["text"] = None
204207

205-
# Store in database
206208
store_reports_in_db(df)
209+
return len(df)
210+
211+
212+
# Sources to fetch for comprehensive SG reports coverage
213+
SOURCES = [
214+
# Approach 1: Classified as SG Reports
215+
("Secretary-General's Reports", "989__c"),
216+
# Approach 2: General reports (will be filtered by title in SQL view)
217+
("Reports", "989__b"),
218+
# Approach 3: Letters/notes (will be filtered by title in SQL view)
219+
("Letters and Notes Verbales", "989__b"),
220+
]
221+
222+
223+
if __name__ == "__main__":
224+
import argparse
225+
parser = argparse.ArgumentParser()
226+
parser.add_argument("--sg-only", action="store_true", help="Only fetch SG reports (989__c)")
227+
parser.add_argument("--no-text", action="store_true", help="Skip PDF text extraction")
228+
parser.add_argument("--start-year", type=int, default=2020)
229+
args = parser.parse_args()
230+
231+
fetch_text = not args.no_text
232+
counts = {}
233+
234+
if args.sg_only:
235+
counts["SG Reports"] = fetch_and_store("Secretary-General's Reports", "989__c", args.start_year, fetch_text)
236+
else:
237+
for doc_type, tag in SOURCES:
238+
counts[doc_type] = fetch_and_store(doc_type, tag, args.start_year, fetch_text)
207239

208-
print(df.head())
240+
print(f"\n{'='*60}\nSUMMARY\n{'='*60}")
241+
for src, cnt in counts.items():
242+
print(f" {src}: {cnt} reports")

sql/latest_versions_view.sql

Lines changed: 0 additions & 56 deletions
This file was deleted.

sql/views.sql

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
-- All views for SG Reports Survey
2+
-- Run: psql $DATABASE_URL -f sql/views.sql
3+
4+
DROP VIEW IF EXISTS sg_reports_survey.latest_versions;
5+
DROP VIEW IF EXISTS sg_reports_survey.sg_reports_stats;
6+
DROP VIEW IF EXISTS sg_reports_survey.sg_reports;
7+
8+
--------------------------------------------------------------------------------
9+
-- SG_REPORTS: Defines what counts as a Secretary-General report
10+
--------------------------------------------------------------------------------
11+
CREATE VIEW sg_reports_survey.sg_reports AS
12+
SELECT r.*,
13+
CASE
14+
WHEN r.resource_type_level3 @> ARRAY['Secretary-General''s Reports'] THEN 'sg_reports_metadata'
15+
WHEN (r.resource_type_level2 @> ARRAY['Reports']
16+
OR r.resource_type_level2 @> ARRAY['Letters and Notes Verbales'])
17+
AND (r.title ILIKE '%Secretary-General%'
18+
OR array_to_string(r.subtitle, ' ') ILIKE '%Secretary-General%')
19+
THEN 'title_filter'
20+
ELSE 'other'
21+
END as source
22+
FROM sg_reports_survey.reports r
23+
WHERE r.resource_type_level3 @> ARRAY['Secretary-General''s Reports']
24+
OR ((r.resource_type_level2 @> ARRAY['Reports']
25+
OR r.resource_type_level2 @> ARRAY['Letters and Notes Verbales'])
26+
AND (r.title ILIKE '%Secretary-General%'
27+
OR array_to_string(r.subtitle, ' ') ILIKE '%Secretary-General%'));
28+
29+
--------------------------------------------------------------------------------
30+
-- SG_REPORTS_STATS: Counts by source
31+
--------------------------------------------------------------------------------
32+
CREATE VIEW sg_reports_survey.sg_reports_stats AS
33+
SELECT source, COUNT(*) as count, COUNT(DISTINCT proper_title) as unique_series
34+
FROM sg_reports_survey.sg_reports
35+
WHERE proper_title IS NOT NULL
36+
AND symbol NOT LIKE '%/CORR.%'
37+
AND symbol NOT LIKE '%/REV.%'
38+
GROUP BY source;
39+
40+
--------------------------------------------------------------------------------
41+
-- LATEST_VERSIONS: Most recent version of each report series
42+
--------------------------------------------------------------------------------
43+
CREATE VIEW sg_reports_survey.latest_versions AS
44+
WITH version_counts AS (
45+
SELECT proper_title, COUNT(*)::int as version_count
46+
FROM sg_reports_survey.sg_reports
47+
WHERE proper_title IS NOT NULL
48+
AND symbol NOT LIKE '%/CORR.%' AND symbol NOT LIKE '%/REV.%'
49+
GROUP BY proper_title
50+
),
51+
ranked AS (
52+
SELECT r.id, r.symbol, r.proper_title, r.title, r.date_year, r.publication_date,
53+
r.un_body, r.subject_terms, r.source,
54+
COALESCE(r.date_year,
55+
CASE WHEN r.publication_date ~ '^\d{4}'
56+
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int END
57+
) as effective_year,
58+
ROW_NUMBER() OVER (
59+
PARTITION BY r.proper_title
60+
ORDER BY COALESCE(r.date_year,
61+
CASE WHEN r.publication_date ~ '^\d{4}'
62+
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int END) DESC NULLS LAST,
63+
r.publication_date DESC NULLS LAST, r.symbol DESC
64+
) as rn
65+
FROM sg_reports_survey.sg_reports r
66+
WHERE r.proper_title IS NOT NULL
67+
AND r.symbol NOT LIKE '%/CORR.%' AND r.symbol NOT LIKE '%/REV.%'
68+
)
69+
SELECT r.id, r.symbol, r.proper_title, r.title, r.date_year, r.publication_date,
70+
r.un_body, r.subject_terms, r.effective_year, r.source, vc.version_count
71+
FROM ranked r
72+
JOIN version_counts vc ON r.proper_title = vc.proper_title
73+
WHERE r.rn = 1;
74+
75+
\echo 'Views created. Stats:'
76+
SELECT * FROM sg_reports_survey.sg_reports_stats;

src/app/api/sg-reports/route.ts

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,8 @@ export async function GET(req: NextRequest) {
8080
}
8181

8282
// Build WHERE clauses for filters
83-
// Exclude corrigenda (CORR), revisions (REV), and credentials reports
83+
// Uses sg_reports view (SG reports from multiple sources)
84+
// Exclude corrigenda, revisions, and credentials
8485
const whereClauses: string[] = [
8586
"r.proper_title IS NOT NULL",
8687
"r.symbol NOT LIKE '%/CORR.%'",
@@ -171,7 +172,7 @@ export async function GET(req: NextRequest) {
171172
THEN SUBSTRING(r.publication_date FROM 1 FOR 4)::int
172173
END
173174
) as effective_year
174-
FROM ${DB_SCHEMA}.reports r
175+
FROM ${DB_SCHEMA}.sg_reports r
175176
LEFT JOIN ${DB_SCHEMA}.reporting_entities re ON r.symbol = re.symbol
176177
WHERE ${whereClause}
177178
) sub
@@ -182,7 +183,7 @@ export async function GET(req: NextRequest) {
182183
),
183184
query<{ total: number }>(
184185
`SELECT COUNT(DISTINCT r.proper_title)::int as total
185-
FROM ${DB_SCHEMA}.reports r
186+
FROM ${DB_SCHEMA}.sg_reports r
186187
LEFT JOIN ${DB_SCHEMA}.reporting_entities re ON r.symbol = re.symbol
187188
WHERE ${whereClause}`,
188189
params

0 commit comments

Comments
 (0)