@@ -50,11 +50,11 @@ def _search_document_symbols(
5050 return res .json ()
5151
5252
53- def get_reports_metadata (doc_type = "Reports" , start_date = 2024 ):
53+ def get_reports_metadata (doc_type = "Reports" , tag = "989__c " , start_date = 2024 ):
5454 all_results , skip , limit , old_streak = [], 0 , 100 , 0
5555 while True :
5656 batch = _search_document_symbols (
57- query = f"'{ doc_type } '" , tag = "989__c" , skip = skip , limit = limit
57+ query = f"'{ doc_type } '" , tag = tag , skip = skip , limit = limit
5858 )
5959 if not batch :
6060 break
@@ -186,23 +186,57 @@ def convert_value(val, col_name):
186186 conn .close ()
187187
188188
def fetch_and_store(doc_type: str, tag: str, start_date: int, fetch_text: bool = True):
    """Fetch reports of a given type, clean them, optionally fetch PDF text, and store in DB.

    Args:
        doc_type: Document-type query string passed to the search API.
        tag: Metadata tag to search under (e.g. "989__c" for SG reports).
        start_date: Earliest year of records to fetch.
        fetch_text: When True, download full text for each symbol; when
            False, the "text" column is set to None.

    Returns:
        Number of cleaned rows stored in the database (0 if nothing usable
        was fetched).
    """
    print(f"\n{'=' * 60}\nFetching: {doc_type} (tag: {tag})\n{'=' * 60}")
    raw_reports = get_reports_metadata(doc_type=doc_type, tag=tag, start_date=start_date)
    print(f"Fetched {len(raw_reports)} raw records")

    if not raw_reports:
        return 0

    df = pd.DataFrame(raw_reports)
    # Keep each record's original API payload alongside the row; cleaning
    # may explode rows (one per symbol) and this column propagates with them.
    df["raw_json"] = raw_reports
    df = clean_metadata(df)
    print(f"After cleaning: {len(df)} reports")

    # Robustness fix: cleaning can drop every row. Bail out instead of
    # running the PDF loop and DB write against an empty frame.
    if df.empty:
        return 0

    if fetch_text:
        df["text"] = [get_fulltext_or_none(s) for s in tqdm(df["symbol"], desc="Fetching PDFs")]
    else:
        df["text"] = None

    store_reports_in_db(df)
    return len(df)
# Document sources queried for comprehensive SG-reports coverage.
# Each entry is a (doc_type, tag) pair. The last two sources are
# deliberately over-broad and get narrowed by title downstream
# (in the SQL view).
SOURCES = [
    ("Secretary-General's Reports", "989__c"),  # classified directly as SG reports
    ("Reports", "989__b"),                      # general reports
    ("Letters and Notes Verbales", "989__b"),   # letters / notes verbales
]
if __name__ == "__main__":
    import argparse

    # CLI: choose which sources to pull, and whether to extract PDF text.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sg-only", action="store_true", help="Only fetch SG reports (989__c)")
    parser.add_argument("--no-text", action="store_true", help="Skip PDF text extraction")
    parser.add_argument("--start-year", type=int, default=2020)
    cli = parser.parse_args()

    want_text = not cli.no_text
    counts = {}

    if cli.sg_only:
        # Single-source mode: only the directly-classified SG reports.
        counts["SG Reports"] = fetch_and_store(
            "Secretary-General's Reports", "989__c", cli.start_year, want_text
        )
    else:
        # Full run: every configured source in order.
        for doc_type, tag in SOURCES:
            counts[doc_type] = fetch_and_store(doc_type, tag, cli.start_year, want_text)

    # Per-source summary of how many cleaned rows were stored.
    print(f"\n{'=' * 60}\nSUMMARY\n{'=' * 60}")
    for src, cnt in counts.items():
        print(f"  {src}: {cnt} reports")