@@ -50,11 +50,11 @@ def _search_document_symbols(
5050 return res .json ()
5151
5252
53- def get_reports_metadata (doc_type = "Reports" , start_date = 2024 ):
53+ def get_reports_metadata (doc_type = "Reports" , tag = "989__c " , start_date = 2024 ):
5454 all_results , skip , limit , old_streak = [], 0 , 100 , 0
5555 while True :
5656 batch = _search_document_symbols (
57- query = f"'{ doc_type } '" , tag = "989__c" , skip = skip , limit = limit
57+ query = f"'{ doc_type } '" , tag = tag , skip = skip , limit = limit
5858 )
5959 if not batch :
6060 break
@@ -186,23 +186,57 @@ def convert_value(val, col_name):
186186 conn .close ()
187187
188188
def fetch_and_store(doc_type: str, tag: str, start_date: int, fetch_text: bool = True):
    """Fetch reports of a given type, clean them, optionally fetch PDF text, and store in DB.

    Args:
        doc_type: Document-type query string passed to the search API.
        tag: Metadata tag to search under (e.g. "989__c" for SG reports).
        start_date: Earliest year of records to fetch.
        fetch_text: When True, download full text for each symbol; when
            False, the "text" column is set to None.

    Returns:
        Number of cleaned rows stored in the database (0 if nothing usable
        was fetched).
    """
    print(f"\n{'=' * 60}\nFetching: {doc_type} (tag: {tag})\n{'=' * 60}")
    raw_reports = get_reports_metadata(doc_type=doc_type, tag=tag, start_date=start_date)
    print(f"Fetched {len(raw_reports)} raw records")

    if not raw_reports:
        return 0

    df = pd.DataFrame(raw_reports)
    # Keep each record's original API payload alongside the row; cleaning
    # may explode rows (one per symbol) and this column propagates with them.
    df["raw_json"] = raw_reports
    df = clean_metadata(df)
    print(f"After cleaning: {len(df)} reports")

    # Robustness fix: cleaning can drop every row. Bail out instead of
    # running the PDF loop and DB write against an empty frame.
    if df.empty:
        return 0

    if fetch_text:
        df["text"] = [get_fulltext_or_none(s) for s in tqdm(df["symbol"], desc="Fetching PDFs")]
    else:
        df["text"] = None

    store_reports_in_db(df)
    return len(df)
# Document sources queried for comprehensive SG-reports coverage.
# Each entry is a (doc_type, tag) pair. The last two sources are
# deliberately over-broad and get narrowed by title downstream
# (in the SQL view).
SOURCES = [
    ("Secretary-General's Reports", "989__c"),  # classified directly as SG reports
    ("Reports", "989__b"),                      # general reports
    ("Letters and Notes Verbales", "989__b"),   # letters / notes verbales
]
if __name__ == "__main__":
    import argparse

    # CLI: choose which sources to pull, and whether to extract PDF text.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sg-only", action="store_true", help="Only fetch SG reports (989__c)")
    parser.add_argument("--no-text", action="store_true", help="Skip PDF text extraction")
    parser.add_argument("--start-year", type=int, default=2020)
    cli = parser.parse_args()

    want_text = not cli.no_text
    counts = {}

    if cli.sg_only:
        # Single-source mode: only the directly-classified SG reports.
        counts["SG Reports"] = fetch_and_store(
            "Secretary-General's Reports", "989__c", cli.start_year, want_text
        )
    else:
        # Full run: every configured source in order.
        for doc_type, tag in SOURCES:
            counts[doc_type] = fetch_and_store(doc_type, tag, cli.start_year, want_text)

    # Per-source summary of how many cleaned rows were stored.
    print(f"\n{'=' * 60}\nSUMMARY\n{'=' * 60}")
    for src, cnt in counts.items():
        print(f"  {src}: {cnt} reports")