@@ -80,7 +80,9 @@ def fetch_with_retry(url, timeout=90, wait_between_retries=2):
8080 if strategy_index != preferred_strategy_index :
8181 preferred_strategy_index = strategy_index
8282 return response
83- tqdm .write (f"Strategy { strategy_index } : { url } returned status { response .status_code } " )
83+ tqdm .write (
84+ f"Strategy { strategy_index } : { url } returned status { response .status_code } "
85+ )
8486 except requests .exceptions .RequestException as e :
8587 tqdm .write (f"Strategy { strategy_index } : { url } failed: { e } " )
8688 time .sleep (wait_between_retries )
@@ -258,7 +260,9 @@ def bfs_search_pdfs(
258260 tqdm .write (f"Will stop after { max_pages } pages." )
259261 while queue :
260262 if max_pages and len (visited ) >= max_pages :
261- tqdm .write (f"Reached max_pages limit ({ max_pages } ), visited { len (visited )} pages." )
263+ tqdm .write (
264+ f"Reached max_pages limit ({ max_pages } ), visited { len (visited )} pages."
265+ )
262266 break
263267 node , depth = queue .popleft () # Get the next node from the queue
264268 pbar .update (1 )
@@ -355,7 +359,9 @@ def add_pdf_metadata(pdfs: dict) -> pd.DataFrame:
355359 raise RuntimeError (
356360 "Could not read document metadata to get page count."
357361 )
358- tqdm .write (f" { pdf_file .page_count } pages, scanning for images/tables..." )
362+ tqdm .write (
363+ f" { pdf_file .page_count } pages, scanning for images/tables..."
364+ )
359365 file_name = default_file_name
360366 pdf_title = pdf_file .metadata .get ("title" )
361367 if pdf_title and (len (pdf_title .strip ()) > 0 ):
@@ -511,7 +517,10 @@ def output_pdfs(pdf_df: pd.DataFrame, output_path: str, site_config: dict) -> No
511517 with open (args .crawled_links_json ) as f :
512518 crawled_pdfs = json .load (f )
513519 crawled_pdfs = add_pdf_metadata (crawled_pdfs )
514- print (f"Metadata collection complete. { len (crawled_pdfs )} documents processed." , flush = True )
520+ print (
521+ f"Metadata collection complete. { len (crawled_pdfs )} documents processed." ,
522+ flush = True ,
523+ )
515524 if args .comparison_crawl is not None :
516525 comparison_df = pd .read_csv (args .comparison_crawl )
517526 crawled_pdfs = compare_crawled_documents (crawled_pdfs , comparison_df )
0 commit comments