Skip to content

Commit e8aa36d

Browse files
committed
Fix linting issues.
1 parent 6f45118 commit e8aa36d

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

python_components/crawler/crawler.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,9 @@ def fetch_with_retry(url, timeout=90, wait_between_retries=2):
8080
if strategy_index != preferred_strategy_index:
8181
preferred_strategy_index = strategy_index
8282
return response
83-
tqdm.write(f"Strategy {strategy_index}: {url} returned status {response.status_code}")
83+
tqdm.write(
84+
f"Strategy {strategy_index}: {url} returned status {response.status_code}"
85+
)
8486
except requests.exceptions.RequestException as e:
8587
tqdm.write(f"Strategy {strategy_index}: {url} failed: {e}")
8688
time.sleep(wait_between_retries)
@@ -258,7 +260,9 @@ def bfs_search_pdfs(
258260
tqdm.write(f"Will stop after {max_pages} pages.")
259261
while queue:
260262
if max_pages and len(visited) >= max_pages:
261-
tqdm.write(f"Reached max_pages limit ({max_pages}), visited {len(visited)} pages.")
263+
tqdm.write(
264+
f"Reached max_pages limit ({max_pages}), visited {len(visited)} pages."
265+
)
262266
break
263267
node, depth = queue.popleft() # Get the next node from the queue
264268
pbar.update(1)
@@ -355,7 +359,9 @@ def add_pdf_metadata(pdfs: dict) -> pd.DataFrame:
355359
raise RuntimeError(
356360
"Could not read document metadata to get page count."
357361
)
358-
tqdm.write(f" {pdf_file.page_count} pages, scanning for images/tables...")
362+
tqdm.write(
363+
f" {pdf_file.page_count} pages, scanning for images/tables..."
364+
)
359365
file_name = default_file_name
360366
pdf_title = pdf_file.metadata.get("title")
361367
if pdf_title and (len(pdf_title.strip()) > 0):
@@ -511,7 +517,10 @@ def output_pdfs(pdf_df: pd.DataFrame, output_path: str, site_config: dict) -> No
511517
with open(args.crawled_links_json) as f:
512518
crawled_pdfs = json.load(f)
513519
crawled_pdfs = add_pdf_metadata(crawled_pdfs)
514-
print(f"Metadata collection complete. {len(crawled_pdfs)} documents processed.", flush=True)
520+
print(
521+
f"Metadata collection complete. {len(crawled_pdfs)} documents processed.",
522+
flush=True,
523+
)
515524
if args.comparison_crawl is not None:
516525
comparison_df = pd.read_csv(args.comparison_crawl)
517526
crawled_pdfs = compare_crawled_documents(crawled_pdfs, comparison_df)

0 commit comments

Comments (0)