Skip to content

Commit 7c24d27

Browse files
committed
add progress bars for cleaner reporting
1 parent 83857ce commit 7c24d27

File tree

1 file changed

+23
-15
lines changed

1 file changed

+23
-15
lines changed

gget/gget_virus.py

Lines changed: 23 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -647,7 +647,7 @@ def _fetch_metadata_for_accession_list(
647647
logger.info("Fetching metadata for %d accessions in %d batch(es) with exponential backoff retries",
648648
len(accessions), len(batches))
649649

650-
for batch_num, batch in enumerate(batches, 1):
650+
for batch_num, batch in tqdm(enumerate(batches, 1), total=len(batches), desc="Fetching accession batches", unit="batch", disable=len(batches)==1):
651651
logger.info("Processing accession batch %d/%d (%d accessions)",
652652
batch_num, len(batches), len(batch))
653653

@@ -683,11 +683,11 @@ def fetch_batch_metadata():
683683

684684
if success and batch_reports:
685685
all_reports.extend(batch_reports)
686-
logger.info("✅ Batch %d: Retrieved %d records", batch_num, len(batch_reports))
686+
tqdm.write(f"✅ Batch {batch_num}: Retrieved {len(batch_reports)} records")
687687
else:
688688
# Batch failed or returned empty
689689
error_msg = error_info['error'] if error_info else "No data returned"
690-
logger.warning("❌ Batch %d failed after %d retries: %s", batch_num, API_MAX_RETRIES, error_msg)
690+
tqdm.write(f"❌ Batch {batch_num} failed after {API_MAX_RETRIES} retries: {error_msg}")
691691

692692
# Build URL with applied filters for manual retry
693693
base_url = f"{NCBI_API_BASE}/virus/accession/{accession_string}/dataset_report"
@@ -933,6 +933,7 @@ def fetch_virus_metadata(
933933
all_reports = [] # Will store all metadata records across all pages
934934
page_token = None # Token for accessing subsequent pages
935935
page_count = 0 # Track number of pages processed for logging
936+
pages_pbar = None # Progress bar for pagination (created when we know total pages)
936937

937938
# Create a temporary file to stream metadata as it arrives from the API
938939
# This prevents large datasets from consuming all system RAM
@@ -951,7 +952,6 @@ def fetch_virus_metadata(
951952
loop = True
952953
while loop:
953954
page_count += 1
954-
logger.info("Fetching page %d of results...", page_count)
955955

956956
# Add pagination token if we're not on the first page
957957
if page_token:
@@ -989,7 +989,15 @@ def fetch_single_page():
989989
if success and page_data:
990990
# Extract the virus reports from the response
991991
reports = page_data.get('reports', [])
992-
logger.info("Page %d contains %d virus records", page_count, len(reports))
992+
# Create progress bar on first page when we know total pages
993+
if pages_pbar is None and page_count == 1:
994+
total_pages = page_data.get('total_count', 0)
995+
if total_pages > 0:
996+
total_pages = (total_pages + API_PAGE_SIZE - 1) // API_PAGE_SIZE
997+
pages_pbar = tqdm(total=max(total_pages, 1), desc="Fetching pages", unit="page", leave=False)
998+
if pages_pbar:
999+
pages_pbar.update(1)
1000+
pages_pbar.set_postfix({"records": len(all_reports)})
9931001

9941002
# Stream reports to temporary file if available
9951003
if metadata_file and reports:
@@ -1007,7 +1015,8 @@ def fetch_single_page():
10071015
# Check if there are more pages to retrieve
10081016
next_page_token = page_data.get('next_page_token')
10091017
if not next_page_token:
1010-
logger.debug("No more pages available, pagination complete")
1018+
if pages_pbar:
1019+
pages_pbar.close()
10111020
loop = False
10121021
break
10131022

@@ -1067,7 +1076,9 @@ def fetch_single_page():
10671076
logger.error(error_msg)
10681077
logger.error("=" * 80)
10691078

1070-
# Close temporary file before raising exception
1079+
# Close temporary file and progress bar before raising exception
1080+
if pages_pbar:
1081+
pages_pbar.close()
10711082
if metadata_file:
10721083
try:
10731084
metadata_file.close()
@@ -1249,6 +1260,8 @@ def fetch_single_page():
12491260
'records_retrieved': len(all_reports),
12501261
})
12511262

1263+
if pages_pbar:
1264+
pages_pbar.close()
12521265
loop = False
12531266
break
12541267
else:
@@ -1346,7 +1359,7 @@ def fetch_virus_metadata_chunked(
13461359
logger.info(f"Will process {total_chunks} year(s) from {start_year} to {current_year}")
13471360
logger.info("=" * 80)
13481361

1349-
for year in range(start_year, current_year + 1):
1362+
for year in tqdm(range(start_year, current_year + 1), total=total_chunks, desc="Fetching yearly chunks", unit="year"):
13501363
chunk_start = f"{year}-01-01"
13511364
chunk_end = f"{year}-12-31"
13521365

@@ -1355,11 +1368,7 @@ def fetch_virus_metadata_chunked(
13551368
chunk_end = current_date.strftime("%Y-%m-%d")
13561369

13571370
chunk_num = year - start_year + 1
1358-
logger.info("")
1359-
logger.info("=" * 80)
1360-
logger.info(f"📥 Chunk {chunk_num}/{total_chunks}: Fetching data for year {year}")
1361-
logger.info(f" Date range: {chunk_start} to {chunk_end}")
1362-
logger.info("=" * 80)
1371+
tqdm.write(f"📥 Chunk {chunk_num}/{total_chunks}: Fetching data for year {year} ({chunk_start} to {chunk_end})")
13631372

13641373
try:
13651374
# Fetch metadata for this date chunk
@@ -1385,8 +1394,7 @@ def fetch_virus_metadata_chunked(
13851394
chunk_count = len(chunk_reports)
13861395
all_reports.extend(chunk_reports)
13871396

1388-
logger.info(f"✅ Chunk {chunk_num}/{total_chunks} complete: Retrieved {chunk_count:,} records")
1389-
logger.info(f" Running total: {len(all_reports):,} records")
1397+
tqdm.write(f"✅ Chunk {chunk_num}/{total_chunks}: Retrieved {chunk_count:,} records (total: {len(all_reports):,})")
13901398

13911399
# Add a small delay between chunks to be respectful to NCBI servers
13921400
if year < current_year:

0 commit comments

Comments (0)