111111
112112# File Size Configuration
113113BYTES_PER_MB = 1024 * 1024 # Bytes in a megabyte (divisor for file size display)
114- MIN_VALID_ZIP_SIZE = 100 * 1024 # 100 KB in bytes (minimum size for a valid ZIP file from cached downloads)
114+ MIN_VALID_ZIP_SIZE = 5 * 1024 # 5 KB in bytes (minimum size for a valid ZIP file from cached downloads)
115115MIN_VALID_FASTA_SIZE_MB = 0.1 # Minimum size in MB for a valid FASTA file (100 KB)
116116
117117# URL Length Configuration
144144# Cache for the datasets path to avoid repeated checks
145145_datasets_path_cache = None
146146
147-
148147# =============================================================================
149148# HELPER FUNCTIONS FOR RETRIES AND ERROR TRACKING
150149# =============================================================================
@@ -384,6 +383,49 @@ def _get_datasets_path():
384383 )
385384
386385
def _get_datasets_version():
    """
    Get the version banner reported by the NCBI datasets CLI, if available.

    Runs the datasets binary with ``--version`` and returns its stripped
    standard output unchanged. The value is therefore the full line the CLI
    prints (e.g., "datasets version 16.11.0"), not a bare version number.

    Returns:
        str or None: The stripped ``--version`` output, or None if resolving
        or running the binary raises RuntimeError, subprocess.TimeoutExpired
        or OSError, if the command exits with a non-zero status, or if it
        prints nothing to stdout.
    """
    # Keep the try body limited to the calls that can actually raise.
    try:
        datasets_path = _get_datasets_path()
        result = subprocess.run(
            [datasets_path, "--version"],
            capture_output=True,
            text=True,
            timeout=SUBPROCESS_VERSION_TIMEOUT,
        )
    except (RuntimeError, subprocess.TimeoutExpired, OSError) as e:
        logger.debug("Could not retrieve datasets version: %s", e)
        return None

    if result.returncode != 0:
        # Record why the check failed instead of dropping it silently.
        logger.debug(
            "Datasets version check exited with code %s: %s",
            result.returncode,
            (result.stderr or "").strip(),
        )
        return None

    version_output = result.stdout.strip()
    logger.debug("Datasets version output: %s", version_output)
    # An empty banner carries no information; report it as unavailable so a
    # caller testing `is not None` does not treat "" as a real version.
    return version_output or None
413+
414+
415+ def _get_gget_version ():
416+ """
417+ Get the version of gget.
418+
419+ Returns:
420+ str: Version string (e.g., "1.2.0") or "unknown" if not available.
421+ """
422+ try :
423+ from . import __version__
424+ return __version__
425+ except (ImportError , AttributeError ):
426+ return "unknown"
427+
428+
387429def _get_modified_virus_name (virus_name , attempt = 1 ):
388430 """
389431 Modify the virus name for retry attempts when the NCBI server is unreachable.
@@ -3160,10 +3202,12 @@ def save_command_summary(
31603202 total_final_sequences ,
31613203 output_files ,
31623204 filtered_metadata ,
3205+ datasets_version ,
31633206 success = True ,
31643207 error_message = None ,
31653208 failed_commands = None ,
3166- genbank_error = None
3209+ genbank_error = None ,
3210+ gget_version = None
31673211):
31683212 """
31693213 Save a summary file documenting the command execution and results.
@@ -3172,6 +3216,10 @@ def save_command_summary(
31723216 output files, and any errors encountered.
31733217 """
31743218
3219+ # Get versions if not provided
3220+ if gget_version is None :
3221+ gget_version = _get_gget_version ()
3222+
31753223 summary_file = os .path .join (outfolder , "command_summary.txt" )
31763224
31773225 try :
@@ -3185,6 +3233,15 @@ def save_command_summary(
31853233 f .write (f"Execution Date: { datetime .now ().strftime ('%Y-%m-%d %H:%M:%S' )} \n " )
31863234 f .write (f"Output Folder: { outfolder } \n \n " )
31873235
3236+ # Version information
3237+ f .write ("-" * 80 + "\n " )
3238+ f .write ("SOFTWARE VERSIONS\n " )
3239+ f .write ("-" * 80 + "\n " )
3240+ f .write (f"gget version: { gget_version } \n " )
3241+ if datasets_version is not None :
3242+ f .write (f"{ datasets_version } \n " )
3243+ f .write ("\n " )
3244+
31883245 # Command line
31893246 f .write ("-" * 80 + "\n " )
31903247 f .write ("COMMAND LINE\n " )
@@ -5348,6 +5405,7 @@ def virus(
53485405 cached_metadata_dict = None
53495406 used_cached_download = False
53505407 cached_zip_file = None # Track zip file path for cleanup
5408+ datasets_version = None # Track datasets version for logging and summary
53515409
53525410 # For SARS-CoV-2 queries, use cached data packages with hierarchical fallback
53535411 if _skip_cache :
@@ -5375,6 +5433,7 @@ def virus(
53755433 applied_filters = download_result [1 ]
53765434 missing_filters = download_result [2 ]
53775435 cached_zip_file = zip_file # Track for cleanup
5436+ datasets_version = _get_datasets_version ()
53785437
53795438 cached_fasta_file , cached_metadata_dict , used_cached_download = process_cached_download (
53805439 zip_file , virus_type = "SARS-CoV-2"
@@ -5421,6 +5480,7 @@ def virus(
54215480 applied_filters = download_result [1 ]
54225481 missing_filters = download_result [2 ]
54235482 cached_zip_file = zip_file # Track for cleanup
5483+ datasets_version = _get_datasets_version ()
54245484
54255485 cached_fasta_file , cached_metadata_dict , used_cached_download = process_cached_download (
54265486 zip_file , virus_type = "Alphainfluenza"
@@ -5566,6 +5626,7 @@ def virus(
55665626 total_final_sequences = 0 ,
55675627 output_files = {},
55685628 filtered_metadata = [],
5629+ datasets_version = datasets_version ,
55695630 success = False ,
55705631 error_message = str (e ),
55715632 failed_commands = failed_commands ,
@@ -5584,6 +5645,7 @@ def virus(
55845645 total_final_sequences = 0 ,
55855646 output_files = {},
55865647 filtered_metadata = [],
5648+ datasets_version = datasets_version ,
55875649 success = True ,
55885650 error_message = "No virus records found matching the specified criteria (API returned 0 records)" ,
55895651 failed_commands = failed_commands
@@ -5663,6 +5725,7 @@ def virus(
56635725 total_final_sequences = 0 ,
56645726 output_files = output_files_dict ,
56655727 filtered_metadata = [],
5728+ datasets_version = datasets_version ,
56665729 success = True ,
56675730 error_message = "No sequences passed the metadata filters" ,
56685731 failed_commands = failed_commands
@@ -5913,6 +5976,7 @@ def virus(
59135976 total_final_sequences = total_final_sequences ,
59145977 output_files = output_files_dict ,
59155978 filtered_metadata = final_metadata_for_summary ,
5979+ datasets_version = datasets_version ,
59165980 success = True ,
59175981 error_message = None ,
59185982 failed_commands = failed_commands ,
@@ -5939,6 +6003,7 @@ def virus(
59396003 total_final_sequences = 0 ,
59406004 output_files = output_files_dict ,
59416005 filtered_metadata = [],
6006+ datasets_version = datasets_version ,
59426007 success = True ,
59436008 error_message = "No sequences passed all filters" ,
59446009 failed_commands = failed_commands
@@ -6060,6 +6125,7 @@ def virus(
60606125 total_final_sequences = total_final_sequences ,
60616126 output_files = output_files_dict ,
60626127 filtered_metadata = final_metadata_for_summary ,
6128+ datasets_version = datasets_version ,
60636129 success = False ,
60646130 error_message = str (e ),
60656131 failed_commands = failed_commands
0 commit comments