4949GENBANK_MAX_BATCH_SIZE_WARNING = 500 # Warn user if batch size exceeds this
5050GENBANK_RETRY_ATTEMPTS = 5 # Number of retry attempts for GenBank requests
5151GENBANK_XML_CHUNK_SIZE = 10000 # Rows to process before writing to CSV
52+ GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
5253
5354# Subprocess and Download Configuration
5455SUBPROCESS_VERSION_TIMEOUT = 5 # Timeout for version check commands
@@ -1620,6 +1621,9 @@ def process_cached_download(zip_file, virus_type="virus"):
16201621 # Extract segment
16211622 metadata ['segment' ] = report .get ('segment' )
16221623
1624+ # Extract vaccine strain flag
1625+ metadata ['is_vaccine_strain' ] = report .get ('is_vaccine_strain' , False )
1626+
16231627 cached_metadata_dict [accession ] = metadata
16241628
16251629 logger .info ("Loaded %d metadata records from %s" , len (cached_metadata_dict ), metadata_file )
@@ -2689,6 +2693,7 @@ def load_metadata_from_api_reports(api_reports):
26892693 "proteinCount" : report .get ("protein_count" ), # Number of proteins
26902694 "maturePeptideCount" : report .get ("mature_peptide_count" ), # Number of mature peptides
26912695 "segment" : report .get ("segment" ), # Virus segment identifier (e.g., 'HA', 'NA', 'PB1')
2696+ "isVaccineStrain" : report .get ("is_vaccine_strain" , False ), # Whether this is a vaccine strain
26922697 }
26932698
26942699 # Store the metadata using accession as the key
@@ -3395,6 +3400,7 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil
33953400 "Nuc Completeness" , # Completeness status (complete/partial)
33963401 "Proteins/Segments" , # Protein/segment information from FASTA headers
33973402 "Segment" , # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
3403+ "Is Vaccine Strain" , # Whether this sequence is from a vaccine strain
33983404 "Geographic Region" , # Geographic region where sample was collected
33993405 "Geographic Location" ,# Specific geographic location
34003406 "Host" , # Host organism name
@@ -3459,7 +3465,8 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil
34593465 "Length" : metadata .get ("length" , pd .NA ),
34603466 "Nuc Completeness" : metadata .get ("completeness" , pd .NA ),
34613467 "Proteins/Segments" : protein_headers [i ] if i < len (protein_headers ) else pd .NA ,
3462- "Segment" : metadata .get ("segment" , pd .NA ), # Virus segment identifier
3468+ "Segment" : metadata .get ("segment" , pd .NA ),
3469+ "Is Vaccine Strain" : metadata .get ("isVaccineStrain" , metadata .get ("is_vaccine_strain" , pd .NA )),
34633470
34643471 # Geographic information
34653472 "Geographic Region" : metadata .get ("region" , pd .NA ),
@@ -3715,7 +3722,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
37153722 'id' : accession_string , # Comma-separated accession numbers
37163723 'rettype' : 'gb' , # GenBank format
37173724 'retmode' : 'xml' , # XML output for structured parsing
3718- 'complexity' : 0 , # Requesting the entire blob of data content to be returned
3725+ 'complexity' : GENBANK_COMPLEXITY ,
37193726 }
37203727
37213728 # Create a requests.Session with urllib3 Retry/HTTPAdapter for robust retries
@@ -4233,6 +4240,7 @@ def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=N
42334240 "Nuc Completeness" , # Completeness status (complete/partial)
42344241 "Proteins/Segments" , # Protein/segment information from FASTA headers
42354242 "Segment" , # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
4243+ "Is Vaccine Strain" , # Whether this sequence is from a vaccine strain
42364244 "Geographic Region" , # Geographic region where sample was collected
42374245 "Geographic Location" ,# Specific geographic location
42384246 "Host" , # Host organism name
@@ -4304,6 +4312,7 @@ def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=N
43044312 "Nuc Completeness" : metadata .get ('completeness' , pd .NA ),
43054313 "Proteins/Segments" : pd .NA , # Not available from GenBank XML parsing
43064314 "Segment" : metadata .get ('segment' , pd .NA ), # Virus segment identifier
4315+ "Is Vaccine Strain" : metadata .get ('isVaccineStrain' , metadata .get ('is_vaccine_strain' , pd .NA )),
43074316
43084317 # Geographic information
43094318 "Geographic Region" : metadata .get ('region' , pd .NA ),
@@ -4575,6 +4584,7 @@ def filter_metadata_only(
45754584 max_protein_count = None ,
45764585 annotated = None ,
45774586 segment = None ,
4587+ is_vaccine_strain = False ,
45784588):
45794589 """
45804590 Filter metadata records based on metadata-only criteria.
@@ -4595,12 +4605,12 @@ def filter_metadata_only(
45954605 logger .debug ("Applying metadata-only filters: seq_length(%s-%s), gene_count(%s-%s), "
45964606 "completeness(%s), lab_passaged(%s), annotated(%s), "
45974607 "submitter_country(%s), collection_date(%s-%s), max_release_date(%s), "
4598- "peptide_count(%s-%s), protein_count(%s-%s), segment(%s)" ,
4608+ "peptide_count(%s-%s), protein_count(%s-%s), segment(%s), is_vaccine_strain(%s) " ,
45994609 min_seq_length , max_seq_length , min_gene_count , max_gene_count ,
46004610 nuc_completeness , lab_passaged , annotated ,
46014611 submitter_country , min_collection_date , max_collection_date , max_release_date ,
46024612 min_mature_peptide_count , max_mature_peptide_count ,
4603- min_protein_count , max_protein_count , segment )
4613+ min_protein_count , max_protein_count , segment , is_vaccine_strain )
46044614
46054615 # Convert date filters to datetime objects for proper comparison
46064616 min_collection_date = (
@@ -4641,6 +4651,7 @@ def filter_metadata_only(
46414651 'mature_peptide_count' : 0 ,
46424652 'protein_count' : 0 ,
46434653 'segment' : 0 ,
4654+ 'is_vaccine_strain' : 0 ,
46444655 }
46454656
46464657 logger .info ("Processing %d metadata records..." , total_sequences )
@@ -4855,6 +4866,14 @@ def filter_metadata_only(
48554866 filter_stats ['segment' ] += 1
48564867 continue
48574868
4869+ # FILTER 12: Vaccine strain filter
4870+ if is_vaccine_strain :
4871+ is_vaccine = metadata .get ("isVaccineStrain" , metadata .get ("is_vaccine_strain" , False ))
4872+ if not is_vaccine :
4873+ logger .debug ("Skipping %s: not a vaccine strain" , accession )
4874+ filter_stats ['is_vaccine_strain' ] += 1
4875+ continue
4876+
48584877 # If we reach this point, the metadata record has passed all filters
48594878 filtered_accessions .append (accession )
48604879 filtered_metadata_list .append (metadata )
@@ -4923,6 +4942,7 @@ def virus(
49234942 is_sars_cov2 = False ,
49244943 is_alphainfluenza = False ,
49254944 segment = None ,
4945+ is_vaccine_strain = False ,
49264946 lineage = None ,
49274947 genbank_metadata = False ,
49284948 genbank_batch_size = 200 ,
@@ -4968,6 +4988,8 @@ def virus(
49684988 max_ambiguous_chars (int): Maximum ambiguous nucleotide character filter
49694989 is_sars_cov2 (bool): Flag to indicate if the accession is for SARS-CoV-2, enabling optimized download method (default: False)
49704990 is_alphainfluenza (bool): Flag to indicate if the query is for Alphainfluenza, enabling optimized download method (default: False)
4991+ segment (str/list): Virus segment filter (e.g., 'HA', 'NA', or 'HA,NA')
4992+ is_vaccine_strain (bool): Filter for vaccine strains only. If True, only sequences marked as vaccine strains will be returned (default: False)
49714993 lineage (str): Virus lineage filter (SARS-CoV-2 specific)
49724994 genbank_metadata (bool): Whether to fetch detailed GenBank metadata (default: False)
49734995 genbank_batch_size (int): Batch size for GenBank API requests (default: 200)
@@ -5021,8 +5043,8 @@ def virus(
50215043
50225044 logger .info ("Query parameters: virus='%s', is_accession=%s, outfolder='%s'" ,
50235045 virus , is_accession , outfolder )
5024- logger .debug ("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, refseq_only=%s, keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, genbank_metadata=%s, genbank_batch_size=%s" ,
5025- host , min_seq_length , max_seq_length , min_gene_count , max_gene_count , nuc_completeness , annotated , refseq_only , keep_temp , lab_passaged , geographic_location , submitter_country , min_collection_date , max_collection_date , min_release_date , max_release_date , min_protein_count , max_protein_count , min_mature_peptide_count , max_mature_peptide_count , max_ambiguous_chars , has_proteins , proteins_complete , genbank_metadata , genbank_batch_size )
5046+ logger .debug ("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, refseq_only=%s, keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, is_vaccine_strain=%s, genbank_metadata=%s, genbank_batch_size=%s" ,
5047+ host , min_seq_length , max_seq_length , min_gene_count , max_gene_count , nuc_completeness , annotated , refseq_only , keep_temp , lab_passaged , geographic_location , submitter_country , min_collection_date , max_collection_date , min_release_date , max_release_date , min_protein_count , max_protein_count , min_mature_peptide_count , max_mature_peptide_count , max_ambiguous_chars , has_proteins , proteins_complete , segment , is_vaccine_strain , genbank_metadata , genbank_batch_size )
50265048
50275049 # SECTION 1: INPUT VALIDATION AND OUTPUT DIRECTORY SETUP
50285050 # Validate and normalize input arguments before proceeding
@@ -5081,6 +5103,11 @@ def virus(
50815103 raise TypeError (
50825104 "Argument 'genbank_metadata' must be a boolean (True or False)."
50835105 )
5106+
5107+ if is_vaccine_strain is not None and not isinstance (is_vaccine_strain , bool ):
5108+ raise TypeError (
5109+ "Argument 'is_vaccine_strain' must be a boolean (True or False)."
5110+ )
50845111
50855112 if genbank_batch_size is not None :
50865113 if not isinstance (genbank_batch_size , int ) or genbank_batch_size <= 0 :
@@ -5455,6 +5482,7 @@ def virus(
54555482 # annotated=False needs client-side filtering (annotated=True is handled server-side)
54565483 "annotated" : annotated if annotated is False else None ,
54575484 "segment" : segment ,
5485+ "is_vaccine_strain" : is_vaccine_strain ,
54585486 }
54595487
54605488 all_metadata_filters_none_except_nuc = all (
0 commit comments