Skip to content

Commit f0f8995

Browse files
committed
add vaccine strain filtering functionality
1 parent b76dec5 commit f0f8995

File tree

1 file changed

+34
-6
lines changed

1 file changed

+34
-6
lines changed

gget/gget_virus.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
GENBANK_MAX_BATCH_SIZE_WARNING = 500 # Warn user if batch size exceeds this
5050
GENBANK_RETRY_ATTEMPTS = 5 # Number of retry attempts for GenBank requests
5151
GENBANK_XML_CHUNK_SIZE = 10000 # Rows to process before writing to CSV
52+
GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
5253

5354
# Subprocess and Download Configuration
5455
SUBPROCESS_VERSION_TIMEOUT = 5 # Timeout for version check commands
@@ -1620,6 +1621,9 @@ def process_cached_download(zip_file, virus_type="virus"):
16201621
# Extract segment
16211622
metadata['segment'] = report.get('segment')
16221623

1624+
# Extract vaccine strain flag
1625+
metadata['is_vaccine_strain'] = report.get('is_vaccine_strain', False)
1626+
16231627
cached_metadata_dict[accession] = metadata
16241628

16251629
logger.info("Loaded %d metadata records from %s", len(cached_metadata_dict), metadata_file)
@@ -2689,6 +2693,7 @@ def load_metadata_from_api_reports(api_reports):
26892693
"proteinCount": report.get("protein_count"), # Number of proteins
26902694
"maturePeptideCount": report.get("mature_peptide_count"), # Number of mature peptides
26912695
"segment": report.get("segment"), # Virus segment identifier (e.g., 'HA', 'NA', 'PB1')
2696+
"isVaccineStrain": report.get("is_vaccine_strain", False), # Whether this is a vaccine strain
26922697
}
26932698

26942699
# Store the metadata using accession as the key
@@ -3395,6 +3400,7 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil
33953400
"Nuc Completeness", # Completeness status (complete/partial)
33963401
"Proteins/Segments", # Protein/segment information from FASTA headers
33973402
"Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
3403+
"Is Vaccine Strain", # Whether this sequence is from a vaccine strain
33983404
"Geographic Region", # Geographic region where sample was collected
33993405
"Geographic Location",# Specific geographic location
34003406
"Host", # Host organism name
@@ -3459,7 +3465,8 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil
34593465
"Length": metadata.get("length", pd.NA),
34603466
"Nuc Completeness": metadata.get("completeness", pd.NA),
34613467
"Proteins/Segments": protein_headers[i] if i < len(protein_headers) else pd.NA,
3462-
"Segment": metadata.get("segment", pd.NA), # Virus segment identifier
3468+
"Segment": metadata.get("segment", pd.NA),
3469+
"Is Vaccine Strain": metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", pd.NA)),
34633470

34643471
# Geographic information
34653472
"Geographic Region": metadata.get("region", pd.NA),
@@ -3715,7 +3722,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
37153722
'id': accession_string, # Comma-separated accession numbers
37163723
'rettype': 'gb', # GenBank format
37173724
'retmode': 'xml', # XML output for structured parsing
3718-
'complexity': 0, # Requesting the entire blob of data content to be returned
3725+
'complexity': GENBANK_COMPLEXITY,
37193726
}
37203727

37213728
# Create a requests.Session with urllib3 Retry/HTTPAdapter for robust retries
@@ -4233,6 +4240,7 @@ def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=N
42334240
"Nuc Completeness", # Completeness status (complete/partial)
42344241
"Proteins/Segments", # Protein/segment information from FASTA headers
42354242
"Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
4243+
"Is Vaccine Strain", # Whether this sequence is from a vaccine strain
42364244
"Geographic Region", # Geographic region where sample was collected
42374245
"Geographic Location",# Specific geographic location
42384246
"Host", # Host organism name
@@ -4304,6 +4312,7 @@ def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=N
43044312
"Nuc Completeness": metadata.get('completeness', pd.NA),
43054313
"Proteins/Segments": pd.NA, # Not available from GenBank XML parsing
43064314
"Segment": metadata.get('segment', pd.NA), # Virus segment identifier
4315+
"Is Vaccine Strain": metadata.get('isVaccineStrain', metadata.get('is_vaccine_strain', pd.NA)),
43074316

43084317
# Geographic information
43094318
"Geographic Region": metadata.get('region', pd.NA),
@@ -4575,6 +4584,7 @@ def filter_metadata_only(
45754584
max_protein_count=None,
45764585
annotated=None,
45774586
segment=None,
4587+
is_vaccine_strain=False,
45784588
):
45794589
"""
45804590
Filter metadata records based on metadata-only criteria.
@@ -4595,12 +4605,12 @@ def filter_metadata_only(
45954605
logger.debug("Applying metadata-only filters: seq_length(%s-%s), gene_count(%s-%s), "
45964606
"completeness(%s), lab_passaged(%s), annotated(%s), "
45974607
"submitter_country(%s), collection_date(%s-%s), max_release_date(%s), "
4598-
"peptide_count(%s-%s), protein_count(%s-%s), segment(%s)",
4608+
"peptide_count(%s-%s), protein_count(%s-%s), segment(%s), is_vaccine_strain(%s)",
45994609
min_seq_length, max_seq_length, min_gene_count, max_gene_count,
46004610
nuc_completeness, lab_passaged, annotated,
46014611
submitter_country, min_collection_date, max_collection_date, max_release_date,
46024612
min_mature_peptide_count, max_mature_peptide_count,
4603-
min_protein_count, max_protein_count, segment)
4613+
min_protein_count, max_protein_count, segment, is_vaccine_strain)
46044614

46054615
# Convert date filters to datetime objects for proper comparison
46064616
min_collection_date = (
@@ -4641,6 +4651,7 @@ def filter_metadata_only(
46414651
'mature_peptide_count': 0,
46424652
'protein_count': 0,
46434653
'segment': 0,
4654+
'is_vaccine_strain': 0,
46444655
}
46454656

46464657
logger.info("Processing %d metadata records...", total_sequences)
@@ -4855,6 +4866,14 @@ def filter_metadata_only(
48554866
filter_stats['segment'] += 1
48564867
continue
48574868

4869+
# FILTER 12: Vaccine strain filter
4870+
if is_vaccine_strain:
4871+
is_vaccine = metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", False))
4872+
if not is_vaccine:
4873+
logger.debug("Skipping %s: not a vaccine strain", accession)
4874+
filter_stats['is_vaccine_strain'] += 1
4875+
continue
4876+
48584877
# If we reach this point, the metadata record has passed all filters
48594878
filtered_accessions.append(accession)
48604879
filtered_metadata_list.append(metadata)
@@ -4923,6 +4942,7 @@ def virus(
49234942
is_sars_cov2=False,
49244943
is_alphainfluenza=False,
49254944
segment=None,
4945+
is_vaccine_strain=False,
49264946
lineage=None,
49274947
genbank_metadata=False,
49284948
genbank_batch_size=200,
@@ -4968,6 +4988,8 @@ def virus(
49684988
max_ambiguous_chars (int): Maximum ambiguous nucleotide character filter
49694989
is_sars_cov2 (bool): Flag to indicate if the accession is for SARS-CoV-2, enabling optimized download method (default: False)
49704990
is_alphainfluenza (bool): Flag to indicate if the query is for Alphainfluenza, enabling optimized download method (default: False)
4991+
segment (str/list): Virus segment filter (e.g., 'HA', 'NA', or 'HA,NA')
4992+
is_vaccine_strain (bool): Filter for vaccine strains only. If True, only sequences marked as vaccine strains will be returned (default: False)
49714993
lineage (str): Virus lineage filter (SARS-CoV-2 specific)
49724994
genbank_metadata (bool): Whether to fetch detailed GenBank metadata (default: False)
49734995
genbank_batch_size (int): Batch size for GenBank API requests (default: 200)
@@ -5021,8 +5043,8 @@ def virus(
50215043

50225044
logger.info("Query parameters: virus='%s', is_accession=%s, outfolder='%s'",
50235045
virus, is_accession, outfolder)
5024-
logger.debug("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, refseq_only=%s, keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, genbank_metadata=%s, genbank_batch_size=%s",
5025-
host, min_seq_length, max_seq_length, min_gene_count, max_gene_count, nuc_completeness, annotated, refseq_only, keep_temp, lab_passaged, geographic_location, submitter_country, min_collection_date, max_collection_date, min_release_date, max_release_date, min_protein_count, max_protein_count, min_mature_peptide_count, max_mature_peptide_count, max_ambiguous_chars, has_proteins, proteins_complete, genbank_metadata, genbank_batch_size)
5046+
logger.debug("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, refseq_only=%s, keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, is_vaccine_strain=%s, genbank_metadata=%s, genbank_batch_size=%s",
5047+
host, min_seq_length, max_seq_length, min_gene_count, max_gene_count, nuc_completeness, annotated, refseq_only, keep_temp, lab_passaged, geographic_location, submitter_country, min_collection_date, max_collection_date, min_release_date, max_release_date, min_protein_count, max_protein_count, min_mature_peptide_count, max_mature_peptide_count, max_ambiguous_chars, has_proteins, proteins_complete, segment, is_vaccine_strain, genbank_metadata, genbank_batch_size)
50265048

50275049
# SECTION 1: INPUT VALIDATION AND OUTPUT DIRECTORY SETUP
50285050
# Validate and normalize input arguments before proceeding
@@ -5081,6 +5103,11 @@ def virus(
50815103
raise TypeError(
50825104
"Argument 'genbank_metadata' must be a boolean (True or False)."
50835105
)
5106+
5107+
if is_vaccine_strain is not None and not isinstance(is_vaccine_strain, bool):
5108+
raise TypeError(
5109+
"Argument 'is_vaccine_strain' must be a boolean (True or False)."
5110+
)
50845111

50855112
if genbank_batch_size is not None:
50865113
if not isinstance(genbank_batch_size, int) or genbank_batch_size <= 0:
@@ -5455,6 +5482,7 @@ def virus(
54555482
# annotated=False needs client-side filtering (annotated=True is handled server-side)
54565483
"annotated": annotated if annotated is False else None,
54575484
"segment": segment,
5485+
"is_vaccine_strain": is_vaccine_strain,
54585486
}
54595487

54605488
all_metadata_filters_none_except_nuc = all(

0 commit comments

Comments
 (0)