Split Snakefile into 2 branches #119

Status: Open · wants to merge 28 commits into base: main

Commits (28)
8b72d49
fix raxml/family constraint
domivika Dec 5, 2024
a1dabcc
Merge branch 'main' into dev
pecholleyn Dec 5, 2024
a5ba1e8
dynamic family
pecholleyn Dec 9, 2024
b4884e5
divide snakefile into two branches
domivika Jan 2, 2025
cafa572
fixed choose_exemplars
domivika Jan 3, 2025
269583f
remove outgroups
domivika Jan 6, 2025
8ba55e7
add missing arg in graft clades & wrap fasta seq
domivika Jan 6, 2025
55c4a74
fixes
pecholleyn Jan 7, 2025
d71566b
update run raxml
domivika Jan 7, 2025
a57a784
make run_raxml.py & update snakefile
domivika Jan 8, 2025
6a56b3f
fix run_raxml rule
domivika Jan 8, 2025
256fbc6
revert refactoring changes
domivika Jan 8, 2025
d6147b2
remove --threads flag
domivika Jan 13, 2025
294b8f4
fixes for taxa names + makeblast in python
domivika Jan 28, 2025
d1ed278
Update map_opentol.py
rvosa Feb 4, 2025
696bfa4
previous version included ottNone
rvosa Feb 4, 2025
4fb4bf6
Merge remote-tracking branch 'origin/choose_exemplars' into choose_ex…
rvosa Feb 4, 2025
7de8b7c
bug fixes
domivika Feb 17, 2025
6d44b3d
raxml fixes
domivika Feb 18, 2025
b2fb79e
update backbone scripts
domivika Feb 20, 2025
3585306
add run raxml backbone
domivika Feb 20, 2025
a45ba8b
use .phy file as input
domivika Feb 26, 2025
41175f3
add evaluation scripts
domivika Mar 5, 2025
7d1680f
filter bcdm script
domivika Mar 7, 2025
d2fd080
backbone constraint builder
rvosa Mar 7, 2025
5d812fb
also removes root unifurcation, now raxml is happy
rvosa Mar 7, 2025
42b717a
find and resolve polytomies
domivika Mar 10, 2025
b76e4a3
add polytomy scripts
domivika Mar 13, 2025
16 changes: 6 additions & 10 deletions config/config.yaml
@@ -4,15 +4,6 @@
# a reasonable value might be 4, or 8.
cpu_cores: 4

# Used for scatter/gather processing, corresponds with the number of families within the taxon defined in
# fasta_filter that have records for the specified marker. In practice, this means that the input BCDM TSV
# file has to have exactly this many distinct (not empty) values for the `family` column where the column
named under fasta_filter.filter_level has the value fasta_filter.filter_name (e.g. the `order` must be
`Odonata`). TODO: make this so that it is done automatically from the data. At present, this needs to be
# calculated by the user from the input file, e.g. by first making the sqlite database, or by grepping the
# TSV file somehow.
nfamilies: 63

# Number of outgroups to include in each family-level analysis. Minimum is 2.
outgroups: 3

@@ -46,8 +37,13 @@ datatype: NT
# Choose which records to use from the database for the pipeline. filter_name takes only one name, as does filter_level.
# filter levels: class, order, family, genus, all (no filter)
fasta_filter:
# max level: family
filter_level: kingdom
filter_name: Animalia
# Maximum number of sequences per FASTA file. If a family exceeds this limit, its FASTA files
# are generated at a lower rank instead, i.e. at genus level or, if a genus still has too many
# sequences, at species level.
max_fasta_seq: 200

name: phylogeny

@@ -61,6 +57,6 @@ file_names:
bold_tsv: resources/BOLD_Public.21-Jun-2024.curated.NL.tsv
open_tre: resources/opentree/opentree14.9_tree/labelled_supertree/labelled_supertree.tre
hmm: resources/hmm/COI-5P.hmm
fasta_dir: results/fasta/family
fasta_dir: results/fasta/
blast_dir: results/blast
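
The max_fasta_seq setting above drives the new rank-splitting behaviour. A minimal sketch of that logic, assuming records are dicts keyed by rank name; the helper split_group is hypothetical and not the pipeline's actual family_fasta implementation:

from collections import defaultdict

def split_group(records, ranks=("family", "genus", "species"), level=0, max_seq=200):
    """Group records at ranks[level]; recurse to the next rank for any
    group that exceeds max_seq, stopping at species regardless of size."""
    groups = defaultdict(list)
    for rec in records:
        groups[rec[ranks[level]]].append(rec)
    out = {}
    for name, recs in groups.items():
        if len(recs) > max_seq and level + 1 < len(ranks):
            out.update(split_group(recs, ranks, level + 1, max_seq))
        else:
            out[name] = recs  # one FASTA file per taxon name
    return out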

1 change: 0 additions & 1 deletion results/fasta/family/README.md

This file was deleted.

301 changes: 175 additions & 126 deletions workflow/Snakefile

Large diffs are not rendered by default.
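
Since the Snakefile diff is not rendered, here is a rough, hypothetical sketch of the two-branch layout implied by the commit messages (per-family trees on one branch, an exemplar backbone on the other, grafted together at the end). Rule names, paths, and the placeholder shell commands are illustrative only, not the PR's actual code:

FAMILIES = ["Aeshnidae", "Libellulidae"]  # illustrative; derived from the data in the real workflow

rule all:
    input:
        "results/grafted.tre"

# Branch 1: a constrained tree per family
rule run_raxml:
    input:
        "results/fasta/{family}/{family}.phy"
    output:
        "results/raxml/{family}.tre"
    shell:
        "touch {output}  # placeholder for the run_raxml.py call"

# Branch 2: a backbone tree from per-family exemplars
rule run_raxml_backbone:
    input:
        "results/backbone/exemplars.phy"
    output:
        "results/backbone/backbone.tre"
    shell:
        "touch {output}  # placeholder for the backbone RAxML call"

# The two branches converge: family subtrees are grafted onto the backbone
rule graft_clades:
    input:
        backbone="results/backbone/backbone.tre",
        clades=expand("results/raxml/{family}.tre", family=FAMILIES)
    output:
        "results/grafted.tre"
    shell:
        "touch {output}  # placeholder for the graft_clades.py call"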

5 changes: 4 additions & 1 deletion workflow/envs/blast.txt
@@ -1 +1,4 @@
biopython
os
subprocess
argparse
re
5 changes: 3 additions & 2 deletions workflow/envs/choose_exemplars.txt
@@ -1,3 +1,4 @@
db-sqlite3
biopython
dendropy
argparse
dendropy
statistics
4 changes: 3 additions & 1 deletion workflow/envs/create_database.txt
@@ -1 +1,3 @@
db-sqlite3
util
argparse
subprocess
5 changes: 3 additions & 2 deletions workflow/envs/family_constraint.txt
@@ -1,2 +1,3 @@
requests
dendropy
argparse
util
opentol
1 change: 1 addition & 0 deletions workflow/envs/family_constraint.yml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- sqlite
- pip
- pip:
- -r ../../workflow/envs/family_constraint.txt
7 changes: 5 additions & 2 deletions workflow/envs/family_fasta.txt
@@ -1,2 +1,5 @@
db-sqlite3
pandas
pandas
argparse
util
re
os
1 change: 1 addition & 0 deletions workflow/envs/family_fasta.yml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- sqlite
- pip
- pip:
- -r ../../workflow/envs/family_fasta.txt
4 changes: 3 additions & 1 deletion workflow/envs/graft_clades.txt
@@ -1,2 +1,4 @@
argparse
dendropy
biopython
os
util
10 changes: 6 additions & 4 deletions workflow/envs/map_opentol.txt
@@ -1,4 +1,6 @@
db-sqlite3
pandas
numpy
requests
biopython
logging
tempfile
argparse
os
util
4 changes: 3 additions & 1 deletion workflow/envs/prep_raxml.txt
@@ -1,2 +1,4 @@
argparse
os
biopython
db-sqlite3
util
3 changes: 2 additions & 1 deletion workflow/envs/prep_raxml.yml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
- sqlite
- pip
- pip:
- -r ../../workflow/envs/prep_raxml.txt
- -r ../../workflow/envs/prep_raxml.txt
10 changes: 6 additions & 4 deletions workflow/envs/prep_raxml_backbone.txt
@@ -1,4 +1,6 @@
db-sqlite3
biopython
requests
dendropy
argparse
subprocess
util
opentol
dendropy
biopython
4 changes: 3 additions & 1 deletion workflow/envs/reroot_backbone.txt
@@ -1 +1,3 @@
dendropy
argparse
dendropy
util
172 changes: 172 additions & 0 deletions workflow/scripts/check_reverse_sequences.py
@@ -0,0 +1,172 @@
#!/usr/bin/env python3

import argparse
import tempfile
import os
import logging
import sys
import concurrent.futures
from functools import partial
from copy import deepcopy
from Bio import SeqIO
from Bio.AlignIO import read as read_alignment
from subprocess import run
from tqdm import tqdm  # progress bars


def setup_logger(name, level_str):
    """Set up and return a logger with the specified name and level"""
    level = getattr(logging, level_str.upper())
    logger = logging.getLogger(name)
    logger.setLevel(level)
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger

def align_score(record, hmmfile, logger):
    """
    Uses a Hidden Markov Model to align a sequence using hmmalign.
    Returns the score based on posterior probabilities.
    """
    try:
        # Make a copy of the record to avoid modifying the original
        clean_record = deepcopy(record)

        # Remove dashes from the sequence
        clean_record.seq = clean_record.seq.replace('-', '')

        # Use unique prefix for temp files to ensure thread safety
        prefix = f"seq_{record.id.replace('|', '_')}_"

        # Open temporary files for the sequence and alignment
        with tempfile.NamedTemporaryFile(mode='w+', prefix=prefix) as temp_fasta, \
                tempfile.NamedTemporaryFile(mode='w+', prefix=prefix) as temp_stockholm:

            # Save the cleaned sequence to the temporary file
            SeqIO.write(clean_record, temp_fasta.name, 'fasta')

            # Run hmmalign, read the aligned sequence
            run(['hmmalign', '--trim', '-o', temp_stockholm.name, hmmfile, temp_fasta.name], check=True)
            alignment = read_alignment(temp_stockholm.name, "stockholm")

            # Get posterior probability string
            quality_string = alignment.column_annotations.get('posterior_probability', '')

            count = 0
            # Count . and * characters
            dot_count = quality_string.count('.')
            star_count = quality_string.count('*')

            # Give value 0 to . and value 10 to *
            count += star_count * 10

            # Add all numbers
            digit_sum = sum(int(char) for char in quality_string if char.isdigit())
            count += digit_sum

            # Calculate average count
            average_count = count / len(quality_string) if quality_string else 0
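            # Worked example: for the posterior string '89*.7' the count is
            # 10 (one '*') + 8 + 9 + 7 = 34, averaged over 5 columns = 6.8;
            # '.' columns contribute 0 but still count toward the length.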
            return average_count, alignment[0]

    except Exception as e:
        logger.error(f"Error in align_score for {record.id}: {e}")
        return 0, None

def process_sequence(record, hmmfile, logger):
    """
    Process a single sequence, testing both orientations.
    Returns the record in the correct orientation and whether it was reversed.
    """
    # Score original orientation
    count_fwd, alignment_fwd = align_score(record, hmmfile, logger)
    logger.debug(f'Forward alignment score {count_fwd} for {record.id}')

    # Make a copy for reverse complement
    rev_record = deepcopy(record)
    rev_record.seq = rev_record.seq.reverse_complement()

    # Score reverse orientation
    count_rev, alignment_rev = align_score(rev_record, hmmfile, logger)
    logger.debug(f'Reverse alignment score {count_rev} for {record.id}')

    # Keep the orientation with higher score
    if count_fwd >= count_rev:
        logger.debug(f'Keeping forward orientation for {record.id}')
        return record, False
    else:
        logger.debug(f'Keeping reverse orientation for {record.id}')
        return rev_record, True

def correct_revcom(hmmfile, sequences, logger, threads=4):
    """
    Check each sequence with hmmalign in both orientations
    and keep the orientation with the higher score.
    Multi-threaded version with progress reporting.
    """
    corrected_seqs = []
    reversed_count = 0

    # Create a partial function for processing sequences
    process_func = partial(process_sequence, hmmfile=hmmfile, logger=logger)

    # Process sequences in parallel with progress bar
    logger.info(f"Starting parallel processing with {threads} threads")

    # Using ThreadPoolExecutor with tqdm for progress tracking
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # Submit all tasks and get futures
        futures = [executor.submit(process_func, seq) for seq in sequences]

        # Process results as they complete with progress bar
        for i, future in enumerate(tqdm(concurrent.futures.as_completed(futures),
                                        total=len(futures),
                                        desc="Processing sequences")):
            record, is_reversed = future.result()
            corrected_seqs.append(record)
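            # Note that results arrive in completion order, so the output
            # FASTA does not necessarily preserve the input sequence order.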
            if is_reversed:
                reversed_count += 1

            # Log progress periodically (e.g., every 1000 sequences)
            if (i + 1) % 1000 == 0 or i == 0:
                logger.info(f"Processed {i + 1}/{len(sequences)} sequences, {reversed_count} reversed so far")

    logger.info(f'Corrected {reversed_count} reverse complemented sequences out of {len(sequences)}')
    return corrected_seqs

def main():
    parser = argparse.ArgumentParser(description='Check and correct reverse complemented sequences using HMM')
    parser.add_argument('fasta', help='Input FASTA file')
    parser.add_argument('hmm', help='HMM model file')
    parser.add_argument('-o', '--output', help='Output FASTA file (default: corrected_output.fa)',
                        default="corrected_output.fa")
    parser.add_argument('-v', '--verbosity', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
                        help='Log level (default: INFO)')
    parser.add_argument('-t', '--threads', type=int, default=4,
                        help='Number of threads to use (default: 4)')
    parser.add_argument('-l', '--log', help='Log file (optional, if not specified logs to stdout)')

    args = parser.parse_args()

    # Set up logging to file if specified
    logger = setup_logger('check_reverse', args.verbosity)
    if args.log:
        file_handler = logging.FileHandler(args.log)
        file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(file_handler)

    # Read sequences
    sequences = list(SeqIO.parse(args.fasta, "fasta"))
    logger.info(f"Read {len(sequences)} sequences from {args.fasta}")

    # Correct sequences with multiple threads
    corrected_sequences = correct_revcom(args.hmm, sequences, logger, args.threads)

    # Write corrected sequences
    SeqIO.write(corrected_sequences, args.output, "fasta")
    logger.info(f"Wrote {len(corrected_sequences)} corrected sequences to {args.output}")

if __name__ == "__main__":
    main()
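
For reference, a typical invocation of the new script (requires hmmalign from HMMER on the PATH; the input and output FASTA paths below are illustrative, while the HMM path matches file_names.hmm in the config):

python workflow/scripts/check_reverse_sequences.py results/fasta/Aeshnidae.fa resources/hmm/COI-5P.hmm -o results/fasta/Aeshnidae_corrected.fa -t 8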