diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1c628998..6962547c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -97,6 +97,7 @@ jobs: run: pytest --full --cov=ppanggolin --cov-report=term --cpu $NUM_CPUS - name: Archive diff files + if: always() uses: actions/upload-artifact@v4 with: name: comparison-results_${{ matrix.os }}_python${{ matrix.python-version }} diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index bde573d7..44353090 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -8,7 +8,7 @@ from multiprocessing import Value from subprocess import Popen, PIPE import ast -from collections import defaultdict +from collections import defaultdict, Counter from typing import Dict, List, Optional, Union, Generator, Tuple from pathlib import Path import shutil @@ -155,6 +155,7 @@ def launch_prodigal( closed=True, # -c: Closed ends. Do not allow genes to run off edges. mask=True, # -m: Treat runs of N as masked sequence; don't build genes across them. min_gene=120, # This is to prevent error with mmseqs translatenucs that cut too short sequences + min_mask=9, # Minimum length of masked sequence to trigger masking (default 50). ) if not use_meta: @@ -162,18 +163,34 @@ def launch_prodigal( *contig_sequences.values(), force_nonsd=False, translation_table=code ) # -g: Specify a translation table to use (default 11). gene_counter = 1 + + genetic_code_count = Counter() + for contig_name, sequence in sequences.items(): - for pred in gene_finder.find_genes(sequence): + + genes = gene_finder.find_genes(sequence) + translation_table = genes.training_info.translation_table + genetic_code_count[translation_table] += 1 + + for pred in genes: gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}") gene.fill_annotations( start=pred.begin, stop=pred.end, strand="-" if pred.strand == -1 else "+", gene_type="CDS", - genetic_code=code, + genetic_code=translation_table, ) gene_counter += 1 gene_objs[contig_name].add(gene) + + if len(genetic_code_count) > 1: + logging.getLogger("PPanGGOLiN").warning( + f"Pyrodigal used multiple translation tables for genome '{org.name}' " + f"using mode {'meta' if use_meta else 'single'}, which is unexpected. " + f"A genome is expected to have a single genetic code. " + f"Translation table distribution across contigs: {dict(genetic_code_count)}" + ) return gene_objs @@ -541,18 +558,19 @@ def annotate_organism( contig_sequences = get_contigs_from_fasta_file(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) + if procedure is None: # prodigal procedure is not force by user - max_contig_len = max(len(contig) for contig in org.contigs) - if max_contig_len < 20000: # case of short sequence + sum_contig_len = sum(len(contig) for contig in org.contigs) + if sum_contig_len < 20000: # case of short sequence use_meta = True logging.getLogger("PPanGGOLiN").info( - f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size." + f"Using metagenomic mode for {org_name} (total contig length: {sum_contig_len}bp < 20kb threshold)" ) - else: use_meta = False else: use_meta = True if procedure == "meta" else False + genes = syntaxic_annotation( org, fasta_file, contig_sequences, tmpdir, norna, kingdom, code, use_meta ) diff --git a/testingDataset/expected_info_files/expected_hashes.json b/testingDataset/expected_info_files/expected_hashes.json index 947b0b11..aed930b0 100644 --- a/testingDataset/expected_info_files/expected_hashes.json +++ b/testingDataset/expected_info_files/expected_hashes.json @@ -1,6 +1,6 @@ { - "mybasicpangenome/gene_families.tsv": "9d6219523e890b08c467c936590b37436e915840a797ea161c9707041c3b821c", - "stepbystep/gene_families.tsv": "9d6219523e890b08c467c936590b37436e915840a797ea161c9707041c3b821c", + "mybasicpangenome/gene_families.tsv": "ce592d62c3bdb31747fb48d6dca2037f1d46489505f3e90fe52060cf8937e26f", + "stepbystep/gene_families.tsv": "ce592d62c3bdb31747fb48d6dca2037f1d46489505f3e90fe52060cf8937e26f", "myannopang/gene_families.tsv": "5cb88afbc1a49e7d21531d36d5ee3471a87834f4bb82578347dc92231ba0a452", "config_pangenome/gene_families.tsv": "d9698f054a14d8fcea35542c7d8338ea73f6d128bd287adfa01e99be40c80dc4", "readclusters/gene_families.tsv": "d9698f054a14d8fcea35542c7d8338ea73f6d128bd287adfa01e99be40c80dc4", diff --git a/testingDataset/expected_info_files/mybasicpangenome_info.yaml b/testingDataset/expected_info_files/mybasicpangenome_info.yaml index 55a69775..215719e2 100644 --- a/testingDataset/expected_info_files/mybasicpangenome_info.yaml +++ b/testingDataset/expected_info_files/mybasicpangenome_info.yaml @@ -11,30 +11,30 @@ Status: PPanGGOLiN_Version: 2.2.6 Content: - Genes: 45429 + Genes: 45423 Genomes: 51 - Families: 1010 - Edges: 1075 + Families: 1004 + Edges: 1070 Persistent: Family_count: 874 - min_genomes_frequency: 0.86 + min_genomes_frequency: 0.84 max_genomes_frequency: 1.0 sd_genomes_frequency: 0.01 mean_genomes_frequency: 0.99 Shell: - Family_count: 34 + Family_count: 35 min_genomes_frequency: 0.22 max_genomes_frequency: 0.78 - sd_genomes_frequency: 0.15 - mean_genomes_frequency: 0.46 + sd_genomes_frequency: 0.16 + mean_genomes_frequency: 0.45 Cloud: - Family_count: 102 + Family_count: 95 min_genomes_frequency: 0.02 max_genomes_frequency: 0.22 sd_genomes_frequency: 0.05 mean_genomes_frequency: 0.05 Number_of_partitions: 3 - RGP: 45 + RGP: 46 Spots: 2 Modules: Number_of_modules: 4 diff --git a/testingDataset/expected_info_files/stepbystep_info.yaml b/testingDataset/expected_info_files/stepbystep_info.yaml index 5e32ab8d..23cff5c4 100644 --- a/testingDataset/expected_info_files/stepbystep_info.yaml +++ b/testingDataset/expected_info_files/stepbystep_info.yaml @@ -11,13 +11,13 @@ Status: PPanGGOLiN_Version: 2.2.6 Content: - Genes: 45429 + Genes: 45423 Genomes: 51 - Families: 1010 - Edges: 1075 + Families: 1004 + Edges: 1070 Persistent: Family_count: 873 - min_genomes_frequency: 0.86 + min_genomes_frequency: 0.84 max_genomes_frequency: 1.0 sd_genomes_frequency: 0.01 mean_genomes_frequency: 0.99 @@ -28,7 +28,7 @@ Content: sd_genomes_frequency: 0.18 mean_genomes_frequency: 0.45 Cloud: - Family_count: 99 + Family_count: 93 min_genomes_frequency: 0.02 max_genomes_frequency: 0.22 sd_genomes_frequency: 0.05 @@ -36,9 +36,9 @@ Content: Number_of_partitions: 3 Genomes_fluidity: all: 0.025 - shell: 0.553 - cloud: 0.039 - accessory: 0.624 + shell: 0.554 + cloud: 0.041 + accessory: 0.623 RGP: 66 Spots: 2 Modules: