|
| 1 | +import pytest |
| 2 | +import os |
| 3 | +import microbiorust |
| 4 | +import textwrap |
| 5 | +from pathlib import Path |
| 6 | + |
| 7 | +# --- FIXTURES: Generating Mock Data --- |
| 8 | + |
@pytest.fixture
def mock_gbk(tmp_path):
    """Write a small two-gene GenBank record to disk and return its path.

    The record carries one ``source`` feature and two gene/CDS pairs
    (locus tags ``b3304`` and ``b3305``), both annotated on the
    complement strand, followed by an ORIGIN sequence block.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        The location of the written ``test.gbk`` file, as a ``str``.
    """
    path = tmp_path / "test.gbk"
    # GenBank flat files are column-oriented, so the record is laid out with
    # the conventional columns: header keywords padded to column 13, feature
    # keys at column 6, locations/qualifiers at column 22, and ORIGIN offsets
    # right-aligned in nine columns.
    # NOTE(review): the layout follows the public flat-file convention; confirm
    # it matches what the microbiorust parser expects.
    # NOTE(review): the ORIGIN block appears to hold 896 bases (14 rows of 60
    # plus a final row of 56), while LOCUS/source declare 910 bp and the second
    # CDS ends at position 897 -- confirm whether this truncation is intended.
    # The opening triple quote is followed by a newline, so the written file
    # starts with one blank line; dedent() strips the shared 4-space indent.
    content = textwrap.dedent("""
    LOCUS       source_1                 910 bp    DNA     linear   CON 01-NOV-2024
    DEFINITION  Escherichia coli K-12 substr. MG1655.
    ACCESSION   source_1
    VERSION     source_1
    KEYWORDS    .
    SOURCE      Escherichia coli K-12 substr. MG1655
      ORGANISM  Escherichia coli K-12 substr. MG1655
    FEATURES             Location/Qualifiers
         source          1..910
                         /organism="K-12 substr. MG1655"
                         /mol_type="DNA"
         gene            complement(1..354)
                         /locus_tag="b3304"
         CDS             complement(1..354)
                         /locus_tag="b3304"
                         /codon_start=1
                         /gene="rplR"
                         /translation="MDKKSARIRRATRARRKLQELGATRLVVHRTPRHIYAQVIAPNGSLVAASTVEKAIAEQLKYTGNKDAAAAVGKAVAERALEKGIKDVSFDRSGFQYHGRVQALDAAREAGLQ"
                         /product="50S ribosomal subunit protein L18"
         gene            complement(364..897)
                         /locus_tag="b3305"
         CDS             complement(364..897)
                         /locus_tag="b3305"
                         /codon_start=1
                         /gene="rplF"
                         /translation="MSRVAKAPVVVPAGVDVKINGQVITIKGKNGELTRTLNDAVEVKHNTLTFGPRDGYADGWAQAGTARALLNSMVIGVTEGFTKKLQLVGVGYRAAVKGNVINLSGFSHPVDHQLPAGITAECPTQTEIVLKGADKQVIGQVAADLRAYRRPEPYKGKGVRYADVVRTKEAKK"
                         /product="50S ribosomal subunit protein L6"
    ORIGIN
            1 TTAGAACTGA AGGCCAGCTT CACGGGCAGC ATCTGCCAGT GCCTGGACAC GACCATGATA
           61 TTGGAACCCG GAACGGTCAA AGGATACATC TTTGATGCCT TTTTCCAGAG CGCGTTCAGC
          121 GACAGCTTTA CCCACAGCTG CAGCCGCGTC TTTGTTACCG GTGTACTTCA GTTGTTCAGC
          181 GATAGCTTTT TCTACAGTAG AAGCAGCTAC CAGAACTTCA GAACCGTTCG GTGCAATTAC
          241 CTGTGCGTAA ATGTGACGCG GGGTACGATG TACCACCAGG CGAGTTGCGC CCAGCTCCTG
          301 GAGCTTGCGG CGTGCGCGGG TCGCACGACG GATACGAGCA GATTTCTTAT CCATAGTGTT
          361 ACCTTACTTC TTCTTAGCCT CTTTGGTACG CACGACTTCG TCGGCGTAAC GAACACCCTT
          421 GCCTTTATAA GGCTCAGGAC GACGGTAGGC GCGCAGATCC GCTGCAACCT GGCCGATCAC
          481 CTGCTTATCA GCGCCTTTCA GCACGATTTC AGTCTGAGTC GGACATTCAG CAGTGATACC
          541 CGCAGGCAGC TGATGGTCAA CAGGATGAGA GAAACCCAGA GACAGGTTAA TCACATTGCC
          601 TTTAACCGCT GCACGGTAAC CTACACCAAC CAGCTGCAGC TTCTTAGTGA AGCCTTCGGT
          661 AACACCGATA ACCATTGAGT TCAGCAGGGC ACGCGCGGTA CCAGCCTGTG CCCAACCGTC
          721 TGCGTAACCA TCACGCGGAC CGAAGGTCAG GGTATTATCT GCATGTTTAA CTTCAACAGC
          781 ATCGTTGAGA GTACGAGTCA GCTCGCCGTT TTTACCTTTG ATCGTAATAA CCTGACCGTT
          841 GATTTTTACG TCAACGCCGG CAGGAACAAC GACCGGTGCT TTAGCAACAC GAGACA
    //
    """)
    # Pin the encoding so the bytes on disk do not depend on the host locale.
    path.write_text(content, encoding="utf-8")
    return str(path)
| 61 | + |
@pytest.fixture
def mock_msa(tmp_path):
    """Write a three-record FASTA alignment to disk and return its path.

    Each record is one header line and one eight-character aligned row;
    the first two rows carry a ``--`` gap at the same columns.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        The location of the written ``align.fasta`` file, as a ``str``.
    """
    aligned_rows = [
        ("Seq1", "ATGC--AT"),
        ("Seq2", "ATGC--TT"),
        ("Seq3", "ATGCGGTT"),
    ]
    fasta_file = tmp_path / "align.fasta"
    # One ">name" header line followed by one sequence line per record.
    fasta_text = "".join(f">{name}\n{row}\n" for name, row in aligned_rows)
    fasta_file.write_text(fasta_text)
    return str(fasta_file)
| 73 | + |
@pytest.fixture
def mock_blast_tab(tmp_path):
    """Write a one-row, tab-separated BLAST-style hit table and return its path.

    Args:
        tmp_path: pytest's per-test temporary directory.

    Returns:
        The location of the written ``results.tab`` file, as a ``str``.
    """
    # Column order: qseqid sseqid pident length mismatch gapopen
    #               qstart qend sstart send evalue bitscore
    hit_fields = [
        "seqA", "seqB", "99.0", "100", "1", "0",
        "1", "100", "1", "100", "1e-10", "200.0",
    ]
    table_file = tmp_path / "results.tab"
    table_file.write_text("\t".join(hit_fields) + "\n")
    return str(table_file)
| 82 | + |
| 83 | +#tests for parsers (gbk, embl) |
| 84 | + |
def test_gbk_conversions(mock_gbk):
    """Check the GenBank-to-FASTA helpers against the two-gene fixture.

    Covers protein extraction (``gbk_to_faa``), the protein count
    (``gbk_to_faa_count``) and nucleotide extraction (``gbk_to_fna``).
    """
    # Protein extraction: some returned entry must mention the first locus tag.
    protein_entries = microbiorust.gbk_to_faa(mock_gbk)
    assert os.path.exists(mock_gbk)
    tagged_entries = [entry for entry in protein_entries if "b3304" in entry]
    assert tagged_entries

    # The fixture annotates exactly two CDS features.
    n_proteins = microbiorust.gbk_to_faa_count(mock_gbk)
    assert n_proteins == 2

    # Nucleotide extraction: compare case-insensitively against the first
    # ten bases of the fixture's ORIGIN block.
    nucleotide_entries = microbiorust.gbk_to_fna(mock_gbk)
    origin_hits = [
        entry for entry in nucleotide_entries if "ttagaactga" in entry.lower()
    ]
    assert origin_hits
| 100 | + |
def test_gbk_to_gff(mock_gbk):
    """Convert the fixture to GFF and check the file written next to it.

    The test expects the output at ``<input path>.gff`` and reads it back
    to look for the ``source_source_1_1`` identifier.
    """
    microbiorust.gbk_to_gff(mock_gbk, dna=True)

    gff_path = "{}.gff".format(mock_gbk)
    assert os.path.exists(gff_path)

    with open(gff_path, "r") as handle:
        gff_text = handle.read()
        assert "source_source_1_1" in gff_text
| 108 | + |
| 109 | +#tests for multiple sequence alignment |
| 110 | + |
def test_subset_msa(mock_msa):
    """Subset the mock alignment by a row range and a column range.

    Rows ``(0, 2)`` are meant to select Seq1 and Seq2, and columns
    ``(0, 4)`` the leading ``ATGC`` shared by every sequence.

    NOTE(review): the return shape of ``subset_msa_alignment`` is not
    visible from this file. The assertions pin exactly two items, the
    first mentioning ``>Seq1`` and the second containing ``ATGC``;
    confirm whether each item is a whole record or a single line.
    """
    subset = microbiorust.subset_msa_alignment(mock_msa, (0, 2), (0, 4))
    assert len(subset) == 2
    assert ">Seq1" in subset[0]
    assert "ATGC" in subset[1]
| 118 | + |
def test_purge_gaps(mock_msa, tmp_path):
    """Run ``purge_gaps`` with a 0.5 threshold and check that it writes its output.

    The intent is that the ``--`` columns shared by Seq1 and Seq2 are
    removed; only the existence of the output file is asserted here.
    """
    destination = str(tmp_path / "purged.fasta")
    gap_threshold = 0.5
    microbiorust.purge_gaps(mock_msa, destination, gap_threshold)
    assert os.path.exists(destination)
| 124 | + |
def test_get_consensus(mock_msa):
    """The consensus of the mock alignment must begin with ``ATGC``.

    Those four columns are identical in all three fixture sequences.
    """
    shared_prefix = "ATGC"
    consensus_seq = microbiorust.get_consensus(mock_msa)
    assert consensus_seq.startswith(shared_prefix)
| 129 | + |
| 130 | +#tests for Sequence Metrics |
| 131 | + |
def test_hydrophobicity():
    """A windowed hydrophobicity profile is non-empty and made of floats."""
    peptide = "MALWMRLLPLLALLALWGPDPAAAFVN"
    profile = microbiorust.hydrophobicity(peptide, window_size=3)
    assert len(profile) > 0
    # Every entry of the profile must be a Python float.
    non_floats = [value for value in profile if not isinstance(value, float)]
    assert not non_floats
| 137 | + |
def test_amino_counts():
    """Residue tallies for the peptide ``MATAG`` match a hand count."""
    peptide = "MATAG"
    tally = microbiorust.amino_counts(peptide)
    # One methionine and two alanines in "MATAG".
    assert tally['M'] == 1
    assert tally['A'] == 2
| 143 | + |
| 144 | +#test for Async Tabular Parser |
| 145 | + |
def test_parse_tabular(mock_blast_tab):
    """Parse the one-row hit table and check its query id and bit score."""
    hits = microbiorust.parse_tabular(mock_blast_tab)
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit['qseqid'] == "seqA"
    assert only_hit['bitscore'] == 200.0
0 commit comments