|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import re |
| 3 | +import csv |
| 4 | +import sys |
| 5 | +import argparse |
| 6 | +import json |
| 7 | + |
| 8 | + |
| 9 | +# Extract CDS information from mrna and transcript sections |
| 10 | +def extract_cds_info(file): |
| 11 | + # Define regex patterns for different statistics |
| 12 | + patterns = { |
| 13 | + "TRANSC_MRNA": re.compile(r"Number of mrna\s+(\d+)"), |
| 14 | + "PCG": re.compile(r"Number of gene\s+(\d+)"), |
| 15 | + "CDS_PER_GENE": re.compile(r"mean mrnas per gene\s+([\d.]+)"), |
| 16 | + "EXONS_PER_TRANSC": re.compile(r"mean exons per mrna\s+([\d.]+)"), |
| 17 | + "CDS_LENGTH": re.compile(r"mean mrna length \(bp\)\s+([\d.]+)"), |
| 18 | + "EXON_SIZE": re.compile(r"mean exon length \(bp\)\s+([\d.]+)"), |
| 19 | + "INTRON_SIZE": re.compile(r"mean intron in cds length \(bp\)\s+([\d.]+)"), |
| 20 | + } |
| 21 | + |
| 22 | + # Initialize a dictionary to store content for different sections |
| 23 | + section_content = {"mrna": "", "transcript": ""} |
| 24 | + |
| 25 | + # Variable to keep track of the current section being processed |
| 26 | + current_section = None |
| 27 | + |
| 28 | + with open(file, "r") as f: |
| 29 | + lines = f.read().splitlines() # read all lines in the file |
| 30 | + |
| 31 | + for line in lines: |
| 32 | + line = line.strip() # Remove any leading/trailing whitespace including newline characters |
| 33 | + |
| 34 | + if "---------------------------------- mrna ----------------------------------" in line: |
| 35 | + current_section = "mrna" # Switch to 'mrna' section |
| 36 | + elif "---------------------------------- transcript ----------------------------------" in line: |
| 37 | + current_section = "transcript" # Switch to 'transcript' section |
| 38 | + elif "----------" in line: |
| 39 | + current_section = None # End of current section |
| 40 | + elif current_section: |
| 41 | + section_content[current_section] += ( |
| 42 | + line + " " |
| 43 | + ) # Accumulate content for the current section, separate lines by a space |
| 44 | + |
| 45 | + cds_info = {} |
| 46 | + |
| 47 | + for label, pattern in patterns.items(): |
| 48 | + text_to_search = section_content["mrna"] if label != "EXONS_PER_TRANSC" else section_content["transcript"] |
| 49 | + match = re.search(pattern, text_to_search) |
| 50 | + if match: |
| 51 | + cds_info[label] = match.group(1) |
| 52 | + |
| 53 | + return cds_info |
| 54 | + |
| 55 | + |
| 56 | +# Function to extract the number of non-coding genes from the second file |
| 57 | +def extract_non_coding_genes(file): |
| 58 | + non_coding_genes = {"ncrna_gene": 0} |
| 59 | + |
| 60 | + with open(file, "r") as f: |
| 61 | + for line in f: |
| 62 | + parts = line.split() |
| 63 | + if len(parts) < 2: |
| 64 | + continue |
| 65 | + |
| 66 | + gene_type = parts[0] |
| 67 | + try: |
| 68 | + count = int(parts[1]) |
| 69 | + except ValueError: |
| 70 | + continue |
| 71 | + |
| 72 | + if gene_type in non_coding_genes: |
| 73 | + non_coding_genes[gene_type] += count |
| 74 | + |
| 75 | + NCG = sum(non_coding_genes.values()) |
| 76 | + return {"NCG": NCG} |
| 77 | + |
| 78 | + |
| 79 | +# Extract the one_line_summary from a BUSCO JSON file |
| 80 | +def extract_busco_results(busco_stats_file): |
| 81 | + try: |
| 82 | + with open(busco_stats_file, "r") as file: |
| 83 | + busco_data = json.load(file) |
| 84 | + # Extract the one_line_summary from the results section |
| 85 | + one_line_summary = busco_data.get("results", {}).get("one_line_summary") |
| 86 | + if one_line_summary: |
| 87 | + # Use regex to extract everything after the first colon |
| 88 | + match = re.search(r':\s*"(.*)"', one_line_summary) |
| 89 | + if match: |
| 90 | + one_line_summary = match.group(1) # Get text after the colon |
| 91 | + return {"BUSCO_PROTEIN_SCORES": one_line_summary} if one_line_summary else {} |
| 92 | + except (json.JSONDecodeError, FileNotFoundError) as e: |
| 93 | + print(f"Error loading BUSCO JSON file: {e}") |
| 94 | + return {} |
| 95 | + |
| 96 | + |
| 97 | +# Function to write the extracted data to a CSV file |
| 98 | +def write_to_csv(data, output_file, busco_stats_file): |
| 99 | + busco_results = extract_busco_results(busco_stats_file) |
| 100 | + |
| 101 | + descriptions = { |
| 102 | + "TRANSC_MRNA": "The number of transcribed mRNAs", |
| 103 | + "PCG": "The number of protein coding genes", |
| 104 | + "NCG": "The number of non-coding genes", |
| 105 | + "CDS_PER_GENE": "The average number of coding transcripts per gene", |
| 106 | + "EXONS_PER_TRANSC": "The average number of exons per transcript", |
| 107 | + "CDS_LENGTH": "The average length of coding sequence", |
| 108 | + "EXON_SIZE": "The average length of a coding exon", |
| 109 | + "INTRON_SIZE": "The average length of coding intron size", |
| 110 | + "BUSCO_PROTEIN_SCORES": "BUSCO results summary from running BUSCO in protein mode", |
| 111 | + } |
| 112 | + |
| 113 | + with open(output_file, "w", newline="") as csvfile: |
| 114 | + writer = csv.writer(csvfile) |
| 115 | + |
| 116 | + # Write descriptions at the top of the CSV file |
| 117 | + for key, description in descriptions.items(): |
| 118 | + csvfile.write(f"# {key}: {description}\n") |
| 119 | + |
| 120 | + # Write the Variable and Value columns header |
| 121 | + writer.writerow(["#paramName", "paramValue"]) |
| 122 | + |
| 123 | + # Write the data |
| 124 | + for key, value in data.items(): |
| 125 | + writer.writerow([key, value]) |
| 126 | + |
| 127 | + # Add the BUSCO results summary |
| 128 | + for key, value in busco_results.items(): |
| 129 | + writer.writerow([key, value]) |
| 130 | + |
| 131 | + |
| 132 | +# Main function to take input files and output file as arguments |
| 133 | +def main(): |
| 134 | + Description = "Parse contents of the agat_spstatistics, buscoproteins and agat_sqstatbasic to extract relevant annotation statistics information." |
| 135 | + Epilog = ( |
| 136 | + "Example usage: python extract_annotation_statistics_info.py <basic_stats> <other_stats> <busco_stats> <output>" |
| 137 | + ) |
| 138 | + |
| 139 | + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) |
| 140 | + parser.add_argument("basic_stats", help="Input txt file with basic_feature_statistics.") |
| 141 | + parser.add_argument("other_stats", help="Input txt file with other_feature_statistics.") |
| 142 | + parser.add_argument("busco_stats", help="Input JSON file for the BUSCO statistics.") |
| 143 | + parser.add_argument("output", help="Output file.") |
| 144 | + parser.add_argument("--version", action="version", version="%(prog)s 1.0") |
| 145 | + args = parser.parse_args() |
| 146 | + |
| 147 | + cds_info = extract_cds_info(args.other_stats) |
| 148 | + non_coding_genes = extract_non_coding_genes(args.basic_stats) |
| 149 | + data = {**cds_info, **non_coding_genes} |
| 150 | + write_to_csv(data, args.output, args.busco_stats) |
| 151 | + |
| 152 | + |
| 153 | +if __name__ == "__main__": |
| 154 | + sys.exit(main()) |
0 commit comments