Merge pull request #46 from EBI-Metagenomics/hotfix/assembly_gff_builder

mberacochea · web-flow · commit 3dca9b4991e9 · 2022-02-25T14:17:10.000Z
Several build_assembly_gff fixes
diff --git a/collect_scripts.py b/collect_scripts.py
@@ -86,5 +86,5 @@
 
                 shutil.copy(file_path, dest)
                 print(f"Script {file_path} copied to {dest}")
-                os.chmod(dest, S_IREAD | S_IRGRP | S_IWUSR | S_IRGRP | S_IEXEC)
+                os.chmod(dest, S_IREAD | S_IRGRP | S_IWUSR | S_IRGRP | S_IEXEC | S_IXGRP)
                 print(f"- made {dest} as writable")
diff --git a/tools/Assembly/GFF/build_assembly_gff.py b/tools/Assembly/GFF/build_assembly_gff.py
@@ -23,7 +23,6 @@
 
 
 class Annotation:
-
     @classmethod
     def merge(cls, annotations):
         """Merge the annotations.
@@ -43,37 +42,41 @@ def merge(cls, annotations):
     @classmethod
     def clean_seq_name(cls, name):
         # prodigal clean up
-        prodigal_match = re.match('_\d+$', name)
+        prodigal_match = re.search("_\d+$", name)
         if prodigal_match:
-            return re.sub(r'_\d+$', '', name)
+            return re.sub(r"_\d+$", "", name)
         # fgs clean_up
-        fgs_match = re.match('^(?P<contig>.+?)_\d+_\d+\_.', name)
+        fgs_match = re.search("^(?P<contig>.+?)_\d+_\d+\_.", name)
         if fgs_match:
             groups = fgs_match.groupdict()
-            return groups['contig']
-        
+            return groups["contig"]
+
         # no fgs or prodigal
-        return name
+        raise Exception("Error parsing the header:" + name)
 
     def _split_line(self, line):
-        return line.replace('\n', ' ').replace('\r', '').split('\t')
+        return line.replace("\n", " ").replace("\r", "").split("\t")
 
     def _get_value(self, value, split=True):
         if split:
-            return list(filter(None, value.strip().split(',')))
+            return list(filter(None, value.strip().split(",")))
         else:
             return [value.strip()] if value else []
 
     def get(self):
         """
         Get the annotation in an array with [Key,Value] structure
         """
-        return [[a, v] for a, v in sorted(self.__dict__.items()) if a != 'query_name' and len(v)]
+        return [
+            [a, v]
+            for a, v in sorted(self.__dict__.items())
+            if a != "query_name" and len(v)
+        ]
 
 
 class EggResult(Annotation):
-    """EggNOG tsv result row.
-    """
+    """EggNOG tsv result row."""
+
     def __init__(self, line):
         """Lines parsed according to the documentation
         https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2#v200
@@ -96,57 +99,48 @@ def __init__(self, line):
 
 
 class InterProResult(Annotation):
-    """InterPro scan result row.
-    """
+    """InterPro scan result row."""
+
     def __init__(self, line):
         columns = self._split_line(line)
         self.query_name = columns[0].strip()
         pfam = columns[4]
-        if re.match('PF\d+', pfam): # noqa
+        if re.match("PF\d+", pfam):  # noqa
             self.pfam = self._get_value(pfam)
         if len(columns) > 11:
             self.interpro = self._get_value(columns[11])
 
 
-def parse_fasta_header_mags(header):
-    match = re.match(
-        '^\>(?P<contig>.+\-\-contig:-.*)\s\#\s(?P<start>\d+)\s\#\s(?P<end>\d+)\s\#\s(?P<strand>\-1|1)\s.*$', header) # noqa
-    if match:
-        groups = match.groupdict()
-        return groups['contig'], groups['start'], groups['end'], groups['strand']
-    return None, None, None, None
-
-
 def parse_fasta_header(header):
-    # FIXME: add sanity check
-    # FIXME: what should we do with cases like :
-    #        ERZ477576.1085103-NODE-1085103-length-126-cov-0.633803_1_126_+
- 
-    # Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340  
-    prodigal_match = re.match('^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#', header)
+    """Parse the FASTA header; only the Prodigal and FGS formats are supported."""
+
+    # Prodigal header example: >NODE-3-length-2984-cov-4.247866_3 # 1439 # 1894 # 1 # ID=3_3;partial=00;start_type=TTG;rbs_motif=TAA;rbs_spacer=8bp;gc_cont=0.340
+    prodigal_match = re.match(
+        "^>(?P<contig>.+?)\s#\s(?P<start>\d+)\s#\s(?P<end>\d+)\s#\s(?P<strand>.+?)\s#",
+        header,
+    )
     if prodigal_match:
         groups = prodigal_match.groupdict()
-        return groups['contig'], groups['start'], groups['end'], groups['strand']
-    
+        return groups["contig"], groups["start"], groups["end"], groups["strand"]
+
     # FGS header example: >ERZ1759872.3-contig-100_3188_4599_-
-    fgs_match = re.match('^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)', header)
+    fgs_match = re.match("^>.+?_(?P<start>\d+)_(?P<end>\d+)_(?P<strand>.)", header)
     if fgs_match:
         groups = fgs_match.groupdict()
-        strand = '1'
-        if groups['strand'] == '-':
+        strand = "1"
+        if groups["strand"] == "-":
             strand = "-1"
-        return header.rstrip().replace(">", ""), groups['start'], groups['end'], strand
-    
+        return header.rstrip().replace(">", ""), groups["start"], groups["end"], strand
+
     # unable to parse fasta header
-    return None, None, None, None
-    
+    raise Exception("Unable to parse fasta header " + header)
+
 
 def load_annotation(file, klass, annotations):
-    """Load the annotations of a TSV by `query_name` (contig name at the moment)
-    """
-    with open(file, 'rt') as ann_file:
+    """Load the annotations of a TSV by `query_name` (contig name at the moment)"""
+    with open(file, "rt") as ann_file:
         for line in ann_file:
-            if '#' in line:
+            if "#" in line:
                 continue
             parsed_line = klass(line)
             if parsed_line.query_name in annotations:
@@ -187,86 +181,90 @@ def build_gff(annotations, faa):
     ctg123 . exon            5000  5500  .  +  .  ID=exon00004;Parent=mrna0001
     ctg123 . exon            7000  9000  .  +  .  ID=exon00005;Parent=mrna0001
     """
-    with open(faa, 'rt') as faa_file:
+    with open(faa, "rt") as faa_file:
         for line in faa_file:
-            if '>' not in line:
+            if ">" not in line:
                 continue
 
             # each fasta is suffixed on the annotated faa if a prefix _INT (_1 .. _n)
             contig_name, start, end, strand = parse_fasta_header(line)
             if None in (contig_name, start, end, strand):
-                print(line, end='', file=sys.stderr)
+                print(
+                    "It was not possible to parse the " + line, end="", file=sys.stderr
+                )
                 continue
 
             clean_name = Annotation.clean_seq_name(contig_name)
 
-            row_annotations = Annotation.merge([ann.get() for ann in annotations.get(contig_name, [])])
-            
-            ann_string = ';'.join(['{}={}'.format(k, ','.join(v).strip()) for k, v in row_annotations.items()])
+            row_annotations = Annotation.merge(
+                [ann.get() for ann in annotations.get(contig_name, [])]
+            )
+
+            ann_string = ";".join(
+                [
+                    "{}={}".format(k, ",".join(v).strip())
+                    for k, v in row_annotations.items()
+                ]
+            )
 
-            eggNOGScore = ''.join(row_annotations.get('eggNOG_score', []))
+            eggNOGScore = "".join(row_annotations.get("eggNOG_score", []))
 
             if len(ann_string):
                 yield [
                     clean_name,
-                    'eggNOG-v2',
-                    'CDS',
+                    "eggNOG-v2",
+                    "CDS",
                     start,
                     end,
-                    eggNOGScore or '.',
-                    '+' if strand == '1' else '-',
-                    '.',
-                    'ID=' + clean_name + ';' + ann_string
+                    eggNOGScore or ".",
+                    "+" if strand == "1" else "-",
+                    ".",
+                    "ID=" + clean_name + ";" + ann_string,
                 ]
 
-def error_exit(gff_file):
-    if os.path.exists(gff_file):
-        os.remove(gff_file)
-    open("no_antismash", "w").close()
-    sys.exit(0)
 
-
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='Build an assembly GFF file (sorted and indexed using samtools and tabix)')
+        description="Build an assembly GFF file (sorted and indexed using samtools and tabix)"
+    )
     parser.add_argument(
-        '-e', dest='egg', help='EggNOG tsv results. eggNOG version 2 required.', required=False)
+        "-e",
+        dest="egg",
+        help="EggNOG tsv results. eggNOG version 2 required.",
+        required=True,
+    )
     parser.add_argument(
-        '-i', dest='interpro', help='InterProScan tsv results', required=False)
-    # FIXME: Are we going to use this?
-    # parser.add_argument(
-        # '-a', dest='antismash', help='antiSMASH tsv results (geneclusters.txt)', required=False)
-    # FIXME: Are we going to use this?
-    # parser.add_argument(
-    #     '-k', dest='keggmodule', help='KEGG Modules per contig annotation', required=False)
+        "-i", dest="interpro", help="InterProScan tsv results", required=True
+    )
     parser.add_argument(
-        '-f', dest='faa', help='FASTA with the CDS annotated (faa)', required=True)
-    parser.add_argument(
-        '-o', dest='out', help='Ouput GFF file name', required=True)
+        "-f", dest="faa", help="FASTA with the CDS annotated (faa)", required=True
+    )
+    parser.add_argument("-o", dest="out", help="Ouput GFF file name", required=True)
     args = parser.parse_args()
 
     annotations = {}
     load_annotation(args.egg, EggResult, annotations)
     load_annotation(args.interpro, InterProResult, annotations)
 
     if len(annotations) < 1:
-        print("No annotations loaded, aborting")
-        error_exit(args.out)
+        raise Exception("No annotations loaded, aborting")
 
     records = 0
-    with open(args.out, 'w', buffering=1) as out_handle:
-        print('##gff-version 3', file=out_handle)
+    with open(args.out, "w", buffering=1) as out_handle:
+        print("##gff-version 3", file=out_handle)
         for row in build_gff(annotations, args.faa):
-            print('\t'.join(row), file=out_handle)
+            print("\t".join(row), file=out_handle)
             records += 1
+
     if records == 0:
-        print("No annotations in GFF, aborting")
-        error_exit(args.out)
-
-    print('Sorting...')
-    subprocess.call('(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(args.out) +
-                    ' | bgzip > {0}.bgz'.format(args.out), shell=True)
-    print('Building the index...')
-    subprocess.call(['tabix', '-p', 'gff', '{}.bgz'.format(args.out)])
-    print('Bye')
+        raise Exception("No annotations in GFF, aborting")
+
+    print("Sorting...")
+    subprocess.call(
+        '(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)'.format(args.out)
+        + " | bgzip > {0}.bgz".format(args.out),
+        shell=True,
+    )
+    print("Building the index...")
+    subprocess.call(["tabix", "-p", "gff", "{}.bgz".format(args.out)])
+    print("Bye")