Commit 6753b02

Merge pull request #127 from EBI-Metagenomics/feature/finalised-update-pipeline
Feature/finalised update pipeline
2 parents 09da707 + a0164eb commit 6753b02

64 files changed: +2628 -1204 lines

README.md

Lines changed: 82 additions & 44 deletions
Large diffs are not rendered by default.
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+    "ftp": [
+        "genomes-all_metadata.tsv",
+        "all_genomes",
+        "species_catalogue",
+        "all_genomes.msh"
+    ],
+    "additional_data": [
+        "panaroo_output",
+        "mgyg_genomes",
+        {
+            "intermediate_files": [
+                "extra_weight_table.txt",
+                "renamed_genomes_name_mapping.tsv"
+            ]
+        }
+    ]
+}
+
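
This new JSON file (its path is not shown in this capture) is the structure definition consumed by the rewritten bin/check_catalogue_structure.py below: each top-level key names a folder, each string an expected file or folder inside it, and each nested object a subfolder with its own checklist. A minimal sketch of how the convention expands into concrete paths, assuming the file is saved as structure.json:

import json
from pathlib import Path


def expected_paths(base, structure):
    """Flatten a structure definition into the full list of paths it implies."""
    paths = []
    for folder, items in structure.items():
        folder_path = base / folder
        paths.append(folder_path)
        for item in items:
            if isinstance(item, str):
                paths.append(folder_path / item)
            else:  # nested dict -> recurse into the subfolder
                paths.extend(expected_paths(folder_path, item))
    return paths


# "structure.json" and the catalogue location are assumed names for illustration
structure = json.loads(Path("structure.json").read_text())
for p in expected_paths(Path("/previous/catalogue"), structure):
    print(p)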

bin/add_genomes_to_remove_list.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+# This file is part of MGnify genome analysis pipeline.
+#
+# MGnify genome analysis pipeline is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# MGnify genome analysis pipeline is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import os
+
+
+def main(remove_list_file, add_list_file, message, output_file):
+    already_in_remove_list = set()
+    if os.path.isfile(remove_list_file):
+        with open(remove_list_file, 'r') as file_in, open(output_file, "w") as file_out:
+            for line in file_in:
+                if line.startswith("MGYG"):
+                    file_out.write(line)
+                    accession = line.split('\t')[0].strip()
+                    already_in_remove_list.add(accession)
+
+    with open(add_list_file, 'r') as file_in, open(output_file, 'a') as file_out:
+        for line in file_in:
+            accession = line.strip()
+            if accession not in already_in_remove_list:
+                file_out.write(f"{accession}\t{message}\n")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Script adds genomes that existed in the previous version of a '
+                                                 'catalogue but failed QC checks during the update to the remove '
+                                                 'list. Only genomes that are not already present in the remove '
+                                                 'list are added.')
+    parser.add_argument('-r', '--remove-list', required=True,
+                        help='Path to the file containing the list of genomes to remove. File should be tab-delimited '
+                             'with the MGYG accession in the first column and reason for removal in the second.')
+    parser.add_argument('-a', '--add-list', required=True,
+                        help='Path to the file containing the list of genomes to add to the remove list.')
+    parser.add_argument('-m', '--message', required=True,
+                        help='Reason for removal that will be printed to the remove file.')
+    parser.add_argument('-o', '--output', required=True,
+                        help='Name of the output file.')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args.remove_list, args.add_list, args.message, args.output)
+
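
To make the merge behaviour concrete, here is a hedged, self-contained sketch; the file names and accessions are invented, and the script is assumed to be on PATH like the other bin/ scripts:

import subprocess
from pathlib import Path

# Invented example inputs: one genome already listed, one newly failing QC.
Path("remove_list.tsv").write_text("MGYG000000001\tfailed GUNC\n")
Path("add_list.txt").write_text("MGYG000000001\nMGYG000000002\n")

subprocess.run(
    ["add_genomes_to_remove_list.py",
     "-r", "remove_list.tsv",
     "-a", "add_list.txt",
     "-m", "failed QC during catalogue update",
     "-o", "remove_list_updated.tsv"],
    check=True,
)

# MGYG000000001 is kept once with its original reason; only MGYG000000002
# is appended with the new message.
print(Path("remove_list_updated.tsv").read_text())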

bin/check_catalogue_structure.py

Lines changed: 82 additions & 47 deletions
@@ -15,63 +15,98 @@
 # You should have received a copy of the GNU General Public License
 # along with MGnify genomes catalogue pipeline. If not, see <https://www.gnu.org/licenses/>.
 
+
 import argparse
+import json
 import logging
-import os
+import sys
+from pathlib import Path
+from jsonschema import validate
 
 logging.basicConfig(level=logging.INFO)
 
 
-def main(input_folder):
-    issues = list()
-    # verify that all expected folders are where they are
-    main_folders = ["ftp", "additional_data"]  # not checking "website" because we don't need it
-    for folder in main_folders:
-        if not verify_folder(input_folder, folder):
-            issues.append("Folder {} is not found.".format(os.path.join(input_folder, folder)))
-    ftp_checklist = ["genomes-all_metadata.tsv", "all_genomes", "species_catalogue"]
-    additional_data_checklist = ["panaroo_output", "mgyg_genomes"]
-    intermediate_files_checklist = ["extra_weight_table.txt", "drep_data_tables.tar.gz",
-                                    "renamed_genomes_name_mapping.tsv"]
-    for element in ftp_checklist:
-        ftp_path = os.path.join(input_folder, "ftp")
-        if not verify_folder(ftp_path, element):
-            issues.append("{} is not found.".format(os.path.join(ftp_path, element)))
-    for element in additional_data_checklist:
-        additional_data_path = os.path.join(input_folder, "additional_data")
-        if not verify_folder(additional_data_path, element):
-            issues.append("{} is not found.".format(os.path.join(additional_data_path, element)))
-    for element in intermediate_files_checklist:
-        intermediate_files_path = os.path.join(input_folder, "additional_data", "intermediate_files")
-        if not verify_folder(intermediate_files_path, element):
-            issues.append("{} is not found.".format(os.path.join(intermediate_files_path, element)))
-    if len(issues) > 0:
-        with open("PREVIOUS_CATALOGUE_STRUCTURE_ERRORS.txt", "w") as f:
-            f.write("\n".join(issues))
-        logging.error("Catalogue structure issues found")
-    else:
-        with open("PREVIOUS_CATALOGUE_STRUCTURE_OK.txt", "w") as f:
-            logging.info("Catalogue structure OK")
+def check_structure(base_folder: Path, structure: dict):
+    """Recursively verify that expected structure exists under base_folder."""
+    issues = []
 
+    for key, expected_items in structure.items():
+        folder_path = base_folder / key
+        if not folder_path.exists():
+            issues.append(f"Missing folder: {folder_path}")
+            continue
+
+        for item in expected_items:
+            # If it's a string → file or folder expected
+            if isinstance(item, str):
+                item_path = folder_path / item
+                if not item_path.exists():
+                    issues.append(f"Missing: {item_path}")
+
+            # If it's a dict → nested structure
+            elif isinstance(item, dict):
+                issues.extend(check_structure(folder_path, item))
+
+    return issues
 
-def verify_folder(main_path, element_to_check):
-    if os.path.exists(os.path.join(main_path, element_to_check)):
-        return True
-    else:
-        return False
 
+def main():
+    parser = argparse.ArgumentParser(
+        description="Validate folder structure of the previous catalogue version using a JSON definition."
+    )
+    parser.add_argument(
+        "-i", "--input_folder", required=True,
+        help="Path to the output folder of the previous catalogue version. Folders 'ftp' and 'additional_data' should "
+             "be inside of it."
+    )
+    parser.add_argument(
+        "-s", "--schema", required=True,
+        help="Path to JSON file defining expected folder structure."
+    )
+    args = parser.parse_args()
 
-def parse_args():
-    parser = argparse.ArgumentParser(description='The script is part of the catalogue update pipeline. It checks '
-                                                 'that all expected files from the previous version of the catalogue '
-                                                 'are present in the expected locations.')
-    parser.add_argument('-i', dest='input_folder', required=True, help='Location of the previous catalogue. '
-                                                                       'Folders "ftp", "website", "additional_data" '
-                                                                       'should be inside this folder')
+    base_folder = Path(args.input_folder)
+    schema_path = Path(args.schema)
 
-    return parser.parse_args()
+    if not base_folder.exists():
+        logging.error(f"Input folder does not exist: {base_folder}")
+        sys.exit(1)
+
+    if not schema_path.exists():
+        logging.error(f"Schema file not found: {schema_path}")
+        sys.exit(1)
+
+    with open(schema_path) as f:
+        expected_structure = json.load(f)
+
+    # Validate JSON format itself
+    validate(
+        instance=expected_structure,
+        schema={
+            "type": "object",
+            "patternProperties": {
+                ".*": {
+                    "type": "array",
+                    "items": {
+                        "anyOf": [
+                            {"type": "string"},
+                            {"type": "object"}
+                        ]
+                    }
+                }
+            }
+        }
+    )
+
+    issues = check_structure(base_folder, expected_structure)
+
+    if issues:
+        Path("PREVIOUS_CATALOGUE_STRUCTURE_ERRORS.txt").write_text("\n".join(issues))
+        logging.error("Catalogue structure issues found.")
+    else:
+        Path("PREVIOUS_CATALOGUE_STRUCTURE_OK.txt").write_text("Catalogue structure OK")
+        logging.info("Catalogue structure OK.")
 
 
-if __name__ == '__main__':
-    args = parse_args()
-    main(args.input_folder)
+if __name__ == "__main__":
+    main()
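
One detail worth noting: validate() raises jsonschema.ValidationError when the structure file is not a mapping of folder names to lists of strings or objects, so a malformed definition fails fast before any filesystem checks run. A small sketch of that guard; the meta-schema is copied from the script, and the bad input is invented:

from jsonschema import ValidationError, validate

META_SCHEMA = {
    "type": "object",
    "patternProperties": {
        ".*": {
            "type": "array",
            "items": {"anyOf": [{"type": "string"}, {"type": "object"}]},
        }
    },
}

# Invented malformed input: "ftp" must map to a list, not a string.
try:
    validate(instance={"ftp": "genomes-all_metadata.tsv"}, schema=META_SCHEMA)
except ValidationError as err:
    print(err.message)  # 'genomes-all_metadata.tsv' is not of type 'array'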

bin/classify_folders.py

Lines changed: 13 additions & 6 deletions
@@ -16,6 +16,8 @@
 # along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
 
 
+import glob
+import sys
 import os
 import shutil
 import argparse
@@ -34,7 +35,7 @@ def classify_split_folders(input_folder):
     clusters = os.listdir(drep_clusters)
     for cluster in clusters:
         dir_files = os.listdir(os.path.join(drep_clusters, cluster))
-        genomes = [i for i in dir_files if i.endswith(".fa")]
+        genomes = [i for i in dir_files if i.endswith((".fa", ".fna"))]
         number_of_genomes = len(genomes)
         path_cluster_many = os.path.join(NAME_MANY_GENOMES, cluster)
         path_cluster_one = os.path.join(NAME_ONE_GENOME, cluster)
@@ -67,14 +68,20 @@ def classify_by_file(split_text, genomes_folder):
         for line in file_in:
             main_folder, cluster, genomes_str = line.strip().split(":")
             genomes = genomes_str.split(",")
-            cluster_name = genomes[0].split(".")[0]  # cluster
+            cluster_name = genomes[0].rsplit(".", 1)[0]  # cluster
             path_cluster = os.path.join(main_folder, cluster_name)
             if not os.path.exists(path_cluster):
                 os.mkdir(path_cluster)
             for genome in genomes:
-                old_path = os.path.join(genomes_folder, genome)
-                new_path = os.path.join(path_cluster, genome)
-                shutil.copy(old_path, new_path)
+                base_name, _ = os.path.splitext(genome)  # remove extension from genome filename in cluster split file
+                pattern = os.path.join(genomes_folder, base_name + ".*")  # match any extension
+                matches = glob.glob(pattern)
+                if matches:
+                    old_path = matches[0]  # take the first matching file
+                    new_path = os.path.join(path_cluster, os.path.basename(old_path))
+                    shutil.copy(old_path, new_path)
+                else:
+                    sys.exit("Cannot find expected genome {}".format(genome))
 
 
 if __name__ == "__main__":
@@ -113,7 +120,7 @@ def classify_by_file(split_text, genomes_folder):
         os.makedirs(NAME_ONE_GENOME)
 
     if args.input_folder:
-        print("Classify splitted folders")
+        print("Classify split folders")
         classify_split_folders(args.input_folder)
     elif args.text_file:
         if args.genomes:
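
The cluster split file parsed by classify_by_file is colon-delimited: destination folder, cluster id, then a comma-separated genome list. A hedged sketch of the parsing with an invented line; note that rsplit(".", 1) strips only the final extension, so accessions containing dots survive intact:

# Invented example line from a cluster split file:
line = "many_genomes_folder:cluster_1:MGYG000000001.fa,MGYG000000002.fa\n"

main_folder, cluster, genomes_str = line.strip().split(":")
genomes = genomes_str.split(",")

# "genome.v2.fa" maps to "genome.v2" rather than "genome"
cluster_name = genomes[0].rsplit(".", 1)[0]
print(main_folder, cluster, genomes, cluster_name)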

bin/create_metadata_table.py

Lines changed: 9 additions & 2 deletions
@@ -29,6 +29,8 @@
 
 logging.basicConfig(level=logging.INFO)
 
+UNINFORMATIVE = {"not collected", "not present", "na", "n/a", "missing", "not applicable"}
+
 
 def main(
     genomes_dir,
@@ -137,14 +139,19 @@ def add_sample_project_loc(df, location_file, previous_version_data):
     df = df.merge(location_data_df, on='Genome_accession', how='left')
 
     # If previous version data is available, override where applicable
+    # Replace uninformative country/continent with "not provided" as is already done for all new genomes
     if not previous_version_data.empty:
         prev_data_dict = previous_version_data.set_index("Genome").to_dict(orient="index")
         for idx, row in df.iterrows():
             genome = row["Genome"]
             if genome in prev_data_dict:
                 for col in ["Sample_accession", "Study_accession", "Country", "Continent"]:
-                    # TODO: add checks that this is informative (not "not provided")
-                    df.at[idx, col] = prev_data_dict[genome][col]
+                    val = prev_data_dict[genome][col]
+                    if col in ["Country", "Continent"]:
+                        if isinstance(val, str) and val.strip().lower() in UNINFORMATIVE:
+                            df.at[idx, col] = "not provided"
+                            continue
+                    df.at[idx, col] = val
 
     return df
 
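
In isolation, the override rule now applied to carried-over metadata looks like this (values invented): any country or continent matching the UNINFORMATIVE set becomes "not provided", mirroring what is already done for new genomes:

UNINFORMATIVE = {"not collected", "not present", "na", "n/a", "missing", "not applicable"}

# Hypothetical previous-version values for one genome
prev = {"Country": "Not Collected", "Continent": "Europe"}

normalised = {}
for col in ["Country", "Continent"]:
    val = prev[col]
    if isinstance(val, str) and val.strip().lower() in UNINFORMATIVE:
        normalised[col] = "not provided"
    else:
        normalised[col] = val

print(normalised)  # {'Country': 'not provided', 'Continent': 'Europe'}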

bin/extract_info_from_metadata_table.py

Lines changed: 14 additions & 4 deletions
@@ -17,12 +17,15 @@
 
 import argparse
 import csv
+import os
 import sys
 
 
-def main(metadata_table, prefix):
+def main(metadata_table, prefix, genomes_folder):
     outfile_quality = f'{prefix}_checkm_quality.csv'
     outfile_stats = f'{prefix}_assembly_stats.tsv'
+    # Make a dictionary from the genomes FASTA folder where keys=accessions, values=FASTA file names with extensions
+    genome_dict = {os.path.splitext(f)[0]: f for f in os.listdir(genomes_folder)}
     with (open(metadata_table, "r") as file_in, open(outfile_quality, "w") as checkm_out,
           open(outfile_stats, "w") as stats_out):
         csv_writer_checkm = csv.writer(checkm_out)
@@ -43,9 +46,14 @@ def main(metadata_table, prefix):
 
         for line in file_in:
             parts = line.strip().split("\t")
-            csv_writer_stats.writerow([parts[genome_idx], parts[length_idx], parts[n50_idx], parts[gc_idx],
+            genome_name = parts[genome_idx]  # this is the genome name without extension
+            try:
+                fasta_name = genome_dict[genome_name]
+            except KeyError:
+                raise KeyError(f"Genome '{genome_name}' not found in the genome directory")
+            csv_writer_stats.writerow([genome_name, parts[length_idx], parts[n50_idx], parts[gc_idx],
                                        parts[num_contigs_idx]])
-            csv_writer_checkm.writerow([f"{parts[genome_idx]}.fa", parts[comp_idx], parts[cont_idx]])
+            csv_writer_checkm.writerow([fasta_name, parts[comp_idx], parts[cont_idx]])
@@ -62,10 +70,12 @@ def parse_args():
     parser.add_argument('-i', dest='metadata_table', required=True, help='Location of the metadata table from the '
                                                                          'previous catalogue version.')
     parser.add_argument('-o', dest='prefix', required=True, help='Prefix for the output files.')
+    parser.add_argument('--genomes', required=True, help='Path to the mgyg_genomes folder for the previous catalogue '
+                                                         'version.')
 
     return parser.parse_args()
 
 
 if __name__ == '__main__':
     args = parse_args()
-    main(args.metadata_table, args.prefix)
+    main(args.metadata_table, args.prefix, args.genomes)
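
The genome_dict lookup decouples accessions in the metadata table from the on-disk FASTA extension. A hedged sketch with an invented directory listing:

import os

# Invented listing of a previous-version mgyg_genomes folder
listing = ["MGYG000000001.fna", "MGYG000000002.fa"]

# keys = accessions without extension, values = full FASTA file names
genome_dict = {os.path.splitext(f)[0]: f for f in listing}

print(genome_dict["MGYG000000001"])  # MGYG000000001.fna
# In the script, a missing accession raises KeyError with a clear message.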
