Commit 6753b02

Merge pull request #127 from EBI-Metagenomics/feature/finalised-update-pipeline
Feature/finalised update pipeline
2 parents 09da707 + a0164eb commit 6753b02

64 files changed: +2628 -1204 lines

README.md

Lines changed: 82 additions & 44 deletions
Large diffs are not rendered by default.
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+{
+    "ftp": [
+        "genomes-all_metadata.tsv",
+        "all_genomes",
+        "species_catalogue",
+        "all_genomes.msh"
+    ],
+    "additional_data": [
+        "panaroo_output",
+        "mgyg_genomes",
+        {
+            "intermediate_files": [
+                "extra_weight_table.txt",
+                "renamed_genomes_name_mapping.tsv"
+            ]
+        }
+    ]
+}
+
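
This new JSON file (its path is not shown in this capture) is the structure definition consumed by the rewritten bin/check_catalogue_structure.py below: each top-level key names a folder, each string an expected file or folder inside it, and each nested object a subfolder with its own checklist. A minimal sketch of how the convention expands into concrete paths, assuming the file is saved as structure.json:

import json
from pathlib import Path


def expected_paths(base, structure):
    """Flatten a structure definition into the full list of paths it implies."""
    paths = []
    for folder, items in structure.items():
        folder_path = base / folder
        paths.append(folder_path)
        for item in items:
            if isinstance(item, str):
                paths.append(folder_path / item)
            else:  # nested dict -> recurse into the subfolder
                paths.extend(expected_paths(folder_path, item))
    return paths


# "structure.json" and the catalogue location are assumed names for illustration
structure = json.loads(Path("structure.json").read_text())
for p in expected_paths(Path("/previous/catalogue"), structure):
    print(p)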

bin/add_genomes_to_remove_list.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+# This file is part of MGnify genome analysis pipeline.
+#
+# MGnify genome analysis pipeline is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# MGnify genome analysis pipeline is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+import os
+
+
+def main(remove_list_file, add_list_file, message, output_file):
+    already_in_remove_list = set()
+    if os.path.isfile(remove_list_file):
+        with open(remove_list_file, 'r') as file_in, open(output_file, "w") as file_out:
+            for line in file_in:
+                if line.startswith("MGYG"):
+                    file_out.write(line)
+                    accession = line.split('\t')[0].strip()
+                    already_in_remove_list.add(accession)
+
+    with open(add_list_file, 'r') as file_in, open(output_file, 'a') as file_out:
+        for line in file_in:
+            accession = line.strip()
+            if accession not in already_in_remove_list:
+                file_out.write(f"{accession}\t{message}\n")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Script adds genomes that existed in the previous version of a '
+                                                 'catalogue but failed QC checks during the update to the remove '
+                                                 'list. Only genomes that are not already present in the remove '
+                                                 'list are added.')
+    parser.add_argument('-r', '--remove-list', required=True,
+                        help='Path to the file containing the list of genomes to remove. File should be tab-delimited '
+                             'with the MGYG accession in the first column and reason for removal in the second.')
+    parser.add_argument('-a', '--add-list', required=True,
+                        help='Path to the file containing the list of genomes to add to the remove list.')
+    parser.add_argument('-m', '--message', required=True,
+                        help='Reason for removal that will be printed to the remove file.')
+    parser.add_argument('-o', '--output', required=True,
+                        help='Name of the output file.')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args.remove_list, args.add_list, args.message, args.output)
+
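
To make the merge behaviour concrete, here is a hedged, self-contained sketch; the file names and accessions are invented, and the script is assumed to be on PATH like the other bin/ scripts:

import subprocess
from pathlib import Path

# Invented example inputs: one genome already listed, one newly failing QC.
Path("remove_list.tsv").write_text("MGYG000000001\tfailed GUNC\n")
Path("add_list.txt").write_text("MGYG000000001\nMGYG000000002\n")

subprocess.run(
    ["add_genomes_to_remove_list.py",
     "-r", "remove_list.tsv",
     "-a", "add_list.txt",
     "-m", "failed QC during catalogue update",
     "-o", "remove_list_updated.tsv"],
    check=True,
)

# MGYG000000001 is kept once with its original reason; only MGYG000000002
# is appended with the new message.
print(Path("remove_list_updated.tsv").read_text())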

bin/check_catalogue_structure.py

Lines changed: 82 additions & 47 deletions
@@ -15,63 +15,98 @@
 # You should have received a copy of the GNU General Public License
 # along with MGnify genomes catalogue pipeline. If not, see <https://www.gnu.org/licenses/>.
 
+
 import argparse
+import json
 import logging
-import os
+import sys
+from pathlib import Path
+from jsonschema import validate
 
 logging.basicConfig(level=logging.INFO)
 
 
-def main(input_folder):
-    issues = list()
-    # verify that all expected folders are where they are
-    main_folders = ["ftp", "additional_data"]  # not checking "website" because we don't need it
-    for folder in main_folders:
-        if not verify_folder(input_folder, folder):
-            issues.append("Folder {} is not found.".format(os.path.join(input_folder, folder)))
-    ftp_checklist = ["genomes-all_metadata.tsv", "all_genomes", "species_catalogue"]
-    additional_data_checklist = ["panaroo_output", "mgyg_genomes"]
-    intermediate_files_checklist = ["extra_weight_table.txt", "drep_data_tables.tar.gz",
-                                    "renamed_genomes_name_mapping.tsv"]
-    for element in ftp_checklist:
-        ftp_path = os.path.join(input_folder, "ftp")
-        if not verify_folder(ftp_path, element):
-            issues.append("{} is not found.".format(os.path.join(ftp_path, element)))
-    for element in additional_data_checklist:
-        additional_data_path = os.path.join(input_folder, "additional_data")
-        if not verify_folder(additional_data_path, element):
-            issues.append("{} is not found.".format(os.path.join(additional_data_path, element)))
-    for element in intermediate_files_checklist:
-        intermediate_files_path = os.path.join(input_folder, "additional_data", "intermediate_files")
-        if not verify_folder(intermediate_files_path, element):
-            issues.append("{} is not found.".format(os.path.join(intermediate_files_path, element)))
-    if len(issues) > 0:
-        with open("PREVIOUS_CATALOGUE_STRUCTURE_ERRORS.txt", "w") as f:
-            f.write("\n".join(issues))
-        logging.error("Catalogue structure issues found")
-    else:
-        with open("PREVIOUS_CATALOGUE_STRUCTURE_OK.txt", "w") as f:
-            logging.info("Catalogue structure OK")
+def check_structure(base_folder: Path, structure: dict):
+    """Recursively verify that expected structure exists under base_folder."""
+    issues = []
 
+    for key, expected_items in structure.items():
+        folder_path = base_folder / key
+        if not folder_path.exists():
+            issues.append(f"Missing folder: {folder_path}")
+            continue
+
+        for item in expected_items:
+            # If it's a string → file or folder expected
+            if isinstance(item, str):
+                item_path = folder_path / item
+                if not item_path.exists():
+                    issues.append(f"Missing: {item_path}")
+
+            # If it's a dict → nested structure
+            elif isinstance(item, dict):
+                issues.extend(check_structure(folder_path, item))
+
+    return issues
 
-def verify_folder(main_path, element_to_check):
-    if os.path.exists(os.path.join(main_path, element_to_check)):
-        return True
-    else:
-        return False
 
+def main():
+    parser = argparse.ArgumentParser(
+        description="Validate folder structure of the previous catalogue version using a JSON definition."
+    )
+    parser.add_argument(
+        "-i", "--input_folder", required=True,
+        help="Path to the output folder of the previous catalogue version. Folders 'ftp' and 'additional_data' should "
+             "be inside of it."
+    )
+    parser.add_argument(
+        "-s", "--schema", required=True,
+        help="Path to JSON file defining expected folder structure."
+    )
+    args = parser.parse_args()
 
-def parse_args():
-    parser = argparse.ArgumentParser(description='The script is part of the catalogue update pipeline. It checks '
-                                                 'that all expected files from the previous version of the catalogue '
-                                                 'are present in the expected locations.')
-    parser.add_argument('-i', dest='input_folder', required=True, help='Location of the previous catalogue. '
-                                                                       'Folders "ftp", "website", "additional_data" '
-                                                                       'should be inside this folder')
+    base_folder = Path(args.input_folder)
+    schema_path = Path(args.schema)
 
-    return parser.parse_args()
+    if not base_folder.exists():
+        logging.error(f"Input folder does not exist: {base_folder}")
+        sys.exit(1)
+
+    if not schema_path.exists():
+        logging.error(f"Schema file not found: {schema_path}")
+        sys.exit(1)
+
+    with open(schema_path) as f:
+        expected_structure = json.load(f)
+
+    # Validate JSON format itself
+    validate(
+        instance=expected_structure,
+        schema={
+            "type": "object",
+            "patternProperties": {
+                ".*": {
+                    "type": "array",
+                    "items": {
+                        "anyOf": [
+                            {"type": "string"},
+                            {"type": "object"}
+                        ]
+                    }
+                }
+            }
+        }
+    )
+
+    issues = check_structure(base_folder, expected_structure)
+
+    if issues:
+        Path("PREVIOUS_CATALOGUE_STRUCTURE_ERRORS.txt").write_text("\n".join(issues))
+        logging.error("Catalogue structure issues found.")
+    else:
+        Path("PREVIOUS_CATALOGUE_STRUCTURE_OK.txt").write_text("Catalogue structure OK")
+        logging.info("Catalogue structure OK.")
 
 
-if __name__ == '__main__':
-    args = parse_args()
-    main(args.input_folder)
+if __name__ == "__main__":
+    main()
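
One detail worth noting: validate() raises jsonschema.ValidationError when the structure file is not a mapping of folder names to lists of strings or objects, so a malformed definition fails fast before any filesystem checks run. A small sketch of that guard; the meta-schema is copied from the script, and the bad input is invented:

from jsonschema import ValidationError, validate

META_SCHEMA = {
    "type": "object",
    "patternProperties": {
        ".*": {
            "type": "array",
            "items": {"anyOf": [{"type": "string"}, {"type": "object"}]},
        }
    },
}

# Invented malformed input: "ftp" must map to a list, not a string.
try:
    validate(instance={"ftp": "genomes-all_metadata.tsv"}, schema=META_SCHEMA)
except ValidationError as err:
    print(err.message)  # 'genomes-all_metadata.tsv' is not of type 'array'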

bin/classify_folders.py

Lines changed: 13 additions & 6 deletions
@@ -16,6 +16,8 @@
 # along with MGnify genome analysis pipeline. If not, see <https://www.gnu.org/licenses/>.
 
 
+import glob
+import sys
 import os
 import shutil
 import argparse
@@ -34,7 +35,7 @@ def classify_split_folders(input_folder):
     clusters = os.listdir(drep_clusters)
     for cluster in clusters:
         dir_files = os.listdir(os.path.join(drep_clusters, cluster))
-        genomes = [i for i in dir_files if i.endswith(".fa")]
+        genomes = [i for i in dir_files if i.endswith((".fa", ".fna"))]
         number_of_genomes = len(genomes)
         path_cluster_many = os.path.join(NAME_MANY_GENOMES, cluster)
         path_cluster_one = os.path.join(NAME_ONE_GENOME, cluster)
@@ -67,14 +68,20 @@ def classify_by_file(split_text, genomes_folder):
         for line in file_in:
             main_folder, cluster, genomes_str = line.strip().split(":")
             genomes = genomes_str.split(",")
-            cluster_name = genomes[0].split(".")[0]  # cluster
+            cluster_name = genomes[0].rsplit(".", 1)[0]  # cluster
             path_cluster = os.path.join(main_folder, cluster_name)
             if not os.path.exists(path_cluster):
                 os.mkdir(path_cluster)
             for genome in genomes:
-                old_path = os.path.join(genomes_folder, genome)
-                new_path = os.path.join(path_cluster, genome)
-                shutil.copy(old_path, new_path)
+                base_name, _ = os.path.splitext(genome)  # remove extension from genome filename in cluster split file
+                pattern = os.path.join(genomes_folder, base_name + ".*")  # match any extension
+                matches = glob.glob(pattern)
+                if matches:
+                    old_path = matches[0]  # take the first matching file
+                    new_path = os.path.join(path_cluster, os.path.basename(old_path))
+                    shutil.copy(old_path, new_path)
+                else:
+                    sys.exit("Cannot find expected genome {}".format(genome))
 
 
 if __name__ == "__main__":
@@ -113,7 +120,7 @@ def classify_by_file(split_text, genomes_folder):
         os.makedirs(NAME_ONE_GENOME)
 
     if args.input_folder:
-        print("Classify splitted folders")
+        print("Classify split folders")
         classify_split_folders(args.input_folder)
     elif args.text_file:
         if args.genomes:
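
The cluster split file parsed by classify_by_file is colon-delimited: destination folder, cluster id, then a comma-separated genome list. A hedged sketch of the parsing with an invented line; note that rsplit(".", 1) strips only the final extension, so accessions containing dots survive intact:

# Invented example line from a cluster split file:
line = "many_genomes_folder:cluster_1:MGYG000000001.fa,MGYG000000002.fa\n"

main_folder, cluster, genomes_str = line.strip().split(":")
genomes = genomes_str.split(",")

# "genome.v2.fa" maps to "genome.v2" rather than "genome"
cluster_name = genomes[0].rsplit(".", 1)[0]
print(main_folder, cluster, genomes, cluster_name)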

bin/create_metadata_table.py

Lines changed: 9 additions & 2 deletions
@@ -29,6 +29,8 @@
 
 logging.basicConfig(level=logging.INFO)
 
+UNINFORMATIVE = {"not collected", "not present", "na", "n/a", "missing", "not applicable"}
+
 
 def main(
     genomes_dir,
@@ -137,14 +139,19 @@ def add_sample_project_loc(df, location_file, previous_version_data):
     df = df.merge(location_data_df, on='Genome_accession', how='left')
 
     # If previous version data is available, override where applicable
+    # Replace uninformative country/continent with "not provided" as is already done for all new genomes
     if not previous_version_data.empty:
         prev_data_dict = previous_version_data.set_index("Genome").to_dict(orient="index")
         for idx, row in df.iterrows():
             genome = row["Genome"]
             if genome in prev_data_dict:
                 for col in ["Sample_accession", "Study_accession", "Country", "Continent"]:
-                    # TODO: add checks that this is informative (not "not provided")
-                    df.at[idx, col] = prev_data_dict[genome][col]
+                    val = prev_data_dict[genome][col]
+                    if col in ["Country", "Continent"]:
+                        if isinstance(val, str) and val.strip().lower() in UNINFORMATIVE:
+                            df.at[idx, col] = "not provided"
+                            continue
+                    df.at[idx, col] = val
 
     return df
 
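
In isolation, the override rule now applied to carried-over metadata looks like this (values invented): any country or continent matching the UNINFORMATIVE set becomes "not provided", mirroring what is already done for new genomes:

UNINFORMATIVE = {"not collected", "not present", "na", "n/a", "missing", "not applicable"}

# Hypothetical previous-version values for one genome
prev = {"Country": "Not Collected", "Continent": "Europe"}

normalised = {}
for col in ["Country", "Continent"]:
    val = prev[col]
    if isinstance(val, str) and val.strip().lower() in UNINFORMATIVE:
        normalised[col] = "not provided"
    else:
        normalised[col] = val

print(normalised)  # {'Country': 'not provided', 'Continent': 'Europe'}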

bin/extract_info_from_metadata_table.py

Lines changed: 14 additions & 4 deletions
@@ -17,12 +17,15 @@
 
 import argparse
 import csv
+import os
 import sys
 
 
-def main(metadata_table, prefix):
+def main(metadata_table, prefix, genomes_folder):
     outfile_quality = f'{prefix}_checkm_quality.csv'
     outfile_stats = f'{prefix}_assembly_stats.tsv'
+    # Make a dictionary from the genomes FASTA folder where keys=accessions, values=FASTA file names with extensions
+    genome_dict = {os.path.splitext(f)[0]: f for f in os.listdir(genomes_folder)}
     with (open(metadata_table, "r") as file_in, open(outfile_quality, "w") as checkm_out,
           open(outfile_stats, "w") as stats_out):
         csv_writer_checkm = csv.writer(checkm_out)
@@ -43,9 +46,14 @@ def main(metadata_table, prefix):
 
         for line in file_in:
             parts = line.strip().split("\t")
-            csv_writer_stats.writerow([parts[genome_idx], parts[length_idx], parts[n50_idx], parts[gc_idx],
+            genome_name = parts[genome_idx]  # this is the genome name without extension
+            try:
+                fasta_name = genome_dict[genome_name]
+            except KeyError:
+                raise KeyError(f"Genome '{genome_name}' not found in the genome directory")
+            csv_writer_stats.writerow([genome_name, parts[length_idx], parts[n50_idx], parts[gc_idx],
                                        parts[num_contigs_idx]])
-            csv_writer_checkm.writerow([f"{parts[genome_idx]}.fa", parts[comp_idx], parts[cont_idx]])
+            csv_writer_checkm.writerow([fasta_name, parts[comp_idx], parts[cont_idx]])
@@ -62,10 +70,12 @@ def parse_args():
     parser.add_argument('-i', dest='metadata_table', required=True, help='Location of the metadata table from the '
                                                                          'previous catalogue version.')
     parser.add_argument('-o', dest='prefix', required=True, help='Prefix for the output files.')
+    parser.add_argument('--genomes', required=True, help='Path to the mgyg_genomes folder for the previous catalogue '
+                                                         'version.')
 
     return parser.parse_args()
 
 
 if __name__ == '__main__':
     args = parse_args()
-    main(args.metadata_table, args.prefix)
+    main(args.metadata_table, args.prefix, args.genomes)
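
The genome_dict lookup decouples accessions in the metadata table from the on-disk FASTA extension. A hedged sketch with an invented directory listing:

import os

# Invented listing of a previous-version mgyg_genomes folder
listing = ["MGYG000000001.fna", "MGYG000000002.fa"]

# keys = accessions without extension, values = full FASTA file names
genome_dict = {os.path.splitext(f)[0]: f for f in listing}

print(genome_dict["MGYG000000001"])  # MGYG000000001.fna
# In the script, a missing accession raises KeyError with a clear message.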
