-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatabase_validator.py
More file actions
194 lines (152 loc) · 6.11 KB
/
database_validator.py
File metadata and controls
194 lines (152 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import jsonschema
from pathlib import Path
import json
import csv
import pyfastx
import hashlib
import pandas as pd
def md5_checksum(sequence: str) -> str:
    """Return the hex-encoded MD5 digest of a sequence string.

    Args:
        sequence (str): The input sequence.

    Returns:
        str: The MD5 checksum of the sequence as a lowercase hex string.
    """
    # Encode to UTF-8 bytes before hashing; hashlib operates on bytes only.
    return hashlib.md5(sequence.encode("utf-8")).hexdigest()
def import_taxonomy_names(names_dmp_path: Path) -> dict:
    """Build a taxon-ID -> scientific-name lookup from a names.dmp file.

    Args:
        names_dmp_path (Path): The path to the names.dmp file.

    Returns:
        dict: A dictionary mapping taxonomic IDs to their scientific names.
    """
    lookup = {}
    with open(names_dmp_path, "r") as handle:
        for raw_line in handle:
            # Fields are separated by "\t|"; strip the leading tab each
            # split leaves on every field after the first.
            columns = [column.lstrip() for column in raw_line.split("\t|")]
            # Column layout (per names.dmp): 0=tax_id, 1=name_txt,
            # 2=unique name, 3=name class.
            if "scientific name" in columns[3]:
                lookup[columns[0]] = columns[1]
    return lookup
def validate_metadata_tsv(metadata: Path, schema_path: Path) -> None:
    """Validate the metadata TSV file against the provided JSON schema.

    Args:
        metadata (Path): The path to the metadata TSV file.
        schema_path (Path): The path to the JSON schema file.

    Raises:
        jsonschema.ValidationError: If the data does not conform to the schema.
        jsonschema.SchemaError: If the schema itself is invalid.
    """
    schema = json.loads(Path(schema_path).read_text())
    # Validate the table as a list of one-dict-per-row records.
    records = pd.read_csv(metadata, sep="\t").to_dict("records")
    jsonschema.validate(instance=records, schema=schema)
def validate_index_json(index: Path, schema_path: Path) -> None:
    """Validate the database index JSON file against the provided JSON schema.

    Args:
        index (Path): The path to the database index JSON file.
        schema_path (Path): The path to the JSON schema file.

    Raises:
        jsonschema.ValidationError: If the data does not conform to the schema.
        jsonschema.SchemaError: If the schema itself is invalid.
    """
    schema = json.loads(Path(schema_path).read_text())
    index_data = json.loads(Path(index).read_text())
    jsonschema.validate(instance=index_data, schema=schema)
def run(args):
    """Run all database validation checks and raise on the first failure.

    Checks, in order: metadata TSV schema, index JSON schema, taxonomy
    ID/name consistency, and FASTA header/description/MD5/length agreement
    with the metadata (including entry-count parity).

    Args:
        args: Parsed CLI arguments providing ``names_dmp``, ``metadata``,
            ``metadata_schema``, ``index``, ``index_schema``, and ``fasta``
            paths.

    Raises:
        ValueError: If any taxonomy or FASTA/metadata inconsistency is found.
        jsonschema.ValidationError: If a file fails schema validation.
        jsonschema.SchemaError: If a schema itself is invalid.
    """
    name_lookup = import_taxonomy_names(args.names_dmp)
    with open(args.metadata, "r") as f:
        reader = csv.DictReader(f, delimiter="\t")
        metadata = [x for x in reader]
    # Basic structural validation of the metadata TSV
    validate_metadata_tsv(args.metadata, args.metadata_schema)
    print(f"Metadata TSV {args.metadata} passed schema validation.")
    # Basic structural validation of the database index JSON
    validate_index_json(args.index, args.index_schema)
    print(f"Database index JSON {args.index} passed schema validation.")
    # Check that all taxonomic IDs in the metadata are valid
    for row in metadata:
        taxon_id = row.get("taxon_id")
        if taxon_id not in name_lookup:
            raise ValueError(
                f"taxon_id: {taxon_id} does not exist in the taxonomy database provided. Observed in record: {row}"
            )
        if row["human_readable"] != name_lookup[taxon_id]:
            raise ValueError(
                f"Mismatch between taxon_id: {taxon_id} and human_readable name: {row['human_readable']}."
            )
    # Validate FASTA file order, headers, and md5 checksums.
    # BUG FIX: the previous zip(metadata, fasta) silently truncated to the
    # shorter input, so fasta_entries could never exceed len(metadata) and
    # extra FASTA records went undetected. Iterate the FASTA fully and
    # index into metadata instead, so both directions of count mismatch
    # are caught by the final check.
    fasta = pyfastx.Fastx(str(args.fasta), uppercase=True, comment=True)
    fasta_entries = 0
    for name, seq, comment in fasta:
        if fasta_entries < len(metadata):
            metadata_row = metadata[fasta_entries]
            if metadata_row["unique_accession"] != name:
                raise ValueError(
                    f"FASTA header {name} does not match unique_accession {metadata_row['unique_accession']} in metadata. This may be due to ordering issues or incorrect metadata / FASTA header."
                )
            if metadata_row["accession_description"] != comment:
                raise ValueError(
                    f"FASTA description for {name} does not match accession_description '{metadata_row['accession_description']}' in metadata."
                )
            if metadata_row["sequence_md5"] != md5_checksum(seq):
                raise ValueError(
                    f"FASTA sequence for {name} does not match sequence_md5 '{metadata_row['sequence_md5']}' in metadata."
                )
            if int(metadata_row["sequence_length"]) != len(seq):
                raise ValueError(
                    f"FASTA sequence length for {name}: {len(seq)} does not match sequence_length '{metadata_row['sequence_length']}' in metadata."
                )
        fasta_entries += 1
    if fasta_entries != len(metadata):
        raise ValueError(
            f"Number of entries in FASTA ({fasta_entries}) does not match number of entries in metadata ({len(metadata)})."
        )
    print(f"Validation completed successfully. {fasta_entries} entries checked.")
def main():
    """CLI entry point: build the argument parser and run the validator."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Validate a metadata TSV file against a JSON schema and check taxonomic IDs."
    )
    # All arguments are required Path flags; declare them table-driven.
    required_path_args = (
        ("--names_dmp", "Path to the names.dmp file for taxonomy lookup."),
        ("--metadata_schema", "Path to the JSON schema file for metadata TSV validation."),
        ("--index_schema", "Path to the JSON schema file for database index JSON validation."),
        ("--metadata", "Path to the metadata TSV file to validate."),
        ("--index", "Path to the database index JSON file to validate."),
        ("--fasta", "Path to the FASTA file to validate."),
    )
    for flag, help_text in required_path_args:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    run(parser.parse_args())
# Script entry point: run the validator only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()