HiFi-human-WGS-editing-QC-WDL/genbank_to_crispr_json.py at main · yttria-aniseia/HiFi-human-WGS-editing-QC-WDL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env python3
"""
Convert GenBank format CRISPR edit plasmid files to CRISPR edit JSON format.

Extracts homology arms, payload sequence, and payload components from GenBank
annotation and fetches target gene location from Ensembl.
"""

import argparse
import json
import re
import sys
from typing import Dict, List, Optional, Tuple

try:
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature
except ImportError as e:
    print(f"Error: Biopython not found. Please install it:", file=sys.stderr)
    print(f"  pip install biopython", file=sys.stderr)
    sys.exit(1)

import urllib.request
import urllib.error


def find_feature_by_label(features: List[SeqFeature], patterns: List[str],
                          feature_name: str) -> Optional[SeqFeature]:
    """
    Find a feature by matching its label against a list of patterns (case-insensitive).

    Args:
        features: List of SeqFeature objects from the GenBank record
        patterns: List of regex patterns to match against the label
        feature_name: Name of the feature (for error messages)

    Returns:
        The matching SeqFeature or None if not found
    """
    for feature in features:
        if 'label' in feature.qualifiers:
            label = feature.qualifiers['label'][0]
            for pattern in patterns:
                if re.search(pattern, label, re.IGNORECASE):
                    return feature
    return None


def get_feature_sequence(feature: SeqFeature, record) -> str:
    """Extract the sequence for a feature from the record."""
    return str(feature.extract(record.seq))


def find_payload_components(features: List[SeqFeature], record, payload_start: int,
                            payload_end: int) -> List[Dict[str, str]]:
    """
    Find all sub-features within the payload region.

    Args:
        features: List of all SeqFeature objects
        record: The SeqRecord object containing the sequence
        payload_start: Start position of payload (0-based)
        payload_end: End position of payload (0-based)

    Returns:
        List of NamedSequence dicts with 'name' and 'seq' keys
    """
    components = []

    for feature in features:
        # Skip the payload feature itself
        if 'label' in feature.qualifiers:
            label = feature.qualifiers['label'][0]
            if re.search(r'^payload$', label, re.IGNORECASE):
                continue

        # Check if feature is within payload region
        feat_start = int(feature.location.start)
        feat_end = int(feature.location.end)

        # Feature must be completely contained within payload
        if feat_start >= payload_start and feat_end <= payload_end:
            if 'label' in feature.qualifiers:
                name = feature.qualifiers['label'][0]
                # Only process misc_feature types, skip CDS
                if feature.type == 'misc_feature':
                    # Get the sequence
                    seq = str(feature.extract(record.seq))

                    components.append({
                        'name': name,
                        'seq': seq
                    })

    # Sort by start position to maintain order
    components.sort(key=lambda x: int(features[
        [i for i, f in enumerate(features)
         if f.qualifiers.get('label', [''])[0] == x['name']][0]
    ].location.start))

    return components


def fetch_gene_location(transcript_id: str) -> Dict[str, any]:
    """
    Fetch gene location from Ensembl REST API for a given transcript ID.

    Args:
        transcript_id: ENSEMBL transcript ID (e.g., ENST00000356245)

    Returns:
        Dict with keys: chr, start, end, symbol, strand

    Raises:
        Exception if the transcript cannot be found or API fails
    """
    # Use Ensembl REST API
    server = "https://rest.ensembl.org"
    ext = f"/lookup/id/{transcript_id}?expand=1"

    try:
        req = urllib.request.Request(
            server + ext,
            headers={"Content-Type": "application/json"}
        )
        with urllib.request.urlopen(req) as response:
            if response.status != 200:
                raise Exception(f"Failed to fetch transcript {transcript_id}: HTTP {response.status}")
            data = json.loads(response.read().decode())

        # Extract chromosome (remove 'chr' prefix if present, add it back)
        chrom = data['seq_region_name']
        if not chrom.startswith('chr'):
            chrom = f'chr{chrom}'

        # Get gene symbol from display_name or gene name
        symbol = None
        if 'display_name' in data:
            symbol = data['display_name']
        elif 'Parent' in data:
            # Fetch parent gene
            gene_id = data['Parent']
            gene_ext = f"/lookup/id/{gene_id}"
            gene_req = urllib.request.Request(
                server + gene_ext,
                headers={"Content-Type": "application/json"}
            )
            try:
                with urllib.request.urlopen(gene_req) as gene_response:
                    if gene_response.status == 200:
                        gene_data = json.loads(gene_response.read().decode())
                        symbol = gene_data.get('display_name', gene_id)
            except urllib.error.URLError:
                pass  # Continue without symbol

        return {
            'chr': chrom,
            'start': data['start'],
            'end': data['end'],
            'symbol': symbol,
            'strand': '+' if data['strand'] == 1 else '-'
        }

    except urllib.error.URLError as e:
        raise Exception(f"Network error fetching transcript {transcript_id}: {e}")
    except (KeyError, json.JSONDecodeError) as e:
        raise Exception(f"Error parsing Ensembl response for {transcript_id}: {e}")


def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str,
                                  transcript_id: str, grna_seq: str,
                                  edit_type: str = "INS",
                                  strand: Optional[str] = None) -> Dict:
    """
    Parse GenBank file and generate CRISPR edit JSON.

    Args:
        genbank_path: Path to GenBank format file
        edit_id: Identifier for this edit
        transcript_id: ENSEMBL transcript ID (e.g., ENST00000356245)
        grna_seq: gRNA sequence
        edit_type: Type of edit (SNV, INS, or DEL), default INS
        strand: Strand of edit (+ or -), if None will use gene strand

    Returns:
        Dict matching the CRISPR edit JSON schema
    """
    # Parse GenBank file
    try:
        record = SeqIO.read(genbank_path, "genbank")
    except Exception as e:
        print(f"Error reading GenBank file {genbank_path}: {e}", file=sys.stderr)
        sys.exit(1)

    features = list(record.features)

    # Find left homology arm
    left_ha_patterns = [r"5['\u2019\u0027]?[\s_]*(homology|ha)", r"left\s+ha\b", r"left\s+homology\s+arm"]
    left_ha_feature = find_feature_by_label(features, left_ha_patterns, "left homology arm")
    if not left_ha_feature:
        print("Error: Could not find left homology arm feature", file=sys.stderr)
        print("  Expected label matching: 5' Homology, Left HA, or Left Homology Arm", file=sys.stderr)
        sys.exit(1)

    # Find right homology arm
    right_ha_patterns = [r"3['\u2019\u0027]?[\s_]*(homology|ha)", r"right\s+ha\b", r"right\s+homology\s+arm"]
    right_ha_feature = find_feature_by_label(features, right_ha_patterns, "right homology arm")
    if not right_ha_feature:
        print("Error: Could not find right homology arm feature", file=sys.stderr)
        print("  Expected label matching: 3' Homology, Right HA, or Right Homology Arm", file=sys.stderr)
        sys.exit(1)

    # Find payload
    payload_patterns = [r"^payload$"]
    payload_feature = find_feature_by_label(features, payload_patterns, "payload")
    if not payload_feature:
        print("Error: Could not find payload feature", file=sys.stderr)
        print("  Expected label: Payload", file=sys.stderr)
        sys.exit(1)

    # Extract sequences
    left_ha_seq = get_feature_sequence(left_ha_feature, record)
    right_ha_seq = get_feature_sequence(right_ha_feature, record)
    payload_seq = get_feature_sequence(payload_feature, record)

    # Find payload components
    payload_start = int(payload_feature.location.start)
    payload_end = int(payload_feature.location.end)
    payload_components = find_payload_components(features, record, payload_start, payload_end)

    # Fetch gene location
    print(f"Fetching gene location for {transcript_id}...", file=sys.stderr)
    try:
        gene_info = fetch_gene_location(transcript_id)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Use gene strand if not specified
    if strand is None:
        strand = gene_info['strand']

    # Build the JSON structure
    crispr_json = {
        "ref": "hg38",
        "edits": [
            {
                "id": edit_id,
                "target": {
                    "chr": gene_info['chr'],
                    "start": gene_info['start'],
                    "end": gene_info['end'],
                },
                "edit": {
                    "type": edit_type,
                    "strand": strand,
                    "guide": grna_seq,
                    "left_ha": left_ha_seq,
                    "payload": payload_seq,
                    "right_ha": right_ha_seq,
                }
            }
        ]
    }

    # Add optional symbol if available
    if gene_info.get('symbol'):
        crispr_json['edits'][0]['target']['symbol'] = gene_info['symbol']

    # Add payload components if found
    if payload_components:
        crispr_json['edits'][0]['edit']['payload_components'] = payload_components

    return crispr_json


def main():
    parser = argparse.ArgumentParser(
        description='Convert GenBank CRISPR plasmid to CRISPR edit JSON format',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  %(prog)s -i plasmid.gb -e g3bp1_n_term_tag -t ENST00000356245 \\
           -g CGCCCGACCAGCAGGGGACT -o g3bp1_edit.json

  %(prog)s -i plasmid.gb -e my_edit -t ENST00000356245 \\
           -g CGCCCGACCAGCAGGGGACT --type INS --strand +
        """
    )

    parser.add_argument('-i', '--input', required=True,
                       help='Input GenBank format file')
    parser.add_argument('-o', '--output', required=True,
                       help='Output JSON file')
    parser.add_argument('-e', '--edit-id', required=True,
                       help='Edit identifier (e.g., g3bp1_n_term_tag)')
    parser.add_argument('-t', '--transcript', required=True,
                       help='ENSEMBL transcript ID (e.g., ENST00000356245)')
    parser.add_argument('-g', '--grna', required=True,
                       help='gRNA sequence (e.g., CGCCCGACCAGCAGGGGACT)')
    parser.add_argument('--type', choices=['SNV', 'INS', 'DEL'], default='INS',
                       help='Edit type (default: INS)')
    parser.add_argument('--strand', choices=['+', '-'],
                       help='Edit strand (default: use gene strand)')

    args = parser.parse_args()

    # Generate JSON
    crispr_json = parse_genbank_to_crispr_json(
        args.input,
        args.edit_id,
        args.transcript,
        args.grna,
        args.type,
        args.strand
    )

    # Write output
    with open(args.output, 'w') as f:
        json.dump(crispr_json, f, indent=2)

    print(f"Successfully wrote CRISPR edit JSON to {args.output}", file=sys.stderr)
    print(f"  Edit ID: {args.edit_id}", file=sys.stderr)
    print(f"  Target: {crispr_json['edits'][0]['target']['chr']}:"
          f"{crispr_json['edits'][0]['target']['start']}-"
          f"{crispr_json['edits'][0]['target']['end']}", file=sys.stderr)
    if 'symbol' in crispr_json['edits'][0]['target']:
        print(f"  Gene: {crispr_json['edits'][0]['target']['symbol']}", file=sys.stderr)
    print(f"  Payload components: {len(crispr_json['edits'][0]['edit'].get('payload_components', []))}",
          file=sys.stderr)


if __name__ == '__main__':
    main()