Skip to content

Commit e983178

Browse files
Refactor grid search: updated grid search script, global scoring and fix environment
1 parent 6f48d33 commit e983178

File tree

9 files changed

+954
-65
lines changed

9 files changed

+954
-65
lines changed

docs/source/tutorials/case_studies/atlas_antibodies.ipynb

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,14 @@
325325
"sequences = data_filtered['cleaned_preds'].tolist()"
326326
]
327327
},
328+
{
329+
"cell_type": "markdown",
330+
"id": "26a812f6",
331+
"metadata": {},
332+
"source": [
333+
"### DBG weighted"
334+
]
335+
},
328336
{
329337
"cell_type": "code",
330338
"execution_count": null,
@@ -382,12 +390,176 @@
382390
"scaffolds"
383391
]
384392
},
393+
{
394+
"cell_type": "markdown",
395+
"id": "5be34a36",
396+
"metadata": {},
397+
"source": [
398+
"### DBG weighted refined"
399+
]
400+
},
385401
{
386402
"cell_type": "code",
387403
"execution_count": null,
388404
"id": "40d4a8be",
389405
"metadata": {},
390406
"outputs": [],
407+
"source": [
408+
"assembler = Assembler(\n",
409+
" mode=\"dbg_weighted\",\n",
410+
" kmer_size=6,\n",
411+
" min_overlap=2,\n",
412+
" size_threshold=10,\n",
413+
" min_weight=2,\n",
414+
" refine_rounds=10\n",
415+
")"
416+
]
417+
},
418+
{
419+
"cell_type": "code",
420+
"execution_count": null,
421+
"id": "f0e04211",
422+
"metadata": {},
423+
"outputs": [],
424+
"source": [
425+
"scaffolds = assembler.run(\n",
426+
" sequences=sequences, \n",
427+
" df_full=data_filtered\n",
428+
")"
429+
]
430+
},
431+
{
432+
"cell_type": "code",
433+
"execution_count": null,
434+
"id": "d447b512",
435+
"metadata": {},
436+
"outputs": [],
437+
"source": [
438+
"print(len(scaffolds))"
439+
]
440+
},
441+
{
442+
"cell_type": "code",
443+
"execution_count": null,
444+
"id": "238dec65",
445+
"metadata": {},
446+
"outputs": [],
447+
"source": [
448+
"# show me all the scaffolds\n",
449+
"for scaffold in scaffolds:\n",
450+
" print(scaffold)"
451+
]
452+
},
453+
{
454+
"cell_type": "code",
455+
"execution_count": null,
456+
"id": "ec9903d3",
457+
"metadata": {},
458+
"outputs": [],
459+
"source": [
460+
"# plot the distribution of scaffold lengths\n",
461+
"scaffold_lengths = [len(scaffold) for scaffold in scaffolds]\n",
462+
"plt.figure(figsize=(10, 6))\n",
463+
"sns.histplot(scaffold_lengths, bins=20, kde=False)\n",
464+
"plt.title(\"Distribution of scaffold lengths\")\n",
465+
"plt.xlabel(\"Scaffold length\")\n",
466+
"plt.ylabel(\"Frequency\")\n",
467+
"plt.show()"
468+
]
469+
},
470+
{
471+
"cell_type": "code",
472+
"execution_count": null,
473+
"id": "51bc2815",
474+
"metadata": {},
475+
"outputs": [],
476+
"source": []
477+
},
478+
{
479+
"cell_type": "markdown",
480+
"id": "39bf42e1",
481+
"metadata": {},
482+
"source": [
483+
"## Calculate coverage"
484+
]
485+
},
486+
{
487+
"cell_type": "code",
488+
"execution_count": null,
489+
"id": "5a32472c",
490+
"metadata": {},
491+
"outputs": [],
492+
"source": [
493+
"from Bio import Align\n",
494+
"import numpy as np\n",
495+
"\n",
496+
"def calculate_fuzzy_coverage(reference_seq, peptides, min_identity=0.9):\n",
"    \"\"\"Estimate how much of a reference sequence is covered by peptides.\n",
"\n",
"    Each peptide is locally aligned against the gap-stripped reference. When\n",
"    identities / len(peptide) reaches ``min_identity``, the reference positions\n",
"    spanned by the aligned blocks are marked as covered.\n",
"\n",
"    Args:\n",
"        reference_seq: Reference sequence; \"-\" characters are removed first.\n",
"        peptides: Iterable of peptide/scaffold strings. A single string is\n",
"            treated as one peptide instead of being iterated per residue.\n",
"        min_identity: Minimum identities / len(peptide) for a hit to count.\n",
"\n",
"    Returns:\n",
"        Tuple (coverage_pct, covered_count, ref_len) as (float, int, int).\n",
"    \"\"\"\n",
"    # 1. Clean reference\n",
"    clean_ref = reference_seq.replace(\"-\", \"\")\n",
"    ref_len = len(clean_ref)\n",
"\n",
"    if ref_len == 0:\n",
"        return 0.0, 0, 0\n",
"\n",
"    # A bare string would be iterated residue by residue, and every\n",
"    # one-letter \"peptide\" matches somewhere, which inflates coverage.\n",
"    if isinstance(peptides, str):\n",
"        peptides = [peptides]\n",
"\n",
"    # 2. Mask setup\n",
"    coverage_mask = np.zeros(ref_len, dtype=bool)\n",
"\n",
"    # 3. Aligner setup\n",
"    aligner = Align.PairwiseAligner()\n",
"    aligner.mode = 'local'\n",
"    aligner.open_gap_score = -10\n",
"    aligner.extend_gap_score = -1\n",
"\n",
"    for pep in peptides:\n",
"        if len(pep) > ref_len or len(pep) == 0:\n",
"            continue\n",
"\n",
"        # Take the first optimal alignment lazily. Truth-testing the result\n",
"        # calls len(), which can raise OverflowError when there are very\n",
"        # many equally good alignments.\n",
"        best_aln = next(iter(aligner.align(clean_ref, pep)), None)\n",
"        if best_aln is None:\n",
"            continue\n",
"\n",
"        # Identity is measured against the full peptide length, so a short\n",
"        # perfect local hit inside a long peptide does not pass the filter.\n",
"        matches = best_aln.counts().identities\n",
"        identity = matches / len(pep)\n",
"\n",
"        if identity >= min_identity:\n",
"            # best_aln.aligned[0] holds the (start, end) blocks on the reference\n",
"            for start, end in best_aln.aligned[0]:\n",
"                coverage_mask[start:end] = True\n",
"\n",
"    # 4. Stats (cast to plain Python numbers to match the early return)\n",
"    covered_count = int(coverage_mask.sum())\n",
"    coverage_pct = covered_count / ref_len * 100\n",
"\n",
"    return coverage_pct, covered_count, ref_len\n",
539+
"\n",
540+
"# --- Example Usage with your data ---\n",
541+
"\n",
542+
"reference_data = {\n",
543+
" \"heavy\": \"EVQLVESGGGLVKPGGSLKLSCAAS--------MSWVRQTPEKRLEWVAT--------SYPDSMKGRFTVSRDSAKNTLYLQMSSLRSEDTAMYY------------GQGTTLTVSSAKTTPPSV\",\n",
544+
" \"light\": \"DVVLTQTPLSLPVNLGDQASLSCKST-----------LDWYVQKPGQSPQPLLY---NRFSGVPDRFSGSGSGTDFTLKLTRVEAEDLGLYY-----------GSGTNLELRRADAAPTVS\"\n",
545+
"}\n",
546+
"\n",
547+
"# Example peptides (mix of perfect and slightly mutated for testing)\n",
548+
"# In real case, this list comes from your filtered InstaNovo output\n",
549+
"\n",
550+
"\n",
551+
"print(f\"{'Chain':<10} | {'Cov %':<10} | {'Residues':<10}\")\n",
552+
"print(\"-\" * 35)\n",
553+
"\n",
554+
"# Score each reference chain against the full list of assembled scaffolds\n",
"# (not `scaffold`, the loop variable left over from an earlier cell).\n",
"for chain_name, ref_seq in reference_data.items():\n",
555+
" cov_pct, cov_res, total_res = calculate_fuzzy_coverage(ref_seq, scaffolds, min_identity=0.85)\n",
556+
" print(f\"{chain_name:<10} | {cov_pct:6.2f}% | {cov_res}/{total_res}\")"
557+
]
558+
},
559+
{
560+
"cell_type": "markdown",
561+
"id": "c5bb4bc7",
562+
"metadata": {},
391563
"source": []
392564
}
393565
],

environment.osx-arm64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies:
99
- python=3.11
1010
- biopython=1.85
1111
- pandas=2.3.1
12+
- scikit-learn
1213
- upsetplot
1314
- tqdm=4.67.1
1415
- seaborn=0.13.2

grid_master.log

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
========================================
2+
PROCESSING MODE: greedy
3+
========================================
4+
>>> [greedy] Launching: ma1 (Chain: 'light')
5+
>>> [greedy] Launching: ma1 (Chain: 'heavy')
6+
>>> [greedy] Launching: ma2 (Chain: 'light')
7+
>>> [greedy] Launching: ma2 (Chain: 'heavy')
8+
>>> [greedy] Launching: ma3 (Chain: 'light')
9+
>>> [greedy] Launching: ma3 (Chain: 'heavy')
10+
>>> [greedy] Launching: nb1 (Chain: '')
11+
>>> [greedy] Launching: nb2 (Chain: '')
12+
>>> [greedy] Launching: nb3 (Chain: '')
13+
>>> [greedy] Launching: nb4 (Chain: '')
14+
>>> [greedy] Launching: nb5 (Chain: '')
15+
>>> [greedy] Launching: nb6 (Chain: '')
16+
>>> [greedy] Launching: nb7 (Chain: '')
17+
>>> [greedy] Launching: nb8 (Chain: '')
18+
>>> [greedy] Launching: nb9 (Chain: '')
19+
>>> [greedy] Launching: nb10 (Chain: '')
20+
>>> [greedy] Launching: bsa (Chain: '')
21+
>>> [greedy] Launching: bind1 (Chain: '')
22+
>>> [greedy] Launching: bind2 (Chain: '')
23+
>>> [greedy] Launching: bind3 (Chain: '')
24+
========================================
25+
PROCESSING MODE: dbg_weighted
26+
========================================
27+
>>> [dbg_weighted] Launching: ma1 (Chain: 'light')
28+
>>> [dbg_weighted] Launching: ma1 (Chain: 'heavy')
29+
>>> [dbg_weighted] Launching: ma2 (Chain: 'light')
30+
>>> [dbg_weighted] Launching: ma2 (Chain: 'heavy')
31+
>>> [dbg_weighted] Launching: ma3 (Chain: 'light')
32+
>>> [dbg_weighted] Launching: ma3 (Chain: 'heavy')
33+
>>> [dbg_weighted] Launching: nb1 (Chain: '')
34+
>>> [dbg_weighted] Launching: nb2 (Chain: '')
35+
>>> [dbg_weighted] Launching: nb3 (Chain: '')
36+
>>> [dbg_weighted] Launching: nb4 (Chain: '')
37+
>>> [dbg_weighted] Launching: nb5 (Chain: '')
38+
>>> [dbg_weighted] Launching: nb6 (Chain: '')
39+
>>> [dbg_weighted] Launching: nb7 (Chain: '')
40+
>>> [dbg_weighted] Launching: nb8 (Chain: '')
41+
>>> [dbg_weighted] Launching: nb9 (Chain: '')
42+
>>> [dbg_weighted] Launching: nb10 (Chain: '')
43+
>>> [dbg_weighted] Launching: bsa (Chain: '')
44+
>>> [dbg_weighted] Launching: bind1 (Chain: '')
45+
>>> [dbg_weighted] Launching: bind2 (Chain: '')
46+
>>> [dbg_weighted] Launching: bind3 (Chain: '')
47+
========================================
48+
PROCESSING MODE: multimodal_dbg
49+
========================================
50+
>>> [multimodal_dbg] Launching: ma1 (Chain: 'light')
51+
>>> [multimodal_dbg] Launching: ma1 (Chain: 'heavy')
52+
>>> [multimodal_dbg] Launching: ma2 (Chain: 'light')
53+
>>> [multimodal_dbg] Launching: ma2 (Chain: 'heavy')
54+
>>> [multimodal_dbg] Launching: ma3 (Chain: 'light')
55+
>>> [multimodal_dbg] Launching: ma3 (Chain: 'heavy')
56+
>>> [multimodal_dbg] Launching: nb1 (Chain: '')
57+
>>> [multimodal_dbg] Launching: nb2 (Chain: '')
58+
>>> [multimodal_dbg] Launching: nb3 (Chain: '')
59+
>>> [multimodal_dbg] Launching: nb4 (Chain: '')
60+
>>> [multimodal_dbg] Launching: nb5 (Chain: '')
61+
>>> [multimodal_dbg] Launching: nb6 (Chain: '')
62+
>>> [multimodal_dbg] Launching: nb7 (Chain: '')
63+
>>> [multimodal_dbg] Launching: nb8 (Chain: '')
64+
>>> [multimodal_dbg] Launching: nb9 (Chain: '')
65+
>>> [multimodal_dbg] Launching: nb10 (Chain: '')
66+
>>> [multimodal_dbg] Launching: bsa (Chain: '')
67+
>>> [multimodal_dbg] Launching: bind1 (Chain: '')
68+
>>> [multimodal_dbg] Launching: bind2 (Chain: '')
69+
>>> [multimodal_dbg] Launching: bind3 (Chain: '')
70+
----------------------------------------
71+
All grid search jobs completed.

json/gridsearch_params.json

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,28 @@
11
{
2-
"dbg": {
3-
"kmer_size": [6, 7],
4-
"min_overlap": [3, 4],
5-
"size_threshold": [0, 5, 10],
6-
"max_mismatches": [8, 10, 12, 14],
7-
"min_identity": [0.6, 0.7, 0.8, 0.9],
8-
"conf": [0.86, 0.88, 0.90, 0.92]
2+
"greedy": {
3+
"fdr": [0.05, 0.10, 0.20],
4+
"min_overlap": [3, 4],
5+
"size_threshold": [0, 5, 10],
6+
"refine_rounds": [0, 5]
97
},
108

11-
"greedy": {
9+
"dbg_weighted": {
10+
"fdr": [0.05, 0.10, 0.20],
11+
"kmer_size": [5, 6, 7],
12+
"min_weight": [2, 3],
1213
"min_overlap": [3, 4],
1314
"size_threshold": [0, 5, 10],
14-
"max_mismatches": [8, 10, 12, 14],
15-
"min_identity": [0.6, 0.7, 0.8, 0.9],
16-
"conf": [0.86, 0.88, 0.90, 0.92]
15+
"refine_rounds": [0, 5],
16+
"alpha_len": [1.0],
17+
"alpha_cov": [1.0],
18+
"alpha_min": [0.1]
19+
},
20+
21+
"multimodal_dbg": {
22+
"fdr": [0.05, 0.10, 0.20],
23+
"kmer_size": [5, 6, 7],
24+
"min_weight": [2, 5, 10],
25+
"size_threshold": [0, 5, 10],
26+
"refine_rounds": [0]
1727
}
1828
}

0 commit comments

Comments
 (0)