|
325 | 325 | "sequences = data_filtered['cleaned_preds'].tolist()" |
326 | 326 | ] |
327 | 327 | }, |
| 328 | + { |
| 329 | + "cell_type": "markdown", |
| 330 | + "id": "26a812f6", |
| 331 | + "metadata": {}, |
| 332 | + "source": [ |
| 333 | + "### DBG weighted" |
| 334 | + ] |
| 335 | + }, |
328 | 336 | { |
329 | 337 | "cell_type": "code", |
330 | 338 | "execution_count": null, |
|
382 | 390 | "scaffolds" |
383 | 391 | ] |
384 | 392 | }, |
| 393 | + { |
| 394 | + "cell_type": "markdown", |
| 395 | + "id": "5be34a36", |
| 396 | + "metadata": {}, |
| 397 | + "source": [ |
| 398 | + "### DBG weighted refined" |
| 399 | + ] |
| 400 | + }, |
385 | 401 | { |
386 | 402 | "cell_type": "code", |
387 | 403 | "execution_count": null, |
388 | 404 | "id": "40d4a8be", |
389 | 405 | "metadata": {}, |
390 | 406 | "outputs": [], |
| 407 | + "source": [ |
| 408 | + "assembler = Assembler(\n", |
| 409 | + " mode=\"dbg_weighted\",\n", |
| 410 | + " kmer_size=6,\n", |
| 411 | + " min_overlap=2,\n", |
| 412 | + " size_threshold=10,\n", |
| 413 | + " min_weight=2,\n", |
| 414 | + " refine_rounds=10\n", |
| 415 | + ")" |
| 416 | + ] |
| 417 | + }, |
| 418 | + { |
| 419 | + "cell_type": "code", |
| 420 | + "execution_count": null, |
| 421 | + "id": "f0e04211", |
| 422 | + "metadata": {}, |
| 423 | + "outputs": [], |
| 424 | + "source": [ |
| 425 | + "scaffolds = assembler.run(\n", |
| 426 | + " sequences=sequences, \n", |
| 427 | + " df_full=data_filtered\n", |
| 428 | + ")" |
| 429 | + ] |
| 430 | + }, |
| 431 | + { |
| 432 | + "cell_type": "code", |
| 433 | + "execution_count": null, |
| 434 | + "id": "d447b512", |
| 435 | + "metadata": {}, |
| 436 | + "outputs": [], |
| 437 | + "source": [ |
| 438 | + "print(len(scaffolds))" |
| 439 | + ] |
| 440 | + }, |
| 441 | + { |
| 442 | + "cell_type": "code", |
| 443 | + "execution_count": null, |
| 444 | + "id": "238dec65", |
| 445 | + "metadata": {}, |
| 446 | + "outputs": [], |
| 447 | + "source": [ |
| 448 | + "# show me all the scaffolds\n", |
| 449 | + "for scaffold in scaffolds:\n", |
| 450 | + " print(scaffold)" |
| 451 | + ] |
| 452 | + }, |
| 453 | + { |
| 454 | + "cell_type": "code", |
| 455 | + "execution_count": null, |
| 456 | + "id": "ec9903d3", |
| 457 | + "metadata": {}, |
| 458 | + "outputs": [], |
| 459 | + "source": [ |
| 460 | + "# plot the distribution of scaffold lengths\n", |
| 461 | + "scaffold_lengths = [len(scaffold) for scaffold in scaffolds]\n", |
| 462 | + "plt.figure(figsize=(10, 6))\n", |
| 463 | + "sns.histplot(scaffold_lengths, bins=20, kde=False)\n", |
| 464 | + "plt.title(\"Distribution of scaffold lengths\")\n", |
| 465 | + "plt.xlabel(\"Scaffold length\")\n", |
| 466 | + "plt.ylabel(\"Frequency\")\n", |
| 467 | + "plt.show()" |
| 468 | + ] |
| 469 | + }, |
| 470 | + { |
| 471 | + "cell_type": "code", |
| 472 | + "execution_count": null, |
| 473 | + "id": "51bc2815", |
| 474 | + "metadata": {}, |
| 475 | + "outputs": [], |
| 476 | + "source": [] |
| 477 | + }, |
| 478 | + { |
| 479 | + "cell_type": "markdown", |
| 480 | + "id": "39bf42e1", |
| 481 | + "metadata": {}, |
| 482 | + "source": [ |
| 483 | + "## Calculate coverage" |
| 484 | + ] |
| 485 | + }, |
| 486 | + { |
| 487 | + "cell_type": "code", |
| 488 | + "execution_count": null, |
| 489 | + "id": "5a32472c", |
| 490 | + "metadata": {}, |
| 491 | + "outputs": [], |
| 492 | + "source": [ |
| 493 | + "from Bio import Align\n", |
| 494 | + "import numpy as np\n", |
| 495 | + "\n", |
| 496 | + "def calculate_fuzzy_coverage(reference_seq, peptides, min_identity=0.9):\n", |
| 497 | + " # 1. Clean reference\n", |
| 498 | + " clean_ref = reference_seq.replace(\"-\", \"\")\n", |
| 499 | + " ref_len = len(clean_ref)\n", |
| 500 | + " \n", |
| 501 | + " if ref_len == 0:\n", |
| 502 | + " return 0.0, 0, 0\n", |
| 503 | + "\n", |
| 504 | + " # 2. Mask setup\n", |
| 505 | + " coverage_mask = np.zeros(ref_len, dtype=bool)\n", |
| 506 | + " \n", |
| 507 | + " # 3. Aligner Setup\n", |
| 508 | + " aligner = Align.PairwiseAligner()\n", |
| 509 | + " aligner.mode = 'local'\n", |
| 510 | + " aligner.open_gap_score = -10\n", |
| 511 | + " aligner.extend_gap_score = -1\n", |
| 512 | + " \n", |
| 513 | + " for pep in peptides:\n", |
| 514 | + " if len(pep) > ref_len or len(pep) == 0:\n", |
| 515 | + " continue\n", |
| 516 | + " \n", |
| 517 | + " alignments = aligner.align(clean_ref, pep)\n", |
| 518 | + " \n", |
| 519 | + " if not alignments:\n", |
| 520 | + " continue\n", |
| 521 | + " \n", |
| 522 | + " best_aln = alignments[0]\n", |
| 523 | + " \n", |
| 524 | + " # FIX: Calculate identity using matches / peptide length\n", |
| 525 | + " # 'counts().identities' returns the number of matching residues\n", |
| 526 | + " matches = best_aln.counts().identities\n", |
| 527 | + " identity = matches / len(pep)\n", |
| 528 | + " \n", |
| 529 | + " if identity >= min_identity:\n", |
| 530 | + " # best_aln.aligned[0] contains the (start, end) tuples for the reference (first seq)\n", |
| 531 | + " for start, end in best_aln.aligned[0]:\n", |
| 532 | + " coverage_mask[start:end] = True\n", |
| 533 | + "\n", |
| 534 | + " # 4. Stats\n", |
| 535 | + " covered_count = np.sum(coverage_mask)\n", |
| 536 | + " coverage_pct = (covered_count / ref_len) * 100\n", |
| 537 | + " \n", |
| 538 | + " return coverage_pct, covered_count, ref_len\n", |
| 539 | + "\n", |
| 540 | + "# --- Example Usage with your data ---\n", |
| 541 | + "\n", |
| 542 | + "reference_data = {\n", |
| 543 | + " \"heavy\": \"EVQLVESGGGLVKPGGSLKLSCAAS--------MSWVRQTPEKRLEWVAT--------SYPDSMKGRFTVSRDSAKNTLYLQMSSLRSEDTAMYY------------GQGTTLTVSSAKTTPPSV\",\n", |
| 544 | + " \"light\": \"DVVLTQTPLSLPVNLGDQASLSCKST-----------LDWYVQKPGQSPQPLLY---NRFSGVPDRFSGSGSGTDFTLKLTRVEAEDLGLYY-----------GSGTNLELRRADAAPTVS\"\n", |
| 545 | + "}\n", |
| 546 | + "\n", |
| 547 | + "# Example peptides (mix of perfect and slightly mutated for testing)\n", |
| 548 | + "# In real case, this list comes from your filtered InstaNovo output\n", |
| 549 | + "\n", |
| 550 | + "\n", |
| 551 | + "print(f\"{'Chain':<10} | {'Cov %':<10} | {'Residues':<10}\")\n", |
| 552 | + "print(\"-\" * 35)\n", |
| 553 | + "\n", |
| 554 | + "for chain_name, ref_seq in reference_data.items():\n", |
| 555 | + " cov_pct, cov_res, total_res = calculate_fuzzy_coverage(ref_seq, scaffold, min_identity=0.85)\n", |
| 556 | + " print(f\"{chain_name:<10} | {cov_pct:6.2f}% | {cov_res}/{total_res}\")" |
| 557 | + ] |
| 558 | + }, |
| 559 | + { |
| 560 | + "cell_type": "markdown", |
| 561 | + "id": "c5bb4bc7", |
| 562 | + "metadata": {}, |
391 | 563 | "source": [] |
392 | 564 | } |
393 | 565 | ], |
|
0 commit comments