|
48 | 48 | "outputs": [], |
49 | 49 | "source": [ |
50 | 50 | "# read a pre cleaned data file\n", |
51 | | - "#data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" |
| 51 | + "# data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" |
52 | 52 | ] |
53 | 53 | }, |
54 | 54 | { |
|
62 | 62 | "\n", |
63 | 63 | "import re\n", |
64 | 64 | "\n", |
65 | | - "file_name = 'bsa'\n", |
| 65 | + "file_name = \"bsa\"\n", |
66 | 66 | "\n", |
67 | | - "data = pd.read_csv(f'../inputs/{file_name}.csv'.format(file_name=file_name))\n", |
| 67 | + "data = pd.read_csv(f\"../inputs/{file_name}.csv\".format(file_name=file_name))\n", |
68 | 68 | "\n", |
69 | 69 | "data[\"log_probs\"] = data[\"log_probs\"].replace(-1, -10)\n", |
70 | 70 | "\n", |
|
103 | 103 | "repo_folder = Path(\"../\")\n", |
104 | 104 | "\n", |
105 | 105 | "filtered_psms = instanexus.preprocessing.filter_contaminants(\n", |
106 | | - " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", |
107 | | - " )\n", |
| 106 | + " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", |
| 107 | + ")\n", |
108 | 108 | "\n", |
109 | 109 | "data = data[data[\"preds\"].isin(filtered_psms)]" |
110 | 110 | ] |
|
157 | 157 | "source": [ |
158 | 158 | "assembler = Assembler(\n", |
159 | 159 | " mode=\"dbg_weighted\",\n", |
160 | | - " kmer_size=7, \n", |
161 | | - " size_threshold=0, \n", |
162 | | - " min_weight=2, # filter low-weight edges\n", |
163 | | - " refine_rounds=3, # optional iterative refinement\n", |
| 160 | + " kmer_size=7,\n", |
| 161 | + " size_threshold=0,\n", |
| 162 | + " min_weight=2, # filter low-weight edges\n", |
| 163 | + " refine_rounds=3, # optional iterative refinement\n", |
164 | 164 | ")" |
165 | 165 | ] |
166 | 166 | }, |
|
171 | 171 | "metadata": {}, |
172 | 172 | "outputs": [], |
173 | 173 | "source": [ |
174 | | - "scaffolds_dbg_w = assembler.run(sequences, output_folder=output_folder, protein_norm=None)" |
| 174 | + "scaffolds_dbg_w = assembler.run(\n", |
| 175 | + " sequences, output_folder=output_folder, protein_norm=None\n", |
| 176 | + ")" |
175 | 177 | ] |
176 | 178 | }, |
177 | 179 | { |
|
241 | 243 | "metadata": {}, |
242 | 244 | "outputs": [], |
243 | 245 | "source": [ |
244 | | - "mapped_contigs = map.process_protein_contigs_scaffold(scaffolds_dbg_w, protein_norm, max_mismatches = 10, min_identity = 0.8)" |
| 246 | + "mapped_contigs = map.process_protein_contigs_scaffold(\n", |
| 247 | + " scaffolds_dbg_w, protein_norm, max_mismatches=10, min_identity=0.8\n", |
| 248 | + ")" |
245 | 249 | ] |
246 | 250 | }, |
247 | 251 | { |
|
337 | 341 | "assembler_dbgx = Assembler(\n", |
338 | 342 | " mode=\"dbgX\",\n", |
339 | 343 | " kmer_size=7,\n", |
340 | | - " size_threshold=10, \n", |
341 | | - " min_weight=2, \n", |
| 344 | + " size_threshold=10,\n", |
| 345 | + " min_weight=2,\n", |
342 | 346 | ")" |
343 | 347 | ] |
344 | 348 | }, |
|
350 | 354 | "outputs": [], |
351 | 355 | "source": [ |
352 | 356 | "scaffolds_dbgx = assembler_dbgx.run(\n", |
353 | | - " sequences=sequences,\n", |
354 | | - " output_folder=output_folder,\n", |
355 | | - " protein_norm=None\n", |
| 357 | + " sequences=sequences, output_folder=output_folder, protein_norm=None\n", |
356 | 358 | ")" |
357 | 359 | ] |
358 | 360 | }, |
|
363 | 365 | "metadata": {}, |
364 | 366 | "outputs": [], |
365 | 367 | "source": [ |
366 | | - "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(scaffolds_dbgx, protein_norm, max_mismatches = 10, min_identity = 0.8)" |
| 368 | + "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(\n", |
| 369 | + " scaffolds_dbgx, protein_norm, max_mismatches=10, min_identity=0.8\n", |
| 370 | + ")" |
367 | 371 | ] |
368 | 372 | }, |
369 | 373 | { |
|
426 | 430 | " mode=\"fusion\",\n", |
427 | 431 | " kmer_size=7,\n", |
428 | 432 | " size_threshold=10,\n", |
429 | | - " min_overlap=3, \n", |
| 433 | + " min_overlap=3,\n", |
430 | 434 | " min_weight=2,\n", |
431 | 435 | ")" |
432 | 436 | ] |
|
449 | 453 | "outputs": [], |
450 | 454 | "source": [ |
451 | 455 | "scaffolds_fusion = assembler_fusion.run(\n", |
452 | | - " sequences=sequences,\n", |
453 | | - " output_folder=output_folder_fusion,\n", |
454 | | - " protein_norm=None\n", |
| 456 | + " sequences=sequences, output_folder=output_folder_fusion, protein_norm=None\n", |
455 | 457 | ")" |
456 | 458 | ] |
457 | 459 | }, |
|
462 | 464 | "metadata": {}, |
463 | 465 | "outputs": [], |
464 | 466 | "source": [ |
465 | | - "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8)\n", |
| 467 | + "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(\n", |
| 468 | + " scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8\n", |
| 469 | + ")\n", |
466 | 470 | "\n", |
467 | 471 | "# top 20\n", |
468 | 472 | "mapped_scaffolds_fusion = mapped_scaffolds_fusion[:20]" |
|
0 commit comments