|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 9, |
| 5 | + "execution_count": 3, |
6 | 6 | "id": "5e322d83", |
7 | 7 | "metadata": {}, |
8 | 8 | "outputs": [], |
|
26 | 26 | }, |
27 | 27 | { |
28 | 28 | "cell_type": "code", |
29 | | - "execution_count": 2, |
| 29 | + "execution_count": null, |
30 | 30 | "id": "804f83bc", |
31 | 31 | "metadata": {}, |
32 | 32 | "outputs": [], |
|
35 | 35 | "\n", |
36 | 36 | "petrack = PETrackII()\n", |
37 | 37 | "petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"A\", count=2) # peak_1\n", |
38 | | - "petrack.add_loc(b\"chr1\", 70, 270, barcode=b\"A\", count=1) # peak_2\n", |
39 | | - "petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"B\", count=3) # peak_2\n", |
| 38 | + "petrack.add_loc(b\"chr1\", 70, 270, barcode=b\"A\", count=1) # peak_1&2\n", |
| 39 | + "petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"B\", count=3) # peak_1\n", |
40 | 40 | "petrack.add_loc(b\"chr1\", 175, 325, barcode=b\"C\", count=4) # peak_2\n", |
41 | 41 | "petrack.finalize()\n", |
42 | 42 | "\n", |
43 | 43 | "regions = Regions()\n", |
44 | | - "regions.add_loc(b\"chr1\", 0, 100) # peak_1\n", |
45 | | - "regions.add_loc(b\"chr1\", 200, 300) # peak_2\n", |
46 | | - "regions.add_loc(b\"chr1\", 500, 600) # peak_3" |
| 44 | + "regions.add_loc(b\"chr1\", 0, 100) # peak_1 A:3 B:3 C:0\n", |
| 45 | + "regions.add_loc(b\"chr1\", 200, 300) # peak_2 A:1 B:0 C:4\n", |
| 46 | + "regions.add_loc(b\"chr1\", 500, 600) # peak_3 A:0 B:0 C:0" |
47 | 47 | ] |
48 | 48 | }, |
49 | 49 | { |
50 | 50 | "cell_type": "code", |
51 | | - "execution_count": 11, |
| 51 | + "execution_count": 48, |
52 | 52 | "id": "74fff8ce", |
53 | 53 | "metadata": {}, |
54 | 54 | "outputs": [], |
|
118 | 118 | }, |
119 | 119 | { |
120 | 120 | "cell_type": "code", |
121 | | - "execution_count": 12, |
| 121 | + "execution_count": 49, |
122 | 122 | "id": "744ba5ee", |
123 | 123 | "metadata": {}, |
124 | 124 | "outputs": [ |
|
142 | 142 | }, |
143 | 143 | { |
144 | 144 | "cell_type": "code", |
145 | | - "execution_count": 20, |
| 145 | + "execution_count": 51, |
146 | 146 | "id": "a83976eb", |
147 | 147 | "metadata": {}, |
148 | 148 | "outputs": [], |
|
226 | 226 | }, |
227 | 227 | { |
228 | 228 | "cell_type": "code", |
229 | | - "execution_count": 23, |
| 229 | + "execution_count": 52, |
230 | 230 | "id": "b7dcf943", |
231 | 231 | "metadata": {}, |
232 | 232 | "outputs": [ |
|
306 | 306 | "\n", |
307 | 307 | " frag_idx = 0\n", |
308 | 308 | " local_peak_idx = 0\n", |
309 | | - " frag_iter = iter(fragment_locs).__next__ # fragment\n", |
310 | | - " peak_iter = iter(regions_c).__next__ # peaks\n", |
311 | 309 | " frag_len = len(fragment_locs)\n", |
312 | 310 | " peak_len = len(regions_c)\n", |
313 | | - " frag = frag_iter() # (l,r,c)\n", |
| 311 | + " frag = fragment_locs[frag_idx]\n", |
314 | 312 | " remaining_frag_len = frag_len - 1\n", |
315 | | - " peak = peak_iter() # (start, end)\n", |
| 313 | + " peak = regions_c[local_peak_idx] # (start, end)\n", |
316 | 314 | " remaining_peak_len = peak_len - 1\n", |
317 | 315 | "\n", |
318 | 316 | " # inside two_pointer_sweep, replace only the while-loop block with this\n", |
319 | | - "\n", |
| 317 | + " back_trace_frag = 0\n", |
320 | 318 | " while True:\n", |
321 | 319 | " frag_start, frag_end = frag[0], frag[1]\n", |
322 | 320 | " peak_start, peak_end = peak[0], peak[1]\n", |
323 | 321 | "\n", |
324 | | - " # peak inside fragment\n", |
325 | | - " if frag_start <= peak_start and peak_end <= frag_end:\n", |
| 322 | + " # peak overlaps fragment\n", |
| 323 | + " if frag_start <= peak_end and peak_start <= frag_end:\n", |
326 | 324 | " bc_id = int(fragment_barcodes[frag_idx])\n", |
327 | 325 | " row_id = barcode_id_to_row.get(bc_id, -1)\n", |
328 | 326 | " if row_id >= 0:\n", |
329 | 327 | " rows.append(int(row_id))\n", |
330 | 328 | " columns.append(local_peak[local_peak_idx])\n", |
331 | 329 | " data.append(int(frag[2]))\n", |
| 330 | + " #print(f\"peak {local_peak_idx} overlaps with fragment {frag_idx}, row_id: {row_id}, count: {frag[2]}\")\n", |
332 | 331 | "\n", |
333 | | - " # move to next peak (same fragment may contain multiple peaks)\n", |
334 | | - " if remaining_peak_len > 0:\n", |
335 | | - " peak = peak_iter()\n", |
336 | | - " remaining_peak_len -= 1\n", |
337 | | - " local_peak_idx += 1\n", |
| 332 | + " if frag_end > peak_end:\n", |
| 333 | + " # overhang case, perhaps the frag can still contain next peak(s)\n", |
| 334 | + " back_trace_frag += 1\n", |
| 335 | + " remaining_frag_len -= 1\n", |
| 336 | + " if remaining_frag_len >= 0:\n", |
| 337 | + " frag_idx += 1\n", |
| 338 | + " frag = fragment_locs[frag_idx]\n", |
338 | 339 | " continue\n", |
339 | 340 | " else:\n", |
340 | 341 | " break\n", |
341 | 342 | "\n", |
342 | | - " # if fragment ends before peak starts -> advance fragment\n", |
343 | | - " if frag_end < peak_start:\n", |
| 343 | + " # if fragment ends before peak ends -> advance fragment\n", |
| 344 | + " if frag_end < peak_end:\n", |
344 | 345 | " remaining_frag_len -= 1\n", |
345 | 346 | " if remaining_frag_len >= 0:\n", |
346 | | - " frag = frag_iter()\n", |
347 | 347 | " frag_idx += 1\n", |
| 348 | + " frag = fragment_locs[frag_idx]\n", |
348 | 349 | " else:\n", |
349 | 350 | " break\n", |
350 | 351 | " else:\n", |
351 | | - " # peak starts before fragment ends but not contained -> advance peak\n", |
352 | | - " if remaining_peak_len:\n", |
353 | | - " peak = peak_iter()\n", |
354 | | - " remaining_peak_len -= 1\n", |
| 352 | + " # advance peak\n", |
| 353 | + " remaining_peak_len -= 1\n", |
| 354 | + " if remaining_peak_len >= 0:\n", |
355 | 355 | " local_peak_idx += 1\n", |
| 356 | + " peak = regions_c[local_peak_idx]\n", |
| 357 | + " # if earlier fragments overhung the previous peak, rewind the fragment pointer so they are re-checked against this new peak\n", |
| 358 | + " if back_trace_frag > 0:\n", |
| 359 | + " frag_idx -= back_trace_frag\n", |
| 360 | + " remaining_frag_len += back_trace_frag\n", |
| 361 | + " frag = fragment_locs[frag_idx]\n", |
| 362 | + " back_trace_frag = 0\n", |
356 | 363 | " else:\n", |
357 | 364 | " break\n", |
| 365 | + " #print(f\"remaining fragments: {remaining_frag_len}, remaining peaks: {remaining_peak_len}\")\n", |
358 | 366 | "\n", |
359 | 367 | " n_peaks = peak_counter\n", |
360 | 368 | " x = sparse.coo_matrix((data,(rows,columns)),shape=(n_cells,n_peaks)).tocsr()\n", |
|
368 | 376 | }, |
369 | 377 | { |
370 | 378 | "cell_type": "code", |
371 | | - "execution_count": 37, |
| 379 | + "execution_count": 54, |
372 | 380 | "id": "2e4bd05d", |
373 | 381 | "metadata": {}, |
374 | 382 | "outputs": [ |
375 | 383 | { |
376 | 384 | "name": "stdout", |
377 | 385 | "output_type": "stream", |
378 | 386 | "text": [ |
| 387 | + "peak 0 overlaps with fragment 0, row_id: 0, count: 2\n", |
| 388 | + "peak 0 overlaps with fragment 1, row_id: 1, count: 3\n", |
| 389 | + "peak 0 overlaps with fragment 2, row_id: 0, count: 1\n", |
| 390 | + "peak 1 overlaps with fragment 2, row_id: 0, count: 1\n", |
| 391 | + "peak 1 overlaps with fragment 3, row_id: 2, count: 4\n", |
| 392 | + "remaining fragments: -1, remaining peaks: 1\n", |
379 | 393 | "AnnData object with n_obs × n_vars = 3 × 3\n", |
380 | 394 | " var: 'chrom', 'start', 'end'\n", |
381 | | - "[[2 0 0]\n", |
382 | | - " [0 0 0]\n", |
383 | | - " [0 0 0]]\n" |
| 395 | + "[[3 1 0]\n", |
| 396 | + " [3 0 0]\n", |
| 397 | + " [0 4 0]]\n" |
384 | 398 | ] |
385 | 399 | } |
386 | 400 | ], |
|
427 | 441 | ], |
428 | 442 | "metadata": { |
429 | 443 | "kernelspec": { |
430 | | - "display_name": ".venv (3.9.6)", |
| 444 | + "display_name": ".venv (3.12.8)", |
431 | 445 | "language": "python", |
432 | 446 | "name": "python3" |
433 | 447 | }, |
|
441 | 455 | "name": "python", |
442 | 456 | "nbconvert_exporter": "python", |
443 | 457 | "pygments_lexer": "ipython3", |
444 | | - "version": "3.9.6" |
| 458 | + "version": "3.12.8" |
445 | 459 | } |
446 | 460 | }, |
447 | 461 | "nbformat": 4, |
|
0 commit comments