Skip to content

Commit e777219

Browse files
committed
two pointer sweep logic fix
1 parent 943d982 commit e777219

5 files changed

Lines changed: 206 additions & 95 deletions

File tree

.github/workflows/publish-to-gh-with-sphinx.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Build and Publish Sphinx Documentation
22

33
on:
44
push:
5-
branches: ["master"]
5+
branches: ["main"]
66
# Allows you to run this workflow manually from the Actions tab
77
workflow_dispatch:
88

MACS3/Signal/PairedEndTrack.py

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,11 +1832,18 @@ def _anndata_ncls(self, regions):
18321832
return adata
18331833

18341834
def _two_pointer_sweep(self,regions):
1835+
barcode_items = list(self.barcode_dict.items())
1836+
barcodes = [b.decode() if isinstance(b, (bytes, bytearray)) else str(b) for b, _ in barcode_items]
1837+
barcode_ids = np.array([i for _, i in barcode_items], dtype=np.int64)
1838+
1839+
id_map = np.full(barcode_ids.max() + 1, -1, dtype=np.int64)
1840+
id_map[barcode_ids] = np.arange(len(barcode_ids), dtype=np.int64)
1841+
n_cells = len(barcodes)
18351842

18361843
# # array of locataions and counts
1837-
# fragment_locs = self.locations[chrom][:self.size[chrom]]
1844+
# fragment_locs = petrack.locations[chrom][:petrack.size[chrom]]
18381845
# # arrray of barcodes
1839-
# fragment_barcodes = self.barcodes[chrom][:self.size[chrom]]
1846+
# fragment_barcodes = petrack.barcodes[chrom][:petrack.size[chrom]]
18401847

18411848
peak_names = []
18421849
peak_data = []
@@ -1853,9 +1860,7 @@ def _two_pointer_sweep(self,regions):
18531860
regions.sort()
18541861
peak_counter = 0
18551862
for chrom in regions.regions.keys():
1856-
size = self.size[chrom]
1857-
locs = self.locations[chrom]
1858-
#barcodes = self.barcodes
1863+
#barcodes = petrack.barcodes
18591864
regions_c = regions.regions[chrom]
18601865
local_peak = []
18611866
### regions empty skip
@@ -1865,61 +1870,67 @@ def _two_pointer_sweep(self,regions):
18651870
peak_data.append((chrom.decode(),start,end))
18661871
local_peak.append(peak_counter - 1)
18671872

1868-
fragment_locs = self.locations[chrom][:self.size[chrom]]
1869-
fragment_barcodes = self.barcodes[chrom][:self.size[chrom]]
1873+
fragment_locs = self.locations[chrom]
1874+
fragment_barcodes = self.barcodes[chrom]
18701875

18711876
if len(fragment_locs) == 0:
18721877
continue
18731878

18741879
frag_idx = 0
18751880
local_peak_idx = 0
1876-
frag_iter = iter(fragment_locs).__next__ # fragment
1877-
peak_iter = iter(regions_c).__next__ # peaks
18781881
frag_len = len(fragment_locs)
18791882
peak_len = len(regions_c)
1880-
frag = frag_iter() # (l,r,c)
1883+
frag = fragment_locs[frag_idx]
18811884
remaining_frag_len = frag_len - 1
1882-
peak = peak_iter() # (start, end)
1885+
peak = regions_c[local_peak_idx] # (start, end)
18831886
remaining_peak_len = peak_len - 1
18841887

18851888
# inside two_pointer_sweep, replace only the while-loop block with this
1886-
1889+
back_trace_frag = 0
18871890
while True:
18881891
frag_start, frag_end = frag[0], frag[1]
18891892
peak_start, peak_end = peak[0], peak[1]
18901893

1891-
# peak inside fragment
1892-
if frag_start <= peak_start and peak_end <= frag_end:
1894+
# peak overlap fragment
1895+
if frag_start <= peak_end and peak_start <= frag_end:
18931896
bc_id = int(fragment_barcodes[frag_idx])
18941897
row_id = barcode_id_to_row.get(bc_id, -1)
18951898
if row_id >= 0:
18961899
rows.append(int(row_id))
18971900
columns.append(local_peak[local_peak_idx])
18981901
data.append(int(frag[2]))
18991902

1900-
# move to next peak (same fragment may contain multiple peaks)
1901-
if remaining_peak_len > 0:
1902-
peak = peak_iter()
1903-
remaining_peak_len -= 1
1904-
local_peak_idx += 1
1903+
if frag_end > peak_end:
1904+
# overhang case, perhaps the frag can still contain next peak(s)
1905+
back_trace_frag += 1
1906+
remaining_frag_len -= 1
1907+
if remaining_frag_len >= 0:
1908+
frag_idx += 1
1909+
frag = fragment_locs[frag_idx]
19051910
continue
19061911
else:
19071912
break
19081913

1909-
# if fragment ends before peak starts -> advance fragment
1910-
if frag_end < peak_start:
1914+
# if fragment ends before peak ends -> advance fragment
1915+
if frag_end < peak_end:
19111916
remaining_frag_len -= 1
19121917
if remaining_frag_len >= 0:
1913-
frag = frag_iter()
19141918
frag_idx += 1
1919+
frag = fragment_locs[frag_idx]
19151920
else:
19161921
break
19171922
else:
1918-
# peak starts before fragment ends but not contained -> advance peak
1919-
if remaining_peak_len:
1920-
peak = peak_iter()
1921-
remaining_peak_len -= 1
1923+
# advance peak
1924+
remaining_peak_len -= 1
1925+
if remaining_peak_len >= 0:
19221926
local_peak_idx += 1
1927+
peak = regions_c[local_peak_idx]
1928+
# we will also check if backtrace > 0 or not, if so, we need to move back the fragment pointer to check those peaks that were skipped due to overhang
1929+
if back_trace_frag > 0:
1930+
frag_idx -= back_trace_frag
1931+
remaining_frag_len += back_trace_frag
1932+
frag = fragment_locs[frag_idx]
1933+
back_trace_frag = 0
19231934
else:
19241935
break
19251936

@@ -1928,8 +1939,11 @@ def _two_pointer_sweep(self,regions):
19281939
obs = pd.DataFrame(index=barcodes)
19291940
#obs['n_fragments_in_peaks'] = np.asarray(x.sum(axis=1)).ravel()
19301941
var = pd.DataFrame(peak_data, columns=['chrom', 'start', 'end'], index=peak_names)
1931-
adata = ad.AnnData(X=x, obs=obs, var=var)
1932-
return adata
1942+
1943+
adata_peaks_loop = ad.AnnData(X=x, obs=obs, var=var)
1944+
return adata_peaks_loop
1945+
1946+
19331947

19341948
def return_anndata(self, regions, method = "ncls"):
19351949
"""

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,13 @@ MACS3 project is sponsored by [![CZI's Essential Open Source Software for Scienc
5252
2008: [Model-based Analysis of ChIP-Seq
5353
(MACS)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137)
5454

55+
## Note
56+
57+
Now the default branch of MACS3 has been renamed to 'main'. If you are still using old 'master' branch in your cloned Git repository for MACS3, please use the following command to rename it:
58+
59+
```
60+
git branch -m master main
61+
git fetch origin
62+
git branch -u origin/main main
63+
git remote set-head origin -a
64+
```

notebooks/500-PBMC-test/anndata_implemetation_dev.ipynb

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 9,
5+
"execution_count": 3,
66
"id": "5e322d83",
77
"metadata": {},
88
"outputs": [],
@@ -26,7 +26,7 @@
2626
},
2727
{
2828
"cell_type": "code",
29-
"execution_count": 2,
29+
"execution_count": null,
3030
"id": "804f83bc",
3131
"metadata": {},
3232
"outputs": [],
@@ -35,20 +35,20 @@
3535
"\n",
3636
"petrack = PETrackII()\n",
3737
"petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"A\", count=2) # peak_1\n",
38-
"petrack.add_loc(b\"chr1\", 70, 270, barcode=b\"A\", count=1) # peak_2\n",
39-
"petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"B\", count=3) # peak_2\n",
38+
"petrack.add_loc(b\"chr1\", 70, 270, barcode=b\"A\", count=1) # peak_1&2\n",
39+
"petrack.add_loc(b\"chr1\", 0, 100, barcode=b\"B\", count=3) # peak_1\n",
4040
"petrack.add_loc(b\"chr1\", 175, 325, barcode=b\"C\", count=4) # peak_2\n",
4141
"petrack.finalize()\n",
4242
"\n",
4343
"regions = Regions()\n",
44-
"regions.add_loc(b\"chr1\", 0, 100) # peak_1\n",
45-
"regions.add_loc(b\"chr1\", 200, 300) # peak_2\n",
46-
"regions.add_loc(b\"chr1\", 500, 600) # peak_3"
44+
"regions.add_loc(b\"chr1\", 0, 100) # peak_1 A:3 B:3 C:0\n",
45+
"regions.add_loc(b\"chr1\", 200, 300) # peak_2 A:1 B:0 C:4\n",
46+
"regions.add_loc(b\"chr1\", 500, 600) # peak_3 A:0 B:0 C:0"
4747
]
4848
},
4949
{
5050
"cell_type": "code",
51-
"execution_count": 11,
51+
"execution_count": 48,
5252
"id": "74fff8ce",
5353
"metadata": {},
5454
"outputs": [],
@@ -118,7 +118,7 @@
118118
},
119119
{
120120
"cell_type": "code",
121-
"execution_count": 12,
121+
"execution_count": 49,
122122
"id": "744ba5ee",
123123
"metadata": {},
124124
"outputs": [
@@ -142,7 +142,7 @@
142142
},
143143
{
144144
"cell_type": "code",
145-
"execution_count": 20,
145+
"execution_count": 51,
146146
"id": "a83976eb",
147147
"metadata": {},
148148
"outputs": [],
@@ -226,7 +226,7 @@
226226
},
227227
{
228228
"cell_type": "code",
229-
"execution_count": 23,
229+
"execution_count": 52,
230230
"id": "b7dcf943",
231231
"metadata": {},
232232
"outputs": [
@@ -306,55 +306,63 @@
306306
"\n",
307307
" frag_idx = 0\n",
308308
" local_peak_idx = 0\n",
309-
" frag_iter = iter(fragment_locs).__next__ # fragment\n",
310-
" peak_iter = iter(regions_c).__next__ # peaks\n",
311309
" frag_len = len(fragment_locs)\n",
312310
" peak_len = len(regions_c)\n",
313-
" frag = frag_iter() # (l,r,c)\n",
311+
" frag = fragment_locs[frag_idx]\n",
314312
" remaining_frag_len = frag_len - 1\n",
315-
" peak = peak_iter() # (start, end)\n",
313+
" peak = regions_c[local_peak_idx] # (start, end)\n",
316314
" remaining_peak_len = peak_len - 1\n",
317315
"\n",
318316
" # inside two_pointer_sweep, replace only the while-loop block with this\n",
319-
"\n",
317+
" back_trace_frag = 0\n",
320318
" while True:\n",
321319
" frag_start, frag_end = frag[0], frag[1]\n",
322320
" peak_start, peak_end = peak[0], peak[1]\n",
323321
"\n",
324-
" # peak inside fragment\n",
325-
" if frag_start <= peak_start and peak_end <= frag_end:\n",
322+
" # peak overlap fragment\n",
323+
" if frag_start <= peak_end and peak_start <= frag_end:\n",
326324
" bc_id = int(fragment_barcodes[frag_idx])\n",
327325
" row_id = barcode_id_to_row.get(bc_id, -1)\n",
328326
" if row_id >= 0:\n",
329327
" rows.append(int(row_id))\n",
330328
" columns.append(local_peak[local_peak_idx])\n",
331329
" data.append(int(frag[2]))\n",
330+
" #print(f\"peak {local_peak_idx} overlaps with fragment {frag_idx}, row_id: {row_id}, count: {frag[2]}\")\n",
332331
"\n",
333-
" # move to next peak (same fragment may contain multiple peaks)\n",
334-
" if remaining_peak_len > 0:\n",
335-
" peak = peak_iter()\n",
336-
" remaining_peak_len -= 1\n",
337-
" local_peak_idx += 1\n",
332+
" if frag_end > peak_end:\n",
333+
" # overhang case, perhaps the frag can still contain next peak(s)\n",
334+
" back_trace_frag += 1\n",
335+
" remaining_frag_len -= 1\n",
336+
" if remaining_frag_len >= 0:\n",
337+
" frag_idx += 1\n",
338+
" frag = fragment_locs[frag_idx]\n",
338339
" continue\n",
339340
" else:\n",
340341
" break\n",
341342
"\n",
342-
" # if fragment ends before peak starts -> advance fragment\n",
343-
" if frag_end < peak_start:\n",
343+
" # if fragment ends before peak ends -> advance fragment\n",
344+
" if frag_end < peak_end:\n",
344345
" remaining_frag_len -= 1\n",
345346
" if remaining_frag_len >= 0:\n",
346-
" frag = frag_iter()\n",
347347
" frag_idx += 1\n",
348+
" frag = fragment_locs[frag_idx]\n",
348349
" else:\n",
349350
" break\n",
350351
" else:\n",
351-
" # peak starts before fragment ends but not contained -> advance peak\n",
352-
" if remaining_peak_len:\n",
353-
" peak = peak_iter()\n",
354-
" remaining_peak_len -= 1\n",
352+
" # advance peak\n",
353+
" remaining_peak_len -= 1\n",
354+
" if remaining_peak_len >= 0:\n",
355355
" local_peak_idx += 1\n",
356+
" peak = regions_c[local_peak_idx]\n",
357+
" # we will also check if backtrace > 0 or not, if so, we need to move back the fragment pointer to check those peaks that were skipped due to overhang\n",
358+
" if back_trace_frag > 0:\n",
359+
" frag_idx -= back_trace_frag\n",
360+
" remaining_frag_len += back_trace_frag\n",
361+
" frag = fragment_locs[frag_idx]\n",
362+
" back_trace_frag = 0\n",
356363
" else:\n",
357364
" break\n",
365+
" #print(f\"remaining fragments: {remaining_frag_len}, remaining peaks: {remaining_peak_len}\")\n",
358366
"\n",
359367
" n_peaks = peak_counter\n",
360368
" x = sparse.coo_matrix((data,(rows,columns)),shape=(n_cells,n_peaks)).tocsr()\n",
@@ -368,19 +376,25 @@
368376
},
369377
{
370378
"cell_type": "code",
371-
"execution_count": 37,
379+
"execution_count": 54,
372380
"id": "2e4bd05d",
373381
"metadata": {},
374382
"outputs": [
375383
{
376384
"name": "stdout",
377385
"output_type": "stream",
378386
"text": [
387+
"peak 0 overlaps with fragment 0, row_id: 0, count: 2\n",
388+
"peak 0 overlaps with fragment 1, row_id: 1, count: 3\n",
389+
"peak 0 overlaps with fragment 2, row_id: 0, count: 1\n",
390+
"peak 1 overlaps with fragment 2, row_id: 0, count: 1\n",
391+
"peak 1 overlaps with fragment 3, row_id: 2, count: 4\n",
392+
"remaining fragments: -1, remaining peaks: 1\n",
379393
"AnnData object with n_obs × n_vars = 3 × 3\n",
380394
" var: 'chrom', 'start', 'end'\n",
381-
"[[2 0 0]\n",
382-
" [0 0 0]\n",
383-
" [0 0 0]]\n"
395+
"[[3 1 0]\n",
396+
" [3 0 0]\n",
397+
" [0 4 0]]\n"
384398
]
385399
}
386400
],
@@ -427,7 +441,7 @@
427441
],
428442
"metadata": {
429443
"kernelspec": {
430-
"display_name": ".venv (3.9.6)",
444+
"display_name": ".venv (3.12.8)",
431445
"language": "python",
432446
"name": "python3"
433447
},
@@ -441,7 +455,7 @@
441455
"name": "python",
442456
"nbconvert_exporter": "python",
443457
"pygments_lexer": "ipython3",
444-
"version": "3.9.6"
458+
"version": "3.12.8"
445459
}
446460
},
447461
"nbformat": 4,

0 commit comments

Comments
 (0)