Skip to content

Commit 7331123

Browse files
committed
Fixed CLI errors.
1 parent 737a236 commit 7331123

File tree

4 files changed

+221
-47
lines changed

4 files changed

+221
-47
lines changed

pairtools/cli/dedup.py

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -383,7 +383,6 @@ def dedup_py(
383383
n_proc,
384384
**kwargs,
385385
):
386-
387386
sep = ast.literal_eval('"""' + sep + '"""')
388387
send_header_to_dedup = send_header_to in ["both", "dedup"]
389388
send_header_to_dup = send_header_to in ["both", "dups"]
@@ -485,6 +484,34 @@ def dedup_py(
485484
logger.warning(
486485
"Pairs file appears not to be sorted, dedup might produce wrong results."
487486
)
487+
488+
# Canonicalize column names for flexible matching
489+
column_names = headerops.extract_column_names(header)
490+
column_names = headerops.canonicalize_columns(column_names)
491+
492+
# Get column indices with fallbacks
493+
try:
494+
col1 = headerops.get_column_index(column_names, c1)
495+
col2 = headerops.get_column_index(column_names, c2)
496+
colp1 = headerops.get_column_index(column_names, p1)
497+
colp2 = headerops.get_column_index(column_names, p2)
498+
cols1 = headerops.get_column_index(column_names, s1)
499+
cols2 = headerops.get_column_index(column_names, s2)
500+
501+
# Handle extra column pairs
502+
extra_cols1 = []
503+
extra_cols2 = []
504+
if extra_col_pair is not None:
505+
for col1_spec, col2_spec in extra_col_pair:
506+
try:
507+
extra_cols1.append(headerops.get_column_index(column_names, col1_spec))
508+
extra_cols2.append(headerops.get_column_index(column_names, col2_spec))
509+
except ValueError:
510+
logger.warning(f"Extra column pair ({col1_spec}, {col2_spec}) not found in header, skipping")
511+
continue
512+
except ValueError as e:
513+
raise ValueError(f"Column error: {str(e)}") from e
514+
488515
header = headerops.append_new_pg(header, ID=UTIL_NAME, PN=UTIL_NAME)
489516
dups_header = header.copy()
490517
if keep_parent_id and len(dups_header) > 0:
@@ -502,38 +529,17 @@ def dedup_py(
502529
):
503530
outstream_unmapped.writelines((l + "\n" for l in header))
504531

505-
column_names = headerops.extract_column_names(header)
506-
extra_cols1 = []
507-
extra_cols2 = []
508-
if extra_col_pair is not None:
509-
for col1, col2 in extra_col_pair:
510-
extra_cols1.append(column_names[col1] if col1.isnumeric() else col1)
511-
extra_cols2.append(column_names[col2] if col2.isnumeric() else col2)
512-
513532
if backend == "cython":
514-
# warnings.warn(
515-
# "'cython' backend is deprecated and provided only"
516-
# " for backwards compatibility",
517-
# DeprecationWarning,
518-
# )
519-
extra_cols1 = [column_names.index(col) for col in extra_cols1]
520-
extra_cols2 = [column_names.index(col) for col in extra_cols2]
521-
c1 = column_names.index(c1)
522-
c2 = column_names.index(c2)
523-
p1 = column_names.index(p1)
524-
p2 = column_names.index(p2)
525-
s1 = column_names.index(s1)
526-
s2 = column_names.index(s2)
527533
streaming_dedup_cython(
528534
method,
529535
max_mismatch,
530536
sep,
531-
c1,
532-
c2,
533-
p1,
534-
p2,
535-
s1,
536-
s2,
537+
col1,
538+
col2,
539+
colp1,
540+
colp2,
541+
cols1,
542+
cols2,
537543
extra_cols1,
538544
extra_cols2,
539545
unmapped_chrom,
@@ -554,7 +560,7 @@ def dedup_py(
554560
method=method,
555561
mark_dups=mark_dups,
556562
max_mismatch=max_mismatch,
557-
extra_col_pairs=list(extra_col_pair),
563+
extra_col_pairs=list(zip(extra_cols1, extra_cols2)) if extra_cols1 else [],
558564
keep_parent_id=keep_parent_id,
559565
unmapped_chrom=unmapped_chrom,
560566
outstream=outstream,
@@ -563,12 +569,12 @@ def dedup_py(
563569
out_stat=out_stat,
564570
backend=backend,
565571
n_proc=n_proc,
566-
c1=c1,
567-
c2=c2,
568-
p1=p1,
569-
p2=p2,
570-
s1=s1,
571-
s2=s2,
572+
c1=col1,
573+
c2=col2,
574+
p1=colp1,
575+
p2=colp2,
576+
s1=cols1,
577+
s2=cols2,
572578
)
573579
else:
574580
raise ValueError("Unknown backend")

pairtools/cli/sort.py

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
import subprocess
77
import shutil
88
import warnings
9+
from .._logging import get_logger
910

1011
from ..lib import fileio, pairsam_format, headerops
1112
from . import cli, common_io_options
1213

14+
logger = get_logger()
1315
UTIL_NAME = "pairtools_sort"
1416

1517

@@ -58,6 +60,7 @@
5860
default=pairsam_format.COLUMNS_PAIRS[7],
5961
help=f"Pair type column; default {pairsam_format.COLUMNS_PAIRS[7]}"
6062
"[input format option]",
63+
required=False,
6164
)
6265
@click.option(
6366
"--extra-col",
@@ -157,7 +160,6 @@ def sort_py(
157160
compress_program,
158161
**kwargs,
159162
):
160-
161163
instream = fileio.auto_open(
162164
pairs_path,
163165
mode="r",
@@ -176,7 +178,6 @@ def sort_py(
176178
header = headerops.mark_header_as_sorted(header)
177179

178180
outstream.writelines((l + "\n" for l in header))
179-
180181
outstream.flush()
181182

182183
if compress_program == "auto":
@@ -191,16 +192,42 @@ def sort_py(
191192
compress_program = "gzip"
192193

193194
column_names = headerops.extract_column_names(header)
194-
columns = [c1, c2, p1, p2, pt] + list(extra_col)
195-
# Now generating the "-k <i>,<i><mode>" expressions for all columns.
196-
# If column name is in the default pairsam format and has an integer dtype there, do numerical sorting
195+
column_names = headerops.canonicalize_columns(column_names)
196+
197+
# Get column indices with fallbacks
198+
try:
199+
col1 = headerops.get_column_index(column_names, c1)
200+
col2 = headerops.get_column_index(column_names, c2)
201+
colp1 = headerops.get_column_index(column_names, p1)
202+
colp2 = headerops.get_column_index(column_names, p2)
203+
204+
# Make pair_type optional
205+
try:
206+
colpt = headerops.get_column_index(column_names, pt) if pt else None
207+
except ValueError:
208+
colpt = None
209+
210+
extra_cols = []
211+
for col in extra_col:
212+
try:
213+
extra_cols.append(headerops.get_column_index(column_names, col))
214+
except ValueError:
215+
logger.warning(f"Extra column {col} not found in header, skipping")
216+
continue
217+
except ValueError as e:
218+
raise ValueError(f"Column error: {str(e)}") from e
219+
220+
# Generate sort command columns
197221
cols = []
198-
for col in columns:
199-
colindex = int(col) if col.isnumeric() else column_names.index(col) + 1
222+
for i, col in enumerate([col1, colp1, col2, colp2, colpt] + extra_cols):
223+
if col is None:
224+
continue # Skip optional columns that weren't found
225+
dtype = pairsam_format.DTYPES_PAIRSAM.get(column_names[col], str)
200226
cols.append(
201-
f"-k {colindex},{colindex}{'n' if issubclass(pairsam_format.DTYPES_PAIRSAM.get(column_names[colindex-1], str), int) else ''}"
227+
f"-k {col+1},{col+1}{'n' if issubclass(dtype, int) else ''}"
202228
)
203229
cols = " ".join(cols)
230+
204231
command = rf"""
205232
/bin/bash -c 'export LC_COLLATE=C; export LANG=C; sort
206233
{cols}
@@ -210,9 +237,8 @@ def sort_py(
210237
{f'--temporary-directory={tmpdir}' if tmpdir else ''}
211238
-S {memory}
212239
{f'--compress-program={compress_program}' if compress_program else ''}'
213-
""".replace(
214-
"\n", " "
215-
)
240+
""".replace("\n", " ")
241+
216242
with subprocess.Popen(
217243
command, stdin=subprocess.PIPE, bufsize=-1, shell=True, stdout=outstream
218244
) as process:
@@ -224,7 +250,6 @@ def sort_py(
224250

225251
if instream != sys.stdin:
226252
instream.close()
227-
228253
if outstream != sys.stdout:
229254
outstream.close()
230255

pairtools/lib/headerops.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,51 @@
2222

2323

2424

25+
def canonicalize_columns(columns):
26+
"""Convert between common column name variants."""
27+
canonical_map = {
28+
'chr1': 'chrom1',
29+
'chr2': 'chrom2',
30+
'chrom1': 'chrom1', # Ensure identity mapping
31+
'chrom2': 'chrom2',
32+
'pt': 'pair_type',
33+
'pair_type': 'pair_type'
34+
}
35+
return [canonical_map.get(col.lower(), col) for col in columns]
36+
37+
def get_column_index(column_names, column_spec):
38+
"""Get column index with flexible name matching."""
39+
if isinstance(column_spec, int):
40+
if -len(column_names) <= column_spec < len(column_names):
41+
return column_spec % len(column_names) # Handle negative indices
42+
raise ValueError(f"Column index {column_spec} out of range")
43+
44+
if not isinstance(column_spec, (str, int)):
45+
raise AttributeError(f"Column spec must be string or integer, got {type(column_spec)}")
46+
47+
# Try direct match first
48+
try:
49+
return column_names.index(column_spec)
50+
except ValueError:
51+
pass
52+
53+
# Try canonical name
54+
canonical = canonicalize_columns([column_spec])[0]
55+
try:
56+
return column_names.index(canonical)
57+
except ValueError:
58+
pass
59+
60+
# Try case-insensitive
61+
lower_columns = [c.lower() for c in column_names]
62+
try:
63+
return lower_columns.index(canonical.lower())
64+
except ValueError:
65+
available = ', '.join(f"'{c}'" for c in column_names)
66+
raise ValueError(
67+
f"Column '{column_spec}' not found. Available columns: {available}"
68+
)
69+
2570
def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
2671
"""Returns a header from the stream and an the reaminder of the stream
2772
with the actual data.

tests/test_headerops.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,104 @@
44
import pytest
55

66

7+
import pytest
8+
from pairtools.lib.headerops import (
9+
canonicalize_columns,
10+
get_column_index,
11+
extract_column_names,
12+
)
13+
14+
def test_canonicalize_columns():
15+
# Test basic canonicalization
16+
assert canonicalize_columns(['chr1', 'chr2']) == ['chrom1', 'chrom2']
17+
assert canonicalize_columns(['chrom1', 'chrom2']) == ['chrom1', 'chrom2']
18+
assert canonicalize_columns(['pt', 'other']) == ['pair_type', 'other']
19+
20+
# Test mixed case
21+
assert canonicalize_columns(['Chr1', 'CHR2']) == ['chrom1', 'chrom2']
22+
assert canonicalize_columns(['CHR1', 'chr2']) == ['chrom1', 'chrom2']
23+
24+
# Test no changes needed
25+
assert canonicalize_columns(['readID', 'pos1']) == ['readID', 'pos1']
26+
27+
# Test empty input
28+
assert canonicalize_columns([]) == []
29+
30+
# Test all known aliases
31+
assert canonicalize_columns(['chr1', 'Chr2', 'PT']) == ['chrom1', 'chrom2', 'pair_type']
32+
33+
def test_get_column_index():
34+
# Setup test columns
35+
columns = ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']
36+
37+
# Test string lookup - direct matches
38+
assert get_column_index(columns, 'chr1') == 1
39+
assert get_column_index(columns, 'pos2') == 4
40+
assert get_column_index(columns, 'pair_type') == 7
41+
42+
# Test string lookup - canonicalized matches
43+
assert get_column_index(columns, 'chrom1') == 1
44+
assert get_column_index(columns, 'CHROM2') == 3
45+
assert get_column_index(columns, 'PT') == 7
46+
47+
# Test case insensitive matches
48+
assert get_column_index(columns, 'CHR1') == 1
49+
assert get_column_index(columns, 'ChR2') == 3
50+
51+
# Test integer lookup
52+
assert get_column_index(columns, 0) == 0
53+
assert get_column_index(columns, 3) == 3
54+
assert get_column_index(columns, 7) == 7
55+
56+
# Test error cases
57+
with pytest.raises(ValueError, match="Column 'nonexistent' not found"):
58+
get_column_index(columns, 'nonexistent')
59+
60+
with pytest.raises(ValueError, match="Column index 100 out of range"):
61+
get_column_index(columns, 100)
62+
63+
with pytest.raises(AttributeError, match="Column spec must be string or integer"):
64+
get_column_index(columns, 3.14)
65+
66+
def test_integration_with_extract_column_names():
67+
# Test with actual header format
68+
header = [
69+
"## pairs format v1.0",
70+
"#columns: readID chr1 pos1 chr2 pos2 strand1 strand2 pair_type",
71+
"#chromsize: chr1 1000",
72+
"#chromsize: chr2 800"
73+
]
74+
75+
columns = extract_column_names(header)
76+
assert columns == ['readID', 'chr1', 'pos1', 'chr2', 'pos2', 'strand1', 'strand2', 'pair_type']
77+
78+
# Test canonicalized column access
79+
assert get_column_index(columns, 'chrom1') == 1
80+
assert get_column_index(columns, 'chrom2') == 3
81+
assert get_column_index(columns, 'pt') == 7
82+
83+
# Test with alternative header format
84+
header2 = [
85+
"## pairs format v1.0",
86+
"#columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type",
87+
]
88+
columns2 = extract_column_names(header2)
89+
assert get_column_index(columns2, 'chr1') == 1
90+
assert get_column_index(columns2, 'chr2') == 3
91+
92+
def test_edge_cases():
93+
# Test empty columns
94+
with pytest.raises(ValueError):
95+
get_column_index([], 'chrom1')
96+
97+
# Test invalid column spec type
98+
with pytest.raises(AttributeError):
99+
get_column_index(['a', 'b', 'c'], 3.14) # float not supported
100+
101+
# Test negative indices
102+
assert get_column_index(['a', 'b', 'c'], -1) == 2 # Python-style negative indexing
103+
104+
7105
def test_make_standard_header():
8106
header = headerops.make_standard_pairsheader()
9107

0 commit comments

Comments
 (0)