diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 64ee60b..230b7ad 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -5,38 +5,37 @@ name: Pytest on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: build: - runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.9', '3.10', '3.11', '3.12' ] + python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install --upgrade pip wheel setuptools - pip install numpy cython - pip install -r requirements-dev.txt - pip install -e . - - name: Lint with flake8 - run: | - pip install flake8 - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pip install pytest - pytest + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip wheel setuptools + pip install numpy cython + pip install -r requirements-dev.txt + pip install -e . + - name: Lint with flake8 + run: | + pip install flake8 + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pip install pytest + pytest diff --git a/cooltools/sandbox/cli_obs_over_exp.py b/cooltools/sandbox/cli_obs_over_exp.py new file mode 100644 index 0000000..8f2d12d --- /dev/null +++ b/cooltools/sandbox/cli_obs_over_exp.py @@ -0,0 +1,277 @@ +""" +CLI tool for creating observed/expected cooler files. + +This file lives in the sandbox and can be moved to cooltools/cli/ +once the obs_over_exp_cooler module is promoted from sandbox to api. + +Usage (standalone): + python -m cooltools.sandbox.cli_obs_over_exp --help + +Usage (as part of cooltools CLI, when registered): + cooltools obs-over-exp COOL_PATH --cis-expected CIS_EXP.tsv -o output_oe.cool +""" +import click +import logging + +logging.basicConfig(level=logging.INFO) + + +@click.command("obs-over-exp") +@click.argument("cool_path", metavar="COOL_PATH", type=str, nargs=1) +@click.option( + "--cis-expected", + help="Path to a TSV file with cis-expected in cooltools format " + "(as produced by `cooltools expected-cis`). " + "If not provided, cis-expected is computed from the cooler.", + type=click.Path(exists=True), + required=False, + default=None, +) +@click.option( + "--trans-expected", + help="Path to a TSV file with trans-expected in cooltools format " + "(as produced by `cooltools expected-trans`). " + "If not provided, trans-expected is computed from the cooler.", + type=click.Path(exists=True), + required=False, + default=None, +) +@click.option( + "--output", + "-o", + help="Path to the output .cool file with O/E values.", + type=str, + required=True, +) +@click.option( + "--view", + "--regions", + help="Path to a 3 or 4-column BED file with genomic regions. " + "Must match the regions used to compute expected. " + "When not specified, full chromosomes from the cooler are used.", + type=click.Path(exists=True), + required=False, + default=None, +) +@click.option( + "--expected-value-col", + help="Name of the column in the expected file(s) that contains " + "the expected interaction values to divide by.", + type=str, + default="balanced.avg", + show_default=True, +) +@click.option( + "--clr-weight-name", + help="Name of the balancing weight column in the cooler. " + "Provide empty string to use raw (unbalanced) data.", + type=str, + default="weight", + show_default=True, +) +@click.option( + "--ignore-diags", + help="Number of short-distance diagonals to ignore.", + type=int, + default=2, + show_default=True, +) +@click.option( + "--smooth", + help="If set and expected is computed on the fly, apply smoothing " + "to cis-expected before division.", + is_flag=True, +) +@click.option( + "--aggregate-smoothed", + help="If set and expected is computed on the fly, aggregate smoothed " + "expected over all regions before division.", + is_flag=True, +) +@click.option( + "--smooth-sigma", + help="Sigma for smoothing (only used when computing expected on the fly).", + type=float, + default=0.1, + show_default=True, +) +@click.option( + "--chunksize", + "-c", + help="Number of pixels per chunk for streaming computation.", + type=int, + default=int(1e6), + show_default=True, +) +@click.option( + "--nproc", + "-p", + help="Number of processes for expected computation " + "(only used when expected is computed on the fly).", + type=int, + default=4, + show_default=True, +) +@click.option( + "--no-trans", + help="If set, skip trans (inter-chromosomal) pixels entirely. " + "Only cis O/E values will be written to the output cooler. " + "This is useful when only cis-expected is available.", + is_flag=True, +) +@click.option( + "--output-dtype", + help="Data type for O/E values in the output cooler. " + "'float32' saves disk space (~2x smaller), 'float64' keeps full precision.", + type=click.Choice(["float32", "float64"]), + default="float32", + show_default=True, +) +@click.option( + "--mode", + help="File mode for cooler creation. 'a' (append) allows writing " + "multiple resolutions into one .mcool file. 'w' overwrites.", + type=click.Choice(["a", "w"]), + default="a", + show_default=True, +) +def obs_over_exp( + cool_path, + cis_expected, + trans_expected, + output, + view, + expected_value_col, + clr_weight_name, + ignore_diags, + smooth, + aggregate_smoothed, + smooth_sigma, + chunksize, + nproc, + no_trans, + output_dtype, + mode, +): + """ + Divide a cooler's contact matrix by expected and save as a new cooler. + + Creates a new .cool file where pixel values are observed/expected (O/E) + ratios. Cis (intra-chromosomal) and trans (inter-chromosomal) expected + can be provided as pre-computed TSV files, or computed on the fly from + the cooler. + + COOL_PATH : Path to a .cool file with a Hi-C contact map. + + Examples: + + \b + Using pre-computed expected files: + cooltools obs-over-exp data.cool \\ + --cis-expected cis_exp.tsv \\ + --trans-expected trans_exp.tsv \\ + --expected-value-col balanced.avg.smoothed.agg \\ + -o data_oe.cool + + \b + Computing expected on the fly: + cooltools obs-over-exp data.cool \\ + --smooth --aggregate-smoothed \\ + --expected-value-col balanced.avg.smoothed.agg \\ + -o data_oe.cool + """ + import cooler + import pandas as pd + from cooltools.lib.common import make_cooler_view + from cooltools.lib.io import read_viewframe_from_file, read_expected_from_file + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + clr = cooler.Cooler(cool_path) + + # Handle empty string for clr_weight_name (means raw data) + if clr_weight_name == "": + clr_weight_name = None + + # ---- Load view ---- + if view is not None: + view_df = read_viewframe_from_file(view, verify_cooler=clr, check_sorting=True) + else: + view_df = make_cooler_view(clr) + + # ---- Load expected files ---- + # Only validate the column(s) the user actually needs. + # For cis: we need expected_value_col (e.g. "balanced.avg.smoothed.agg"). + # For trans: smoothed columns don't exist, so we'll try the requested col + # first, and fall back to "balanced.avg" / "count.avg" at stitch-time. + expected_cis_df = None + expected_trans_df = None + + if cis_expected is not None: + logging.info(f"Reading cis-expected from {cis_expected} ...") + expected_cis_df = read_expected_from_file( + cis_expected, + contact_type="cis", + expected_value_cols=[expected_value_col], + verify_view=view_df, + verify_cooler=clr, + ) + + if trans_expected is not None: + logging.info(f"Reading trans-expected from {trans_expected} ...") + # Trans expected typically has "balanced.avg" / "count.avg" only, + # so validate with the base column; create_obs_over_exp_cooler will + # handle the fallback logic. + trans_base_col = "balanced.avg" if clr_weight_name else "count.avg" + expected_trans_df = read_expected_from_file( + trans_expected, + contact_type="trans", + expected_value_cols=[trans_base_col], + verify_view=view_df, + verify_cooler=clr, + ) + elif no_trans: + # User explicitly skips trans O/E + expected_trans_df = False + # else: expected_trans_df stays None => computed on the fly + + # ---- Determine the right expected_value_col ---- + # When computing on the fly and smooth is requested, auto-adjust + # the expected_value_col to the smoothed column name + if cis_expected is None and smooth: + base_col = "balanced.avg" if clr_weight_name else "count.avg" + auto_col = f"{base_col}.smoothed" + if aggregate_smoothed: + auto_col = f"{auto_col}.agg" + if expected_value_col in ("balanced.avg", "count.avg"): + logging.info( + f"Smoothing requested; auto-adjusting expected_value_col " + f"from '{expected_value_col}' to '{auto_col}'" + ) + expected_value_col = auto_col + + # ---- Run ---- + create_obs_over_exp_cooler( + clr=clr, + out_path=output, + expected_cis_df=expected_cis_df, + expected_trans_df=expected_trans_df, + view_df=view_df, + expected_value_col=expected_value_col, + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + smooth_cis=smooth, + aggregate_smoothed=aggregate_smoothed, + smooth_sigma=smooth_sigma, + chunksize=chunksize, + nproc=nproc, + output_dtype=output_dtype, + mode=mode, + ) + + logging.info(f"Done! O/E cooler written to: {output}") + + +# ---- Standalone entry point ---- +# This allows running: python -m cooltools.sandbox.cli_obs_over_exp --help +if __name__ == "__main__": + obs_over_exp() diff --git a/cooltools/sandbox/obs_over_exp_cooler.py b/cooltools/sandbox/obs_over_exp_cooler.py index 58dd92b..54954b6 100644 --- a/cooltools/sandbox/obs_over_exp_cooler.py +++ b/cooltools/sandbox/obs_over_exp_cooler.py @@ -15,6 +15,9 @@ `obs_over_exp` function and yields chunks of observed/expected pixel table. Such a "stream" can be used in cooler.create as a "pixels" argument to write obs/exp cooler-file. +create_obs_over_exp_cooler - the main user-facing function that takes a cooler, + pre-computed expected (cis and optionally trans), and writes a new cooler file + with observed/expected values as the pixel counts. It also includes 3 helper functions (used in `expected_full_fast`): make_pairwise_expected_table - a function that creates an empty table for the @@ -742,6 +745,31 @@ def obs_over_exp( return pixels_oe +def _obs_over_exp_worker(args): + """ + Top-level picklable worker function for parallel obs_over_exp computation. + + Reads a pixel chunk from the cooler and computes O/E for that chunk. + Must be a module-level function so that multiprocess can pickle it. + """ + (clr_uri, lo, hi, bins_view, expected_full, + view_column_name, expected_column_name, + clr_weight_name, oe_column_name) = args + + clr = cooler.Cooler(clr_uri) + pixels = clr.pixels()[lo:hi] + oe_chunk = obs_over_exp( + pixels, + bins_view, + expected_full, + view_column_name=view_column_name, + expected_column_name=expected_column_name, + clr_weight_name=clr_weight_name, + oe_column_name=oe_column_name, + ) + return oe_chunk[["bin1_id", "bin2_id", oe_column_name]] + + def obs_over_exp_generator( clr, expected_full, @@ -749,12 +777,17 @@ def obs_over_exp_generator( expected_column_name="expected", clr_weight_name='weight', oe_column_name="count", - chunksize = 1_000_000 + chunksize=1_000_000, + nproc=1, ): """ Generator yielding chunks of pixels with pre-caluclated observed over expected. + When nproc > 1, chunks are computed in parallel using a process pool + with ordered mapping (``imap``), so the output order is preserved — + a requirement for ``cooler.create_cooler(ordered=True)``. + Parameters ---------- clr : cooler.Cooler @@ -772,6 +805,8 @@ def obs_over_exp_generator( Use raw unbalanced data, when None. chunksize : int, optional Size of pixel table chunks to process and output + nproc : int, optional + Number of worker processes. 1 (default) means sequential. Yields ------ @@ -791,18 +826,340 @@ def obs_over_exp_generator( # define chunks of pixels to work on spans = partition(0, len(clr.pixels()), chunksize) - for span in spans: - lo, hi = span - # logging.info(f"Calculating observed over expected for pixels [{lo}:{hi}]") - oe_chunk = obs_over_exp( - clr.pixels()[lo:hi], - bins_view, - expected_full, - view_column_name=view_column_name, - expected_column_name=expected_column_name, + + if nproc > 1: + # Build the cooler URI so workers can re-open it independently + clr_uri = clr.uri # e.g. "path.cool" or "path.mcool::/resolutions/10000" + + # Prepare argument tuples for the worker function + job_args = [ + (clr_uri, lo, hi, bins_view, expected_full, + view_column_name, expected_column_name, + clr_weight_name, oe_column_name) + for lo, hi in spans + ] + + logging.info( + f"Computing O/E in parallel with {nproc} processes " + f"({len(job_args)} chunks of {chunksize} pixels) ..." + ) + + # imap preserves order — critical for cooler.create_cooler(ordered=True) + pool = mp.Pool(nproc) + try: + for oe_chunk in pool.imap(_obs_over_exp_worker, job_args): + yield oe_chunk + finally: + pool.close() + pool.join() + else: + for span in spans: + lo, hi = span + oe_chunk = obs_over_exp( + clr.pixels()[lo:hi], + bins_view, + expected_full, + view_column_name=view_column_name, + expected_column_name=expected_column_name, + clr_weight_name=clr_weight_name, + oe_column_name=oe_column_name, + ) + yield oe_chunk[["bin1_id", "bin2_id", oe_column_name]] + + +def create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df, + expected_trans_df=None, + view_df=None, + expected_value_col="balanced.avg", + clr_weight_name="weight", + ignore_diags=2, + smooth_cis=False, + aggregate_smoothed=False, + smooth_sigma=0.1, + aggregate_trans=False, + chunksize=1_000_000, + nproc=4, + output_dtype="float32", + mode="a", +): + """ + Create a new cooler file where pixel values are observed/expected ratios. + + This is the main user-facing function that: + 1. Takes a cooler and pre-computed cis-expected (and optionally trans-expected). + If expected DataFrames are not provided, computes them from the cooler. + 2. Stitches cis and trans expected into a "full" expected covering the entire heatmap. + 3. Divides observed (balanced or raw) pixel values by the expected, and writes + the resulting O/E values into a new cooler file. + + Expected DataFrames should be in the standard cooltools format, e.g. as produced + by `cooltools.expected_cis()` and `cooltools.expected_trans()`. + + Parameters + ---------- + clr : cooler.Cooler + Input cooler object. + out_path : str + Path to the output .cool file where O/E values will be stored. + expected_cis_df : pd.DataFrame or None + Pre-computed cis-expected DataFrame in cooltools format. + If None, cis-expected is computed from the cooler. + expected_trans_df : pd.DataFrame, None, or False + Pre-computed trans-expected DataFrame in cooltools format. + If None, trans-expected is computed from the cooler. + If False, trans pixels are skipped entirely (cis-only O/E). + view_df : viewframe or None + A viewframe defining genomic regions. Must match the regions + used to compute expected. If None, uses full chromosomes from + the cooler. + expected_value_col : str + Name of the column in expected DataFrames that contains the + expected interaction values to divide by. + Default is "balanced.avg". Other common choices: + "balanced.avg.smoothed", "balanced.avg.smoothed.agg", + "count.avg" (for raw data). + clr_weight_name : str or None + Name of the balancing weight column in the cooler. + Set to None to use raw (unbalanced) data. + ignore_diags : int + Number of short-distance diagonals to ignore. + Pixels on these diagonals will be excluded from the output. + smooth_cis : bool + Only used when expected_cis_df is None and expected needs + to be computed. Apply smoothing to cis-expected. + aggregate_smoothed : bool + Only used when expected_cis_df is None. Aggregate smoothed + expected over all regions. + smooth_sigma : float + Only used when expected_cis_df is None. Sigma for smoothing. + aggregate_trans : bool + Only used when expected_trans_df is None. Aggregate + trans-expected at the inter-chromosomal level. + chunksize : int + Number of pixels per chunk for streaming computation. + nproc : int + Number of processes for expected computation (when expected + needs to be computed) and for parallel O/E pixel calculation. + output_dtype : str or numpy dtype + Data type for the O/E values stored in the output cooler. + Default is "float32" which is sufficient for visualisation + and saves disk space (~2x smaller files). Use "float64" when + full double-precision is needed for downstream analyses. + mode : str + File mode for ``cooler.create_cooler()``. Default is "a" + (append), which allows writing multiple resolutions into + one ``.mcool`` file by calling this function consecutively + for each resolution. Use "w" to overwrite an existing file. + + Returns + ------- + None + Writes the output cooler to `out_path`. + + Notes + ----- + The output cooler has the same bins as the input cooler, but the + "count" column in pixels contains float O/E values instead of + integer raw counts. Pixels with NaN O/E values (e.g. from + bad bins, ignored diagonals, or missing expected) are excluded. + + The workflow is: + 1. Prepare a "full" expected table covering the entire heatmap by + stitching cis and trans expected together with integer region + labels matching the viewframe. + 2. Stream through pixel chunks, annotate each pixel with its + region pair and genomic distance, merge with expected, divide + observed by expected. + 3. Write the resulting O/E pixel stream into a new cooler using + `cooler.create_cooler()`. + + Examples + -------- + Using pre-computed expected files: + + >>> import cooler + >>> import pandas as pd + >>> from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + >>> clr = cooler.Cooler("mydata.cool") + >>> cis_exp = pd.read_table("expected_cis.tsv") + >>> trans_exp = pd.read_table("expected_trans.tsv") + >>> create_obs_over_exp_cooler( + ... clr, "mydata_oe.cool", + ... expected_cis_df=cis_exp, + ... expected_trans_df=trans_exp, + ... expected_value_col="balanced.avg.smoothed.agg", + ... ) + + Computing expected on the fly: + + >>> create_obs_over_exp_cooler( + ... clr, "mydata_oe.cool", + ... expected_cis_df=None, + ... expected_trans_df=None, + ... smooth_cis=True, + ... aggregate_smoothed=True, + ... ) + """ + + logging.info( + f"Creating O/E cooler from {clr.filename} " + f"(binsize={clr.binsize}, {len(clr.chromnames)} chroms, " + f"{len(clr.pixels())} pixels) -> {out_path}" + ) + + # ---- 1. Prepare view ------------------------------------------------ + if view_df is None: + view_df = make_cooler_view(clr) + else: + try: + _ = is_compatible_viewframe( + view_df, clr, check_sorting=True, raise_errors=True, + ) + except Exception as e: + raise ValueError("view_df is not a valid viewframe or incompatible") from e + + # ---- 2. Compute expected if not provided ----------------------------- + if expected_cis_df is None: + logging.info("No cis-expected provided, computing from the cooler ...") + expected_cis_df = expected_cis( + clr, + view_df=view_df, + intra_only=False, # need all cis pairwise blocks + smooth=smooth_cis, + aggregate_smoothed=aggregate_smoothed, + smooth_sigma=smooth_sigma, + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize * 10, # larger chunks for expected computation + nproc=nproc, + ) + + if expected_trans_df is None: + logging.info( + "No trans-expected provided, computing from the cooler ... " + "Pass expected_trans_df=False to skip trans and keep only cis O/E." + ) + expected_trans_df = expected_trans( + clr, + view_df=view_df, clr_weight_name=clr_weight_name, - oe_column_name=oe_column_name, + chunksize=chunksize * 10, + nproc=nproc, ) - # yield pixel-table-like chunks of observed over expected - yield oe_chunk[["bin1_id", "bin2_id", oe_column_name]] + # Aggregate trans-expected at the chromosome level if requested. + # This is useful when a sub-chromosomal view is provided, so that + # all inter-arm blocks of the same chromosome pair share the same + # expected value. + if aggregate_trans: + view_label_agg = ( + view_df.reset_index() + .rename(columns={"index": "r"}) + .set_index("name") + ) + additive_cols = ["n_valid", "count.sum"] + if clr_weight_name: + additive_cols.append("balanced.sum") + _cpb_agg = expected_trans_df.groupby( + [ + view_label_agg["chrom"].loc[expected_trans_df[_REGION1_NAME]].to_numpy(), + view_label_agg["chrom"].loc[expected_trans_df[_REGION2_NAME]].to_numpy(), + ] + )[additive_cols].transform("sum") + expected_trans_df["count.avg.agg"] = ( + _cpb_agg["count.sum"] / _cpb_agg["n_valid"] + ) + if clr_weight_name: + expected_trans_df["balanced.avg.agg"] = ( + _cpb_agg["balanced.sum"] / _cpb_agg["n_valid"] + ) + logging.info("Aggregated trans-expected at the chromosome level.") + + # ---- 3. Stitch cis + trans into a "full" expected -------------------- + # Determine the expected value column name for cis and trans + if expected_value_col not in expected_cis_df.columns: + raise ValueError( + f"Column '{expected_value_col}' not found in cis-expected. " + f"Available columns: {list(expected_cis_df.columns)}" + ) + + # Build a unified view label mapping: region name -> integer index + view_label = ( + view_df.reset_index() + .rename(columns={"index": "r"}) + .set_index("name") + ) + + # Prepare cis expected with "r1", "r2", "dist" for merging + cis_df = expected_cis_df.copy() + cis_df["expected"] = cis_df[expected_value_col] + cis_df["r1"] = view_label["r"].loc[cis_df[_REGION1_NAME]].to_numpy() + cis_df["r2"] = view_label["r"].loc[cis_df[_REGION2_NAME]].to_numpy() + + # Prepare trans expected (if provided) + if expected_trans_df is not False and expected_trans_df is not None: + trans_value_col = expected_value_col + # trans expected might use a simpler column name (no smoothing suffix) + # try to find the best match + if trans_value_col not in expected_trans_df.columns: + # fall back to balanced.avg or count.avg + for fallback in ["balanced.avg", "count.avg"]: + if fallback in expected_trans_df.columns: + trans_value_col = fallback + logging.info( + f"Column '{expected_value_col}' not found in trans-expected, " + f"falling back to '{trans_value_col}'." + ) + break + else: + raise ValueError( + f"Column '{expected_value_col}' not found in trans-expected. " + f"Available columns: {list(expected_trans_df.columns)}" + ) + + trans_df = expected_trans_df.copy() + trans_df["expected"] = trans_df[trans_value_col] + trans_df[_DIST_NAME] = TRANS_DIST_VALUE # sentinel value for trans + trans_df["r1"] = view_label["r"].loc[trans_df[_REGION1_NAME]].to_numpy() + trans_df["r2"] = view_label["r"].loc[trans_df[_REGION2_NAME]].to_numpy() + else: + trans_df = None + logging.info("No trans-expected, output cooler will contain cis O/E only.") + + # Concatenate into full expected + keep_cols = ["r1", "r2", _DIST_NAME, "expected"] + parts = [cis_df[keep_cols]] + if trans_df is not None: + parts.append(trans_df[keep_cols]) + full_expected = pd.concat(parts, ignore_index=True) + + # ---- 4. Generate O/E pixels and write cooler ------------------------- + logging.info(f"Generating O/E pixels and writing to {out_path} ...") + + pixel_generator = obs_over_exp_generator( + clr, + expected_full=full_expected, + view_df=view_df, + expected_column_name="expected", + clr_weight_name=clr_weight_name, + oe_column_name="count", # name it "count" for the output cooler + chunksize=chunksize, + nproc=nproc, + ) + + # Create the new cooler with O/E values + # Use the original bins (chrom, start, end only - no weights) + bins = clr.bins()[:][["chrom", "start", "end"]] + cooler.create_cooler( + out_path, + bins, + pixel_generator, + dtypes={"count": np.dtype(output_dtype)}, # O/E values are floats + ordered=True, + mode=mode, + ) + + logging.info(f"Done. O/E cooler written to {out_path}") diff --git a/tests/test_obs_over_exp_cooler.py b/tests/test_obs_over_exp_cooler.py new file mode 100644 index 0000000..34c7e11 --- /dev/null +++ b/tests/test_obs_over_exp_cooler.py @@ -0,0 +1,744 @@ +""" +Tests for cooltools.sandbox.obs_over_exp_cooler.create_obs_over_exp_cooler +and cooltools.sandbox.cli_obs_over_exp. + +Uses the 10 Mb-resolution test cooler (CN.mm9.10000kb.cool, ~38K pixels) +for fast execution. +""" +import os.path as op +import numpy as np +import pandas as pd +from numpy import testing + +import bioframe +import cooler +import pytest + +from click.testing import CliRunner + +import cooltools.api.expected +from cooltools.lib.common import make_cooler_view, assign_supports + +# --- Common test parameters ------------------------------------------------- +ignore_diags = 2 +clr_weight_name = "weight" +chunksize = 10_000 +# Use the coarse 10 Mb cooler for speed (278 bins, 38K pixels) +_COOL_FILE = "data/CN.mm9.10000kb.cool" +assumed_binsize = 10_000_000 + +chromsizes_file = op.join( + op.dirname(op.realpath(__file__)), + "data/mm9.chrom.sizes.reduced", +) +chromsizes = bioframe.read_chromsizes(chromsizes_file) +chromosomes = list(chromsizes.index) + +# Build a view that splits each of the first 4 chroms in half (aligned to 10 Mb) +common_regions = [] +for i in range(4): + chrom = chromosomes[i] + halfway_chrom = int(chromsizes[chrom] / 2) + halfway_chrom = round(halfway_chrom / assumed_binsize) * assumed_binsize + reg1 = (chrom, 0, halfway_chrom) + reg2 = (chrom, halfway_chrom, chromsizes[chrom]) + common_regions.append(reg1) + common_regions.append(reg2) + +view_df = bioframe.make_viewframe(common_regions, name_style="ucsc") + + +# --------------------------------------------------------------------------- +# Fixtures – compute expected once per module and share across tests +# --------------------------------------------------------------------------- + +@pytest.fixture(scope="module") +def clr(request): + """Cooler object for the 10 Mb test dataset.""" + return cooler.Cooler(op.join(request.fspath.dirname, _COOL_FILE)) + + +@pytest.fixture(scope="module") +def cool_path(request): + """Absolute path to the test cooler file.""" + return op.join(request.fspath.dirname, _COOL_FILE) + + +@pytest.fixture(scope="module") +def cis_exp(clr): + """Pre-computed cis expected (non-smoothed).""" + return cooltools.api.expected.expected_cis( + clr, + view_df=view_df, + intra_only=False, + clr_weight_name=clr_weight_name, + chunksize=chunksize, + ignore_diags=ignore_diags, + ) + + +@pytest.fixture(scope="module") +def cis_exp_smoothed(clr): + """Pre-computed cis expected with smoothing + aggregation.""" + return cooltools.api.expected.expected_cis( + clr, + view_df=view_df, + intra_only=False, + clr_weight_name=clr_weight_name, + chunksize=chunksize, + ignore_diags=ignore_diags, + smooth=True, + aggregate_smoothed=True, + ) + + +@pytest.fixture(scope="module") +def trans_exp(clr): + """Pre-computed trans expected.""" + return cooltools.api.expected.expected_trans( + clr, + view_df=view_df, + clr_weight_name=clr_weight_name, + chunksize=chunksize, + ) + + +def _verify_oe_cooler( + oe_clr, + original_clr, + expected_cis_df, + expected_trans_df, + view_df, + expected_value_col="balanced.avg", + atol=1e-6, + check_trans=True, +): + """ + Verify that the O/E cooler has correct values by manually computing O/E + for a few region pairs and comparing with the values stored in the cooler. + + Parameters + ---------- + oe_clr : cooler.Cooler + The output O/E cooler to verify. + original_clr : cooler.Cooler + The original input cooler. + expected_cis_df : pd.DataFrame + Cis expected used to create the O/E cooler. + expected_trans_df : pd.DataFrame or None + Trans expected, or None if trans was not included. + view_df : pd.DataFrame + The viewframe. + expected_value_col : str + Column name used for expected values. + atol : float + Absolute tolerance for float comparisons. + check_trans : bool + Whether to also verify trans (inter-chromosomal) pixels. + """ + # 1. Basic structural checks + assert oe_clr.binsize == original_clr.binsize + assert len(oe_clr.bins()) == len(original_clr.bins()) + assert list(oe_clr.chromnames) == list(original_clr.chromnames) + + # 2. Check that pixels have float count values (O/E ratios) + oe_pixels = oe_clr.pixels()[:] + assert np.issubdtype(oe_pixels["count"].dtype, np.floating) + + # 3. Verify a cis region: pick the first intra-chromosomal block in the view + region_name = view_df.iloc[0]["name"] + region_ucsc = ( + view_df.iloc[0].iloc[0], + view_df.iloc[0].iloc[1], + view_df.iloc[0].iloc[2], + ) + + # Get balanced matrix from original + orig_matrix = original_clr.matrix(balance=clr_weight_name).fetch(region_ucsc) + oe_matrix = oe_clr.matrix(balance=False).fetch(region_ucsc) + + # Get expected for this region + mask = ( + (expected_cis_df["region1"] == region_name) + & (expected_cis_df["region2"] == region_name) + ) + region_expected = expected_cis_df.loc[mask].set_index("dist")[expected_value_col] + + # For each element in the matrix, verify O/E = balanced / expected[dist] + n = orig_matrix.shape[0] + for d in range(ignore_diags, min(n, 10)): # check a few diagonals + exp_val = region_expected.get(d, np.nan) + if np.isnan(exp_val) or exp_val == 0: + continue + # Get the diagonal from both matrices + orig_diag = np.diag(orig_matrix, d) + oe_diag = np.diag(oe_matrix, d) + # Where original has valid values, O/E should be balanced / expected + valid = ~np.isnan(orig_diag) & ~np.isnan(oe_diag) + if valid.any(): + expected_oe = orig_diag[valid] / exp_val + testing.assert_allclose( + oe_diag[valid], + expected_oe, + atol=atol, + rtol=1e-4, + err_msg=f"O/E mismatch at diagonal {d} for region {region_name}", + ) + + # 4. Verify O/E values: finite values should be non-negative + # NaN values are expected (bad bins, ignored diagonals, etc.) + oe_all = oe_clr.pixels()[:]["count"] + finite_mask = np.isfinite(oe_all) + assert finite_mask.sum() > 0, "No finite O/E values found" + assert np.all(oe_all[finite_mask] >= 0), "Found negative O/E values" + + # 5. If checking trans, verify that trans pixels exist (or don't) + if check_trans and expected_trans_df is not None: + # There should be some inter-chromosomal pixels in the O/E cooler + oe_pix = oe_clr.pixels()[:] + bins = oe_clr.bins()[:] + annotated = cooler.annotate(oe_pix, bins) + trans_mask = annotated["chrom1"] != annotated["chrom2"] + assert trans_mask.sum() > 0, "Expected trans pixels in O/E cooler but found none" + # Some trans O/E values should be finite + trans_vals = oe_pix.loc[trans_mask, "count"] + assert np.isfinite(trans_vals).sum() > 0, ( + "Trans pixels exist but none have finite O/E values" + ) + elif not check_trans: + # With no-trans, trans pixels may still be present (streamed from + # the cooler) but should all have NaN O/E values since there was + # no trans expected to divide by. + oe_pix = oe_clr.pixels()[:] + bins = oe_clr.bins()[:] + annotated = cooler.annotate(oe_pix, bins) + trans_mask = annotated["chrom1"] != annotated["chrom2"] + if trans_mask.sum() > 0: + trans_vals = oe_pix.loc[trans_mask, "count"] + assert np.all(np.isnan(trans_vals)), ( + "Cis-only O/E cooler has finite trans pixel values" + ) + + +# --------------------------------------------------------------------------- +# Tests for create_obs_over_exp_cooler (Python API) +# --------------------------------------------------------------------------- + +class TestCreateObsOverExpCooler: + """Tests for the create_obs_over_exp_cooler function.""" + + def test_basic_with_precomputed_expected(self, clr, cis_exp, trans_exp, tmp_path): + """Test basic O/E cooler creation with pre-computed cis and trans expected.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_basic.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=trans_exp, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + # Verify the output + oe_clr = cooler.Cooler(out_path) + _verify_oe_cooler( + oe_clr, clr, cis_exp, trans_exp, view_df, + expected_value_col="balanced.avg", + ) + + def test_cis_only_no_trans(self, clr, cis_exp, tmp_path): + """Test O/E cooler with expected_trans_df=False (cis only).""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_cis_only.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=False, # skip trans + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + _verify_oe_cooler( + oe_clr, clr, cis_exp, None, view_df, + expected_value_col="balanced.avg", + check_trans=False, + ) + + def test_smoothed_expected_value_col(self, clr, cis_exp_smoothed, tmp_path): + """Test using a smoothed expected value column.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_smoothed.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp_smoothed, + expected_trans_df=False, # skip trans (smoothed cols don't exist there) + view_df=view_df, + expected_value_col="balanced.avg.smoothed.agg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + # Structural check + assert len(oe_clr.bins()) == len(clr.bins()) + oe_pixels = oe_clr.pixels()[:] + assert np.issubdtype(oe_pixels["count"].dtype, np.floating) + assert len(oe_pixels) > 0 + + def test_compute_expected_on_the_fly(self, clr, tmp_path): + """Test that expected is computed on the fly when not provided.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_auto.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=None, # compute on the fly + expected_trans_df=None, # compute on the fly + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + nproc=1, + ) + + oe_clr = cooler.Cooler(out_path) + assert len(oe_clr.bins()) == len(clr.bins()) + oe_pixels = oe_clr.pixels()[:] + assert np.issubdtype(oe_pixels["count"].dtype, np.floating) + assert len(oe_pixels) > 0 + + def test_no_view_uses_full_chroms(self, clr, tmp_path): + """Test that view_df=None defaults to full chromosomes.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_noview.cool") + + # Compute expected with full-chromosome view for comparison + full_view = make_cooler_view(clr) + full_cis_exp = cooltools.api.expected.expected_cis( + clr, + view_df=full_view, + intra_only=False, + clr_weight_name=clr_weight_name, + chunksize=chunksize, + ignore_diags=ignore_diags, + ) + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=full_cis_exp, + expected_trans_df=False, + view_df=None, # should default to full chroms + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + assert len(oe_clr.bins()) == len(clr.bins()) + oe_pixels = oe_clr.pixels()[:] + assert len(oe_pixels) > 0 + + def test_trans_fallback_column(self, clr, cis_exp_smoothed, trans_exp, tmp_path): + """Test that trans expected falls back when the requested column is missing.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_trans_fallback.cool") + + # Request "balanced.avg.smoothed.agg" which exists in cis but not in trans. + # The function should fall back to "balanced.avg" for trans. + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp_smoothed, + expected_trans_df=trans_exp, + view_df=view_df, + expected_value_col="balanced.avg.smoothed.agg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + oe_pixels = oe_clr.pixels()[:] + assert len(oe_pixels) > 0 + + def test_invalid_expected_value_col_raises(self, clr, cis_exp, tmp_path): + """Test that an invalid expected_value_col raises ValueError.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_bad_col.cool") + + with pytest.raises(ValueError, match="not found in cis-expected"): + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=False, + view_df=view_df, + expected_value_col="nonexistent_column", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + ) + + def test_output_bins_have_no_weights(self, clr, cis_exp, tmp_path): + """Test that output cooler bins only have chrom/start/end (no weight cols).""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_bins.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=False, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + bin_cols = list(oe_clr.bins().columns) + assert bin_cols == ["chrom", "start", "end"] + + def test_oe_values_are_around_one(self, clr, cis_exp, tmp_path): + """Test that O/E values are centered roughly around 1.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / "oe_around_one.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=False, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + oe_clr = cooler.Cooler(out_path) + oe_vals = oe_clr.pixels()[:]["count"].values + # Median of finite O/E should be somewhat near 1 for a well-balanced dataset. + # NaN pixels (bad bins, trans in cis-only, ignored diags) are excluded. + finite_vals = oe_vals[np.isfinite(oe_vals)] + assert len(finite_vals) > 0, "No finite O/E values" + median_oe = np.median(finite_vals) + assert 0.01 < median_oe < 100, f"Median O/E = {median_oe}, expected near 1" + + @pytest.mark.parametrize("dtype", ["float32", "float64"]) + def test_output_dtype(self, clr, cis_exp, tmp_path, dtype): + """Test that output_dtype controls the storage precision of O/E values.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_path = str(tmp_path / f"oe_{dtype}.cool") + + create_obs_over_exp_cooler( + clr, + out_path, + expected_cis_df=cis_exp, + expected_trans_df=False, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + output_dtype=dtype, + ) + + oe_clr = cooler.Cooler(out_path) + oe_pixels = oe_clr.pixels()[:] + assert oe_pixels["count"].dtype == np.dtype(dtype), ( + f"Expected dtype {dtype}, got {oe_pixels['count'].dtype}" + ) + + def test_parallel_matches_sequential(self, clr, cis_exp, trans_exp, tmp_path): + """Test that nproc>1 gives identical results to nproc=1.""" + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + + out_seq = str(tmp_path / "oe_seq.cool") + out_par = str(tmp_path / "oe_par.cool") + + common_kwargs = dict( + expected_cis_df=cis_exp, + expected_trans_df=trans_exp, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + output_dtype="float64", # use float64 to avoid precision loss in comparison + ) + + create_obs_over_exp_cooler(clr, out_seq, nproc=1, **common_kwargs) + create_obs_over_exp_cooler(clr, out_par, nproc=2, **common_kwargs) + + seq_pixels = cooler.Cooler(out_seq).pixels()[:].sort_values( + ["bin1_id", "bin2_id"] + ).reset_index(drop=True) + par_pixels = cooler.Cooler(out_par).pixels()[:].sort_values( + ["bin1_id", "bin2_id"] + ).reset_index(drop=True) + + assert len(seq_pixels) == len(par_pixels), ( + f"Pixel count mismatch: sequential={len(seq_pixels)}, parallel={len(par_pixels)}" + ) + testing.assert_array_equal( + seq_pixels["bin1_id"].values, par_pixels["bin1_id"].values + ) + testing.assert_array_equal( + seq_pixels["bin2_id"].values, par_pixels["bin2_id"].values + ) + testing.assert_allclose( + seq_pixels["count"].values, + par_pixels["count"].values, + rtol=1e-12, + err_msg="Parallel and sequential O/E values differ", + ) + + +# --------------------------------------------------------------------------- +# Tests for CLI (cooltools.sandbox.cli_obs_over_exp) +# --------------------------------------------------------------------------- + +class TestObsOverExpCLI: + """Tests for the standalone CLI tool.""" + + def test_cli_with_precomputed_expected(self, clr, cis_exp, trans_exp, cool_path, tmp_path): + """Test CLI with pre-computed cis and trans expected TSV files.""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + cis_exp_path = str(tmp_path / "cis_exp.tsv") + trans_exp_path = str(tmp_path / "trans_exp.tsv") + view_path = str(tmp_path / "view.bed") + out_path = str(tmp_path / "oe_cli.cool") + + cis_exp.to_csv(cis_exp_path, sep="\t", index=False) + trans_exp.to_csv(trans_exp_path, sep="\t", index=False) + view_df.to_csv(view_path, sep="\t", index=False, header=False) + + runner = CliRunner() + result = runner.invoke( + obs_over_exp, + [ + cool_path, + "--cis-expected", cis_exp_path, + "--trans-expected", trans_exp_path, + "--view", view_path, + "--expected-value-col", "balanced.avg", + "--clr-weight-name", clr_weight_name, + "--ignore-diags", str(ignore_diags), + "-o", out_path, + ], + ) + + assert result.exit_code == 0, f"CLI failed with: {result.output}" + oe_clr = cooler.Cooler(out_path) + assert len(oe_clr.pixels()) > 0 + + def test_cli_cis_only(self, clr, cis_exp, cool_path, tmp_path): + """Test CLI with cis-expected only and --no-trans flag.""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + cis_exp_path = str(tmp_path / "cis_exp.tsv") + view_path = str(tmp_path / "view.bed") + out_path = str(tmp_path / "oe_cis_cli.cool") + + cis_exp.to_csv(cis_exp_path, sep="\t", index=False) + view_df.to_csv(view_path, sep="\t", index=False, header=False) + + runner = CliRunner() + result = runner.invoke( + obs_over_exp, + [ + cool_path, + "--cis-expected", cis_exp_path, + "--no-trans", + "--view", view_path, + "--expected-value-col", "balanced.avg", + "-o", out_path, + ], + ) + + assert result.exit_code == 0, f"CLI failed with: {result.output}" + oe_clr = cooler.Cooler(out_path) + # Verify trans pixels have NaN O/E values (cis-only mode) + oe_pix = oe_clr.pixels()[:] + bins = oe_clr.bins()[:] + annotated = cooler.annotate(oe_pix, bins) + trans_mask = annotated["chrom1"] != annotated["chrom2"] + if trans_mask.sum() > 0: + trans_vals = oe_pix.loc[trans_mask, "count"] + assert np.all(np.isnan(trans_vals)), ( + "Cis-only O/E cooler has finite trans pixel values" + ) + + def test_cli_compute_on_fly(self, cool_path, tmp_path): + """Test CLI computing expected on the fly (no --cis-expected).""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + view_path = str(tmp_path / "view.bed") + out_path = str(tmp_path / "oe_auto_cli.cool") + + view_df.to_csv(view_path, sep="\t", index=False, header=False) + + runner = CliRunner() + result = runner.invoke( + obs_over_exp, + [ + cool_path, + "--view", view_path, + "--expected-value-col", "balanced.avg", + "-o", out_path, + "-p", "1", + ], + ) + + assert result.exit_code == 0, f"CLI failed with: {result.output}" + oe_clr = cooler.Cooler(out_path) + assert len(oe_clr.pixels()) > 0 + + def test_cli_smooth_auto_adjusts_col(self, cool_path, tmp_path): + """Test that --smooth auto-adjusts expected_value_col when computing on the fly.""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + view_path = str(tmp_path / "view.bed") + out_path = str(tmp_path / "oe_smooth_cli.cool") + + view_df.to_csv(view_path, sep="\t", index=False, header=False) + + runner = CliRunner() + result = runner.invoke( + obs_over_exp, + [ + cool_path, + "--view", view_path, + "--smooth", + "--aggregate-smoothed", + "--no-trans", + "-o", out_path, + "-p", "1", + ], + ) + + assert result.exit_code == 0, f"CLI failed with: {result.output}" + + def test_cli_help(self): + """Test that --help works.""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + runner = CliRunner() + result = runner.invoke(obs_over_exp, ["--help"]) + assert result.exit_code == 0 + assert "obs-over-exp" in result.output.lower() or "COOL_PATH" in result.output + + def test_cli_missing_output(self, cool_path): + """Test that CLI fails if -o is not provided.""" + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp + + runner = CliRunner() + result = runner.invoke( + obs_over_exp, + [cool_path], + ) + + assert result.exit_code != 0 + + +# --------------------------------------------------------------------------- +# Tests for consistency between API and CLI +# --------------------------------------------------------------------------- + +class TestAPIvsCLIConsistency: + """Verify that the Python API and the CLI produce identical results.""" + + def test_api_and_cli_produce_same_cooler(self, clr, cis_exp, cool_path, tmp_path): + """ + Run create_obs_over_exp_cooler via Python API and via CLI, then + compare the resulting coolers pixel-by-pixel. + """ + from cooltools.sandbox.obs_over_exp_cooler import create_obs_over_exp_cooler + from cooltools.sandbox.cli_obs_over_exp import obs_over_exp as cli_cmd + + # --- Python API --- + api_out = str(tmp_path / "oe_api.cool") + create_obs_over_exp_cooler( + clr, + api_out, + expected_cis_df=cis_exp, + expected_trans_df=False, + view_df=view_df, + expected_value_col="balanced.avg", + clr_weight_name=clr_weight_name, + ignore_diags=ignore_diags, + chunksize=chunksize, + ) + + # --- CLI --- + cis_exp_path = str(tmp_path / "cis_exp.tsv") + view_path = str(tmp_path / "view.bed") + cli_out = str(tmp_path / "oe_cli.cool") + + cis_exp.to_csv(cis_exp_path, sep="\t", index=False) + view_df.to_csv(view_path, sep="\t", index=False, header=False) + + runner = CliRunner() + result = runner.invoke( + cli_cmd, + [ + cool_path, + "--cis-expected", cis_exp_path, + "--no-trans", + "--view", view_path, + "--expected-value-col", "balanced.avg", + "--clr-weight-name", clr_weight_name, + "--ignore-diags", str(ignore_diags), + "-o", cli_out, + ], + ) + assert result.exit_code == 0, f"CLI failed: {result.output}" + + # --- Compare --- + api_clr = cooler.Cooler(api_out) + cli_clr = cooler.Cooler(cli_out) + + api_pixels = api_clr.pixels()[:].sort_values(["bin1_id", "bin2_id"]).reset_index(drop=True) + cli_pixels = cli_clr.pixels()[:].sort_values(["bin1_id", "bin2_id"]).reset_index(drop=True) + + assert len(api_pixels) == len(cli_pixels), ( + f"Pixel count mismatch: API={len(api_pixels)}, CLI={len(cli_pixels)}" + ) + testing.assert_array_equal(api_pixels["bin1_id"].values, cli_pixels["bin1_id"].values) + testing.assert_array_equal(api_pixels["bin2_id"].values, cli_pixels["bin2_id"].values) + testing.assert_allclose( + api_pixels["count"].values, + cli_pixels["count"].values, + rtol=1e-10, + err_msg="API and CLI produced different O/E values", + )