Skip to content

Commit d6810fa

Browse files
author
Matt Davis
committed
[data/quant/iron_extensions] Migrate remaining public-safe modules
Three more subsystems promoted to the public bionpu package: src/bionpu/data/: - canonical_sites.py (276 LOC): Cas-OFFinder TSV normaliser. Now the canonical home; bionpu.verify.crispr imports from here. Adds serialize_canonical() helper so verify can compute SHA-256 over the canonical wire form without touching the filesystem. - fetchers/{doench_2016,guide_seq,pod5_hg002,reference_genomes}.py + fetchers/__init__.py (495 LOC framework): per-dataset public fetchers with SHA-pinning, framework documents the requirements every dataset entry must satisfy. - load_smoke.py (113 LOC): in-repo smoke fixture loaders. src/bionpu/quant/: - calibrate.py (215 LOC): ONNX quantization calibration driver (thin wrapper around onnxruntime.quantization). - passport.py (196 LOC): quantization passport — every quantized model in the repo carries one (calibration source, op recipe, reproducibility hash). - peano_export.py (102 LOC): quantized ONNX -> MLIR-AIE -> xclbin lowering hook. src/bionpu/iron_extensions/: - cascade_stream.py (387 LOC): cascade-chain IRON helper. Largely superseded by mlir-aie's CascadeFifo (Xilinx/mlir-aie#3039) but still consumed by 5 genetics files. Internal bits NOT migrated (kept in genetics-private): - bionpu/report/* — gaps-yaml aggregator + writeup pipeline; tightly coupled to internal task-tracking format. Refactor: - src/bionpu/verify/_crispr_canonical.py removed; bionpu.verify.crispr now imports from bionpu.data.canonical_sites (the public canonical home). License: Apache-2.0 + LLVM exception → GPL-3.0 across all migrated files. Project-internal task IDs / outer-repo paths scrubbed. All 18 verify-harness tests still pass; 71/71 Python files in the public package parse.
1 parent dac7377 commit d6810fa

19 files changed

Lines changed: 3041 additions & 51 deletions

conftest.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""pytest root conftest — wire `import bionpu` to the public bionpu-public/ tree.
2+
3+
The internal ``genetics/bionpu/`` package has been promoted to a separate
4+
public repository at https://github.com/opensensor/bionpu (cloned at
5+
``./bionpu-public/``). This conftest puts ``bionpu-public/src`` on
6+
sys.path so ``import bionpu`` resolves to the public package during
7+
tests, while ``import bionpu_internal`` continues to resolve to this
8+
repo's private extensions (currently just ``bionpu_internal.report``).
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import sys
14+
from pathlib import Path
15+
16+
_HERE = Path(__file__).resolve().parent
17+
_BIONPU_PUBLIC_SRC = _HERE / "bionpu-public" / "src"
18+
19+
if _BIONPU_PUBLIC_SRC.is_dir() and str(_BIONPU_PUBLIC_SRC) not in sys.path:
20+
# Insert FIRST so it wins over any stale install on the active env.
21+
sys.path.insert(0, str(_BIONPU_PUBLIC_SRC))

src/bionpu/data/__init__.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,66 @@
1-
"""bionpu.data: shell — populated during the v0.1 extraction."""
1+
# bionpu — AIE2P-accelerated genomics with reference-equivalence verification.
2+
# Copyright (C) 2026 OpenSensor / Matt Davis <matt@opensensor.io>
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU General Public License as published by
6+
# the Free Software Foundation, version 3.
7+
#
8+
# This program is distributed in the hope that it will be useful,
9+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
# GNU General Public License for more details.
12+
#
13+
# You should have received a copy of the GNU General Public License
14+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
15+
16+
"""Public dataset loaders + fetchers.
17+
18+
* `bionpu.data.load_smoke` ships in-repo smoke fixture loaders (no `data_cache/` dependency).
19+
Re-exported here for ergonomics.
20+
* `bionpu.data.fetchers` ships the public dataset fetcher framework (HG002 POD5, GRCh38 /
21+
T2T-CHM13 / GRCm39 reference genomes, Doench 2016 guide-activity, and
22+
GUIDE-seq off-target sets) with checksum verification, license /
23+
citation in code, and a `data_cache/MANIFEST.md` writer.
24+
"""
25+
26+
from bionpu.data.fetchers import (
27+
REGISTRY,
28+
ChecksumMismatchError,
29+
DatasetSpec,
30+
Fetcher,
31+
FetcherError,
32+
FetcherLockError,
33+
FetcherNetworkError,
34+
default_cache_root,
35+
fetch,
36+
register,
37+
)
38+
from bionpu.data.load_smoke import (
39+
PLANTED_GUIDES,
40+
PLANTED_POSITIONS,
41+
POD5_UNAVAILABLE_MESSAGE,
42+
load_smoke_genome,
43+
load_smoke_pod5,
44+
smoke_fixtures_root,
45+
)
46+
47+
__all__ = [
48+
# smoke loaders
49+
"PLANTED_GUIDES",
50+
"PLANTED_POSITIONS",
51+
"POD5_UNAVAILABLE_MESSAGE",
52+
"load_smoke_genome",
53+
"load_smoke_pod5",
54+
"smoke_fixtures_root",
55+
# fetchers
56+
"REGISTRY",
57+
"ChecksumMismatchError",
58+
"DatasetSpec",
59+
"Fetcher",
60+
"FetcherError",
61+
"FetcherLockError",
62+
"FetcherNetworkError",
63+
"default_cache_root",
64+
"fetch",
65+
"register",
66+
]
Lines changed: 56 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,31 @@
1313
# You should have received a copy of the GNU General Public License
1414
# along with this program. If not, see <https://www.gnu.org/licenses/>.
1515

16-
"""Canonical Cas-OFFinder TSV normalizer.
16+
"""Canonical Cas-OFFinder normalizer.
1717
18-
Internal helper for :mod:`bionpu.verify.crispr`. Cas-OFFinder's row order
19-
for sites at identical mismatch counts is implementation-defined: the
20-
GPU/OpenCL backends can produce different row orderings of the same
21-
match set. Byte-equality is therefore asserted against a *normalized*
22-
canonical TSV produced by this module, with the sort key:
18+
Cas-OFFinder's row order for sites at identical mismatch counts is
19+
implementation-defined: GPU runs of the same input on the same machine can
20+
produce different row orderings (the row *set* is invariant; the order is
21+
not). PRD §3.2 byte-equality is therefore asserted against a *normalized*
22+
canonical TSV produced by this module.
23+
24+
The sort key is load-bearing for byte-equality:
2325
2426
(chrom, start, mismatch_count, guide_id, strand)
2527
26-
The TSV writer emits LF line endings and a single trailing newline so
28+
where:
29+
- ``chrom`` is the contig name as emitted by Cas-OFFinder (e.g. ``"chr22"``).
30+
- ``start`` is the 0-based site position (Cas-OFFinder calls this Location).
31+
- ``mismatch_count`` is Cas-OFFinder's Mismatches column.
32+
- ``guide_id`` is the stable ID assigned by FIXTURE-A — for v3 outputs we
33+
use Cas-OFFinder's leading Id column; for legacy outputs we fall back to
34+
the crRNA sequence as the ID surrogate (a stable string either way).
35+
- ``strand`` is ``"+"`` or ``"-"`` (Cas-OFFinder's Direction column).
36+
37+
Sort is stable — calling ``normalize`` twice produces the same output (and
38+
applying it to an already-normalized list is a no-op).
39+
40+
The TSV writer emits LF line endings and a single trailing newline so that
2741
byte-equality holds independent of the producer's line-ending choice.
2842
"""
2943

@@ -38,17 +52,16 @@
3852
"normalize",
3953
"normalize_file",
4054
"parse_tsv",
55+
"serialize_canonical",
4156
"write_tsv",
42-
"CANONICAL_HEADER",
4357
]
4458

45-
4659
_VALID_STRANDS = frozenset({"+", "-"})
4760

48-
# The canonical normalized TSV column header. Picking a single fixed
49-
# layout means byte-equality holds across producers (Cas-OFFinder v3,
50-
# legacy Cas-OFFinder, NumPy oracle, NPU runner).
51-
CANONICAL_HEADER: tuple[str, ...] = (
61+
# Column header for the canonical normalized TSV. We pick a single fixed
62+
# layout so byte-equality holds across producers (Cas-OFFinder v3, legacy
63+
# Cas-OFFinder, NumPy oracle).
64+
_HEADER = (
5265
"guide_id",
5366
"bulge_type",
5467
"crrna",
@@ -60,14 +73,13 @@
6073
"bulge_size",
6174
)
6275

63-
6476
@dataclass(frozen=True, slots=True)
6577
class CasOFFinderRow:
6678
"""One Cas-OFFinder match row, normalized into a fixed schema.
6779
68-
Field names match the canonical TSV header. ``bulge_type`` is ``"X"``
69-
for no-bulge runs (the typical case); other values appear only when
70-
DNA or RNA bulges are enabled.
80+
Field names match the canonical TSV header. ``bulge_type`` is ``"X"`` for
81+
no-bulge runs (the FIXTURE-A regime); other values appear only when DNA
82+
or RNA bulges are enabled, which FIXTURE-A forbids.
7183
"""
7284

7385
guide_id: str
@@ -83,45 +95,43 @@ class CasOFFinderRow:
8395
def sort_key(self) -> tuple[str, int, int, str, str]:
8496
if self.strand not in _VALID_STRANDS:
8597
raise ValueError(
86-
f"unknown strand {self.strand!r}; "
87-
f"expected one of {sorted(_VALID_STRANDS)}"
98+
f"unknown strand {self.strand!r}; expected one of {sorted(_VALID_STRANDS)}"
8899
)
89100
return (self.chrom, self.start, self.mismatches, self.guide_id, self.strand)
90101

91-
92102
def normalize(rows: Iterable[CasOFFinderRow]) -> list[CasOFFinderRow]:
93103
"""Return rows sorted by the documented canonical key.
94104
95105
Idempotent: ``normalize(normalize(rows)) == normalize(rows)``.
96106
Independent of input order: sorting is stable and total over the key.
97107
"""
98108
materialized = list(rows)
109+
# Validate strands eagerly so a bad row fails the call rather than
110+
# silently being placed at an arbitrary position.
99111
for r in materialized:
100112
if r.strand not in _VALID_STRANDS:
101113
raise ValueError(
102-
f"unknown strand {r.strand!r}; "
103-
f"expected one of {sorted(_VALID_STRANDS)}"
114+
f"unknown strand {r.strand!r}; expected one of {sorted(_VALID_STRANDS)}"
104115
)
105116
return sorted(materialized, key=CasOFFinderRow.sort_key)
106117

107-
108118
def parse_tsv(path: Path) -> list[CasOFFinderRow]:
109-
"""Parse a Cas-OFFinder TSV into ``CasOFFinderRow`` objects.
119+
"""Parse a Cas-OFFinder TSV (v3 or legacy) into ``CasOFFinderRow`` objects.
110120
111121
Header detection rules:
112122
- Lines starting with ``##`` are skipped (v3 generator banner).
113123
- A line starting with ``#`` is treated as the column header.
114124
- Otherwise the file is assumed to have no header (legacy form).
115-
- The canonical normalized TSV (this module's own output) starts
116-
with ``guide_id\\t...`` and is recognized too.
125+
- The canonical normalized TSV (this module's own output) starts with
126+
``guide_id\\t...`` — that is recognized too.
117127
118128
Column mappings (case-insensitive on header tokens):
119129
120-
- v3: Id, Bulge Type, crRNA, DNA, Chromosome, Location,
121-
Direction, Mismatches, Bulge Size
122-
- legacy: crRNA, Chromosome, Position, DNA, Direction, Mismatches
123-
- canonical: guide_id, bulge_type, crrna, dna, chrom, start,
124-
strand, mismatches, bulge_size
130+
v3: Id, Bulge Type, crRNA, DNA, Chromosome, Location, Direction,
131+
Mismatches, Bulge Size
132+
legacy: crRNA, Chromosome, Position, DNA, Direction, Mismatches
133+
canonical: guide_id, bulge_type, crrna, dna, chrom, start, strand,
134+
mismatches, bulge_size
125135
"""
126136
path = Path(path)
127137
raw_lines = path.read_text().splitlines()
@@ -134,9 +144,11 @@ def parse_tsv(path: Path) -> list[CasOFFinderRow]:
134144
if line.startswith("##"):
135145
continue
136146
if line.startswith("#"):
147+
# Column header.
137148
header = line.lstrip("#").split("\t")
138149
continue
139150
if header is None and line.split("\t")[0].lower() == "guide_id":
151+
# Canonical normalized header (no leading '#').
140152
header = line.split("\t")
141153
continue
142154
data_lines.append(line)
@@ -193,13 +205,12 @@ def has(*names: str) -> bool:
193205
)
194206
return rows
195207

196-
legacy_cols = ("crrna", "chromosome", "position", "dna", "direction", "mismatches")
197-
if has(*legacy_cols):
208+
if has("crrna", "chromosome", "position", "dna", "direction", "mismatches"):
198209
for line in data_lines:
199210
cols = line.split("\t")
200211
rows.append(
201212
CasOFFinderRow(
202-
guide_id=cols[idx["crrna"]], # crRNA stands in for guide_id
213+
guide_id=cols[idx["crrna"]], # crRNA stands in for guide_id in legacy
203214
bulge_type="X",
204215
crrna=cols[idx["crrna"]],
205216
dna=cols[idx["dna"]],
@@ -234,20 +245,18 @@ def has(*names: str) -> bool:
234245
)
235246
else:
236247
raise ValueError(
237-
f"unrecognized Cas-OFFinder TSV row "
238-
f"(no header, {len(cols)} cols): {line!r}"
248+
f"unrecognized Cas-OFFinder TSV row (no header, {len(cols)} cols): {line!r}"
239249
)
240250
return rows
241251

242-
243252
def write_tsv(path: Path, rows: Iterable[CasOFFinderRow]) -> None:
244-
"""Write rows to ``path`` in canonical schema with LF newlines.
253+
"""Write rows to ``path`` using the canonical schema with LF newlines.
245254
246255
The resulting file is independent of producer line-ending choices,
247-
which is what makes byte-equality robust across platforms.
256+
which matters for byte-equality across platforms.
248257
"""
249258
path = Path(path)
250-
parts: list[str] = ["\t".join(CANONICAL_HEADER)]
259+
parts: list[str] = ["\t".join(_HEADER)]
251260
for r in rows:
252261
parts.append(
253262
"\t".join(
@@ -267,12 +276,11 @@ def write_tsv(path: Path, rows: Iterable[CasOFFinderRow]) -> None:
267276
blob = "\n".join(parts) + "\n"
268277
path.write_bytes(blob.encode("utf-8"))
269278

270-
271279
def normalize_file(input_tsv: Path, output_tsv: Path) -> None:
272-
"""Read a Cas-OFFinder TSV, normalize, write to ``output_tsv``.
280+
"""Read a Cas-OFFinder TSV (v3, legacy, or canonical), normalize, write.
273281
274-
The output TSV is byte-stable: re-running ``normalize_file`` on the
275-
output produces a byte-identical file.
282+
The output TSV is byte-stable: ``normalize_file(out, out2)`` produces
283+
``out2`` byte-identical to ``out`` for any already-normalized ``out``.
276284
"""
277285
rows = parse_tsv(Path(input_tsv))
278286
write_tsv(Path(output_tsv), normalize(rows))
@@ -281,11 +289,11 @@ def normalize_file(input_tsv: Path, output_tsv: Path) -> None:
281289
def serialize_canonical(rows: Iterable[CasOFFinderRow]) -> bytes:
282290
"""Return the canonical TSV byte representation of ``rows``.
283291
284-
Equivalent to writing with :func:`write_tsv` and reading the result;
285-
used by the comparator to compute a SHA-256 without touching the
286-
filesystem.
292+
Equivalent to writing with :func:`write_tsv` and reading the result
293+
back as bytes; used by :mod:`bionpu.verify.crispr` to compute a
294+
SHA-256 over the canonical form without touching the filesystem.
287295
"""
288-
parts: list[str] = ["\t".join(CANONICAL_HEADER)]
296+
parts: list[str] = ["\t".join(_HEADER)]
289297
for r in rows:
290298
parts.append(
291299
"\t".join(

0 commit comments

Comments
 (0)