1313# You should have received a copy of the GNU General Public License
1414# along with this program. If not, see <https://www.gnu.org/licenses/>.
1515
16- """Canonical Cas-OFFinder TSV normalizer.
16+ """Canonical Cas-OFFinder normalizer.
1717
18- Internal helper for :mod:`bionpu.verify.crispr`. Cas-OFFinder's row order
19- for sites at identical mismatch counts is implementation-defined: the
20- GPU/OpenCL backends can produce different row orderings of the same
21- match set. Byte-equality is therefore asserted against a *normalized*
22- canonical TSV produced by this module, with the sort key:
18+ Cas-OFFinder's row order for sites at identical mismatch counts is
19+ implementation-defined: GPU runs of the same input on the same machine can
20+ produce different row orderings (the row *set* is invariant; the order is
21+ not). PRD §3.2 byte-equality is therefore asserted against a *normalized*
22+ canonical TSV produced by this module.
23+
24+ The sort key is load-bearing for byte-equality:
2325
2426 (chrom, start, mismatch_count, guide_id, strand)
2527
26- The TSV writer emits LF line endings and a single trailing newline so
28+ where:
29+ - ``chrom`` is the contig name as emitted by Cas-OFFinder (e.g. ``"chr22"``).
30+ - ``start`` is the 0-based site position (Cas-OFFinder calls this Location).
31+ - ``mismatch_count`` is Cas-OFFinder's Mismatches column.
32+ - ``guide_id`` is the stable ID assigned by FIXTURE-A — for v3 outputs we
33+ use Cas-OFFinder's leading Id column; for legacy outputs we fall back to
34+ the crRNA sequence as the ID surrogate (a stable string either way).
35+ - ``strand`` is ``"+"`` or ``"-"`` (Cas-OFFinder's Direction column).
36+
37+ Sort is stable and idempotent — calling ``normalize`` twice produces the same
38+ output (and applying it to an already-normalized list is a no-op).
39+
40+ The TSV writer emits LF line endings and a single trailing newline so that
2741byte-equality holds independent of the producer's line-ending choice.
2842"""
2943
3852 "normalize" ,
3953 "normalize_file" ,
4054 "parse_tsv" ,
55+ "serialize_canonical" ,
4156 "write_tsv" ,
42- "CANONICAL_HEADER" ,
4357]
4458
45-
4659_VALID_STRANDS = frozenset ({"+" , "-" })
4760
48- # The canonical normalized TSV column header. Picking a single fixed
49- # layout means byte-equality holds across producers (Cas-OFFinder v3,
50- # legacy Cas-OFFinder, NumPy oracle, NPU runner ).
51- CANONICAL_HEADER : tuple [ str , ...] = (
61+ # Column header for the canonical normalized TSV. We pick a single fixed
62+ # layout so byte-equality holds across producers (Cas-OFFinder v3, legacy
63+ # Cas-OFFinder, NumPy oracle).
64+ _HEADER = (
5265 "guide_id" ,
5366 "bulge_type" ,
5467 "crrna" ,
6073 "bulge_size" ,
6174)
6275
63-
6476@dataclass (frozen = True , slots = True )
6577class CasOFFinderRow :
6678 """One Cas-OFFinder match row, normalized into a fixed schema.
6779
68- Field names match the canonical TSV header. ``bulge_type`` is ``"X"``
69- for no-bulge runs (the typical case ); other values appear only when
70- DNA or RNA bulges are enabled.
80+ Field names match the canonical TSV header. ``bulge_type`` is ``"X"`` for
81+ no-bulge runs (the FIXTURE-A regime ); other values appear only when DNA
82+ or RNA bulges are enabled, which FIXTURE-A forbids .
7183 """
7284
7385 guide_id : str
@@ -83,45 +95,43 @@ class CasOFFinderRow:
8395 def sort_key (self ) -> tuple [str , int , int , str , str ]:
8496 if self .strand not in _VALID_STRANDS :
8597 raise ValueError (
86- f"unknown strand { self .strand !r} ; "
87- f"expected one of { sorted (_VALID_STRANDS )} "
98+ f"unknown strand { self .strand !r} ; expected one of { sorted (_VALID_STRANDS )} "
8899 )
89100 return (self .chrom , self .start , self .mismatches , self .guide_id , self .strand )
90101
91-
92102def normalize (rows : Iterable [CasOFFinderRow ]) -> list [CasOFFinderRow ]:
93103 """Return rows sorted by the documented canonical key.
94104
95105 Idempotent: ``normalize(normalize(rows)) == normalize(rows)``.
96106 Independent of input order: sorting is stable and total over the key.
97107 """
98108 materialized = list (rows )
109+ # Validate strands eagerly so a bad row fails the call rather than
110+ # silently being placed at an arbitrary position.
99111 for r in materialized :
100112 if r .strand not in _VALID_STRANDS :
101113 raise ValueError (
102- f"unknown strand { r .strand !r} ; "
103- f"expected one of { sorted (_VALID_STRANDS )} "
114+ f"unknown strand { r .strand !r} ; expected one of { sorted (_VALID_STRANDS )} "
104115 )
105116 return sorted (materialized , key = CasOFFinderRow .sort_key )
106117
107-
108118def parse_tsv (path : Path ) -> list [CasOFFinderRow ]:
109- """Parse a Cas-OFFinder TSV into ``CasOFFinderRow`` objects.
119+ """Parse a Cas-OFFinder TSV (v3 or legacy) into ``CasOFFinderRow`` objects.
110120
111121 Header detection rules:
112122 - Lines starting with ``##`` are skipped (v3 generator banner).
113123 - A line starting with ``#`` is treated as the column header.
114124 - Otherwise the file is assumed to have no header (legacy form).
115- - The canonical normalized TSV (this module's own output) starts
116- with ``guide_id\\ t...`` and is recognized too.
125+ - The canonical normalized TSV (this module's own output) starts with
126+ ``guide_id\\t...`` — that is recognized too.
117127
118128 Column mappings (case-insensitive on header tokens):
119129
120- - v3: Id, Bulge Type, crRNA, DNA, Chromosome, Location,
121- Direction, Mismatches, Bulge Size
122- - legacy: crRNA, Chromosome, Position, DNA, Direction, Mismatches
123- - canonical: guide_id, bulge_type, crrna, dna, chrom, start,
124- strand, mismatches, bulge_size
130+ v3: Id, Bulge Type, crRNA, DNA, Chromosome, Location, Direction ,
131+ Mismatches, Bulge Size
132+ legacy: crRNA, Chromosome, Position, DNA, Direction, Mismatches
133+ canonical: guide_id, bulge_type, crrna, dna, chrom, start, strand ,
134+ mismatches, bulge_size
125135 """
126136 path = Path (path )
127137 raw_lines = path .read_text ().splitlines ()
@@ -134,9 +144,11 @@ def parse_tsv(path: Path) -> list[CasOFFinderRow]:
134144 if line .startswith ("##" ):
135145 continue
136146 if line .startswith ("#" ):
147+ # Column header.
137148 header = line .lstrip ("#" ).split ("\t " )
138149 continue
139150 if header is None and line .split ("\t " )[0 ].lower () == "guide_id" :
151+ # Canonical normalized header (no leading '#').
140152 header = line .split ("\t " )
141153 continue
142154 data_lines .append (line )
@@ -193,13 +205,12 @@ def has(*names: str) -> bool:
193205 )
194206 return rows
195207
196- legacy_cols = ("crrna" , "chromosome" , "position" , "dna" , "direction" , "mismatches" )
197- if has (* legacy_cols ):
208+ if has ("crrna" , "chromosome" , "position" , "dna" , "direction" , "mismatches" ):
198209 for line in data_lines :
199210 cols = line .split ("\t " )
200211 rows .append (
201212 CasOFFinderRow (
202- guide_id = cols [idx ["crrna" ]], # crRNA stands in for guide_id
213+ guide_id = cols [idx ["crrna" ]], # crRNA stands in for guide_id in legacy
203214 bulge_type = "X" ,
204215 crrna = cols [idx ["crrna" ]],
205216 dna = cols [idx ["dna" ]],
@@ -234,20 +245,18 @@ def has(*names: str) -> bool:
234245 )
235246 else :
236247 raise ValueError (
237- f"unrecognized Cas-OFFinder TSV row "
238- f"(no header, { len (cols )} cols): { line !r} "
248+ f"unrecognized Cas-OFFinder TSV row (no header, { len (cols )} cols): { line !r} "
239249 )
240250 return rows
241251
242-
243252def write_tsv (path : Path , rows : Iterable [CasOFFinderRow ]) -> None :
244- """Write rows to ``path`` in canonical schema with LF newlines.
253+ """Write rows to ``path`` using the canonical schema with LF newlines.
245254
246255 The resulting file is independent of producer line-ending choices,
247- which is what makes byte-equality robust across platforms.
256+ which matters for byte-equality across platforms.
248257 """
249258 path = Path (path )
250- parts : list [str ] = ["\t " .join (CANONICAL_HEADER )]
259+ parts : list [str ] = ["\t " .join (_HEADER )]
251260 for r in rows :
252261 parts .append (
253262 "\t " .join (
@@ -267,12 +276,11 @@ def write_tsv(path: Path, rows: Iterable[CasOFFinderRow]) -> None:
267276 blob = "\n " .join (parts ) + "\n "
268277 path .write_bytes (blob .encode ("utf-8" ))
269278
270-
271279def normalize_file (input_tsv : Path , output_tsv : Path ) -> None :
272- """Read a Cas-OFFinder TSV, normalize, write to ``output_tsv`` .
280+ """Read a Cas-OFFinder TSV (v3, legacy, or canonical), normalize, write .
273281
274- The output TSV is byte-stable: re-running ``normalize_file`` on the
275- output produces a byte-identical file .
282+ The output TSV is byte-stable: ``normalize_file(out, out2)`` produces
283+ ``out2`` byte-identical to ``out`` for any already-normalized ``out``.
276284 """
277285 rows = parse_tsv (Path (input_tsv ))
278286 write_tsv (Path (output_tsv ), normalize (rows ))
@@ -281,11 +289,11 @@ def normalize_file(input_tsv: Path, output_tsv: Path) -> None:
281289def serialize_canonical (rows : Iterable [CasOFFinderRow ]) -> bytes :
282290 """Return the canonical TSV byte representation of ``rows``.
283291
284- Equivalent to writing with :func:`write_tsv` and reading the result;
285- used by the comparator to compute a SHA-256 without touching the
286- filesystem.
292+ Equivalent to writing with :func:`write_tsv` and reading the result
293+ back as bytes; used by :mod:`bionpu.verify.crispr` to compute a
294+ SHA-256 over the canonical form without touching the filesystem.
287295 """
288- parts : list [str ] = ["\t " .join (CANONICAL_HEADER )]
296+ parts : list [str ] = ["\t " .join (_HEADER )]
289297 for r in rows :
290298 parts .append (
291299 "\t " .join (
0 commit comments