Merge pull request #14 from SaridakisStamatisChristos/codex/apply-one-shot-sota-patch

SaridakisStamatisChristos · web-flow · commit 7cd2d3729bcc · 2025-10-05T11:57:31.000+03:00
Add Sudoku canonicalization support
diff --git a/README.md b/README.md
@@ -42,6 +42,10 @@ sudoku-dlx solve --grid "<81chars>" --pretty --stats
 # Rate difficulty (0..10)
 sudoku-dlx rate  --grid "<81chars>"
 
+# Canonicalize (dedupe isomorphic puzzles)
+sudoku-dlx canon --grid "<81chars>"  # D4 × bands/stacks × inner row/col × digit relabel
+# Produces a stable 81-char key for deduping datasets (cells emitted box by box, not row-major).
+
 # Generate a unique puzzle (deterministic with seed)
 sudoku-dlx gen   --seed 123 --givens 30           # ~target clue count (approx)
 sudoku-dlx gen   --seed 123 --givens 30 --pretty
diff --git a/src/sudoku_dlx/__init__.py b/src/sudoku_dlx/__init__.py
@@ -10,6 +10,7 @@
     solve,
     to_string,
 )
+from .canonical import canonical_form
 from .generate import generate
 from .rating import rate
 from .solver import (
@@ -35,6 +36,7 @@
     "solve",
     "count_solutions",
     "rate",
+    "canonical_form",
     "generate",
     # Legacy exports
     "SOLVER",
diff --git a/src/sudoku_dlx/canonical.py b/src/sudoku_dlx/canonical.py
@@ -0,0 +1,208 @@
+"""
+State-of-the-art canonicalization for Sudoku puzzles.
+
+Maps isomorphic puzzles to a single 81-char canonical form using:
+  • Dihedral symmetries D4 (8 transforms)
+  • Band (row bands) and stack (column stacks) permutations (3! each)
+  • Row swaps within each band and column swaps within each stack (3! for each band/stack)
+  • Greedy digit relabeling (first-appearance maps to 1..9)
+
+Search space per grid: 8 × (3!)^8 = 13,436,928 leaf variants, cut down by prefix pruning.
+"""
+from __future__ import annotations
+from itertools import permutations
+from typing import List, Sequence, Tuple
+
+from .api import Grid
+
+# --------- Dihedral transforms over 9x9 grids (D4) ----------
+
+def _rot90(g: Grid) -> Grid:  # quarter turn clockwise: out[i][j] = g[8 - j][i]
+    return [[g[src][row] for src in reversed(range(9))] for row in range(9)]
+
+
+def _rot180(g: Grid) -> Grid:  # half turn: out[i][j] = g[8 - i][8 - j]
+    return [[g[src_r][src_c] for src_c in reversed(range(9))] for src_r in reversed(range(9))]
+
+
+def _rot270(g: Grid) -> Grid:  # quarter turn counter-clockwise: out[i][j] = g[j][8 - i]
+    return [[g[row][src] for row in range(9)] for src in reversed(range(9))]
+
+
+def _flip_h(g: Grid) -> Grid:
+    # Mirror left/right (reflection across the vertical axis): each row is read backwards.
+    return [[g[row][src] for src in reversed(range(9))] for row in range(9)]
+
+
+def _flip_v(g: Grid) -> Grid:
+    # Mirror top/bottom (reflection across the horizontal axis); rows are copied, not shared.
+    return [g[src][:] for src in reversed(range(9))]
+
+
+def _flip_main_diag(g: Grid) -> Grid:
+    # Transpose: output row i is input column i (reflection across the main diagonal).
+    return [[g[k][col] for k in range(9)] for col in range(9)]
+
+
+def _flip_anti_diag(g: Grid) -> Grid:
+    # Reflection across the anti-diagonal: out[i][j] = g[8 - j][8 - i].
+    return [[g[src_r][src_c] for src_r in reversed(range(9))] for src_c in reversed(range(9))]
+
+
+_TRANSFORMS = (  # the eight grid symmetries iterated by canonical_form
+    lambda x: x,  # identity: the grid is passed through as-is (no copy)
+    _rot90,
+    _rot180,
+    _rot270,
+    _flip_h,
+    _flip_v,
+    _flip_main_diag,
+    _flip_anti_diag,
+)
+
+# --------- Permutations for bands/stacks and inner rows/cols ----------
+
+_PERM3 = list(permutations(range(3)))  # all 6 orderings of (0, 1, 2)
+
+
+def _cell_char(value: int) -> str:
+    # Normalize one cell to a character: blanks (0, "0", "-") become ".";
+    # other strings pass through unchanged and any other value is str()-ed.
+    if isinstance(value, str):
+        return "." if value in {"0", "-"} else value
+    return "." if value == 0 else str(value)
+
+
+def _canonical_band_stack(
+    grid_chars: Sequence[Sequence[str]],
+    band_perm: Tuple[int, int, int],
+    stack_perm: Tuple[int, int, int],
+    best: str | None,
+) -> str | None:
+    """Return the smallest relabeled string for this band/stack order, else ``best``.
+
+    Boxes are visited in the permuted order and each box's cells are appended row by
+    row (box-major output); branches whose prefix exceeds ``best`` are pruned.
+    """
+    best_local = best
+    chosen_row_perms: dict[int, Tuple[int, int, int]] = {}
+    chosen_col_perms: dict[int, Tuple[int, int, int]] = {}
+    mapping: dict[str, str] = {}
+    out_chars: List[str] = []
+    next_digit = ord("1")
+    cmp_state = 0  # 0: output so far ties best_local; -1: already smaller (pruning off)
+
+    def rollback(inserted: List[str], saved_len: int, saved_next: int, saved_cmp: int) -> None:
+        nonlocal next_digit, cmp_state
+        del out_chars[saved_len:]
+        next_digit = saved_next
+        cmp_state = saved_cmp
+        for key in reversed(inserted):
+            mapping.pop(key, None)
+
+    def dfs(block_idx: int) -> None:
+        nonlocal best_local, next_digit, cmp_state
+        if block_idx == 9:
+            candidate = "".join(out_chars)
+            if best_local is None or candidate < best_local:
+                best_local = candidate
+            return
+
+        band_idx = block_idx // 3
+        stack_idx = block_idx % 3
+        band = band_perm[band_idx]
+        stack = stack_perm[stack_idx]
+
+        row_options = (
+            (chosen_row_perms[band],)
+            if band in chosen_row_perms
+            else _PERM3
+        )
+        col_options = (
+            (chosen_col_perms[stack],)
+            if stack in chosen_col_perms
+            else _PERM3
+        )
+
+        for row_perm in row_options:
+            assigned_row = False
+            if band not in chosen_row_perms:
+                chosen_row_perms[band] = row_perm
+                assigned_row = True
+            for col_perm in col_options:
+                assigned_col = False
+                if stack not in chosen_col_perms:
+                    chosen_col_perms[stack] = col_perm
+                    assigned_col = True
+
+                saved_len = len(out_chars)
+                saved_next = next_digit
+                saved_cmp = cmp_state
+                inserted: List[str] = []
+                pruned = False
+
+                for r_local in row_perm:
+                    row = grid_chars[band * 3 + r_local]
+                    for c_local in col_perm:
+                        ch = row[stack * 3 + c_local]
+                        if ch == ".":
+                            mapped = "."
+                        else:
+                            mapped = mapping.get(ch)
+                            if mapped is None:
+                                mapped = chr(next_digit)
+                                mapping[ch] = mapped
+                                inserted.append(ch)
+                                if next_digit < ord("9"):  # never hand out a label past "9"
+                                    next_digit += 1
+                        out_chars.append(mapped)
+                        if best_local is not None and cmp_state == 0:
+                            best_char = best_local[len(out_chars) - 1]
+                            if mapped > best_char:
+                                pruned = True  # this prefix can no longer beat best_local
+                                break
+                            if mapped < best_char:
+                                cmp_state = -1
+                    if pruned:
+                        break
+
+                if not pruned:
+                    dfs(block_idx + 1)
+
+                rollback(inserted, saved_len, saved_next, saved_cmp)
+
+                if assigned_col:
+                    chosen_col_perms.pop(stack, None)
+
+            if assigned_row:
+                chosen_row_perms.pop(band, None)
+
+    dfs(0)
+    return best_local
+
+
+# --------- Public API (full canon) ----------
+
+
+def canonical_form(grid: Grid) -> str:
+    """
+    Return a canonical 81-char key shared by all puzzles isomorphic to ``grid``.
+
+    It is the lexicographic minimum over the D4 transforms, band/stack orders and
+    in-band row / in-stack column orders, after first-appearance digit relabeling.
+    Cells are emitted box by box, so the key is not a row-major grid string.
+    """
+    winner: str | None = None
+    for transform in _TRANSFORMS:
+        cells = [[_cell_char(v) for v in line] for line in transform(grid)]
+        for bands, stacks in ((b, s) for b in _PERM3 for s in _PERM3):
+            found = _canonical_band_stack(cells, bands, stacks, winner)
+            if found is None:
+                continue
+            if winner is None or found < winner:
+                winner = found
+    assert winner is not None
+    return winner
+
+
+__all__ = ["canonical_form"]
diff --git a/src/sudoku_dlx/cli.py b/src/sudoku_dlx/cli.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 from .api import from_string, is_valid, solve, to_string
+from .canonical import canonical_form
 from .generate import generate
 from .rating import rate
 
@@ -62,6 +63,12 @@ def cmd_gen(ns: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_canon(ns: argparse.Namespace) -> int:
+    # Read the puzzle via _read_grid_arg, print its canonical key, and report success.
+    print(canonical_form(from_string(_read_grid_arg(ns))))
+    return 0
+
+
 def main(argv: Optional[list[str]] = None) -> int:
     parser = argparse.ArgumentParser(
         prog="sudoku-dlx",
@@ -98,6 +105,16 @@ def main(argv: Optional[list[str]] = None) -> int:
     gen_parser.add_argument("--pretty", action="store_true")
     gen_parser.set_defaults(func=cmd_gen)
 
+    canon_parser = sub.add_parser(
+        "canon",
+        help=(
+            "print canonical 81-char form (D4 × bands/stacks × inner row/col swaps × digit relabel)"
+        ),
+    )
+    canon_parser.add_argument("--grid", help="81-char string; 0/./- for blanks")
+    canon_parser.add_argument("--file", help="path to a file with 9 lines of 9 chars")
+    canon_parser.set_defaults(func=cmd_canon)
+
     args = parser.parse_args(argv)
     if not hasattr(args, "func"):
         parser.print_help()
diff --git a/tests/test_canonical.py b/tests/test_canonical.py
@@ -0,0 +1,104 @@
+from textwrap import dedent
+
+from sudoku_dlx import from_string, to_string, canonical_form
+
+BASE = dedent(  # reference puzzle for every test; collapses to one 81-char row-major string
+    """
+    53..7....
+    6..195...
+    .98....6.
+    8...6...3
+    4..8.3..1
+    7...2...6
+    .6....28.
+    ...419..5
+    ....8..79
+    """
+).strip().replace("\n", "")
+
+
+def rot90_string(s: str) -> str:
+    """Rotate an 81-char row-major grid string a quarter turn clockwise."""
+    rows = [s[start : start + 9] for start in range(0, 81, 9)]
+    turned = []
+    for col in range(9):
+        turned.append("".join(rows[src][col] for src in reversed(range(9))))
+    return "".join(turned)
+
+
+def relabel_123_to_456(s: str) -> str:
+    """Swap digits 1<->4, 2<->5 and 3<->6, leaving every other character untouched."""
+    return s.translate(str.maketrans("123456", "456123"))
+
+
+def swap_bands_string(s: str, order=(1, 0, 2)) -> str:
+    """Reorder the row bands of an 81-char grid; order[i] is the source band for slot i."""
+    lines = [s[start : start + 9] for start in range(0, 81, 9)]
+    # Each band is three consecutive rows; emit them in the requested band order.
+    picked = (lines[band * 3 : band * 3 + 3] for band in order)
+    return "".join(row for triple in picked for row in triple)
+
+
+def swap_stacks_string(s: str, order=(2, 1, 0)) -> str:
+    """Reorder the column stacks of an 81-char grid; order[i] is the source stack for slot i."""
+    lines = [s[start : start + 9] for start in range(0, 81, 9)]
+    # Cut every row into its three 3-char stack segments, then re-emit them in `order`.
+    split = [[line[k : k + 3] for k in (0, 3, 6)] for line in lines]
+    return "".join(parts[idx] for parts in split for idx in order)
+
+
+def swap_rows_in_band_string(s: str, band=1, perm=(2, 0, 1)) -> str:
+    """Permute the three rows of one band; perm[i] is the source row for slot i."""
+    lines = [s[start : start + 9] for start in range(0, 81, 9)]
+    first = 3 * band
+    # The right-hand side is built from the untouched rows before the slice is replaced.
+    lines[first : first + 3] = [lines[first : first + 3][i] for i in perm]
+    return "".join(lines)
+
+
+def swap_cols_in_stack_string(s: str, stack=0, perm=(1, 2, 0)) -> str:
+    """Permute the three columns of one stack; perm[i] is the source column for slot i."""
+    lo, out = 3 * stack, []
+    for line in (s[k : k + 9] for k in range(0, 81, 9)):
+        seg = line[lo : lo + 3]
+        out.append(line[:lo] + "".join(seg[i] for i in perm) + line[lo + 3 :])
+    return "".join(out)
+
+
+def test_canonical_equal_under_rotation():
+    """A quarter-turn rotation must not change the canonical key."""
+    original = canonical_form(from_string(BASE))
+    rotated = canonical_form(from_string(rot90_string(BASE)))
+    # Rotation is one of the D4 symmetries the canonicalizer searches over.
+    assert original == rotated
+
+
+def test_canonical_equal_under_digit_relabel():
+    """Renaming digits (1<->4, 2<->5, 3<->6) must not change the canonical key."""
+    relabeled = from_string(relabel_123_to_456(BASE))
+    # Compare against the key of the untouched puzzle.
+    assert canonical_form(from_string(BASE)) == canonical_form(relabeled)
+
+
+def test_canonical_is_81_chars_and_uses_dots():
+    """The key is 81 characters drawn only from the digits 1-9 and "."."""
+    key = canonical_form(from_string(BASE))
+    assert len(key) == 81 and all(ch in "123456789." for ch in key)
+
+
+def test_canonical_equal_under_band_and_stack_swaps():
+    """Reordering whole bands or whole stacks must not change the canonical key."""
+    reference = canonical_form(from_string(BASE))
+    banded = swap_bands_string(BASE, order=(1, 0, 2))
+    stacked = swap_stacks_string(BASE, order=(2, 1, 0))
+    for variant in (banded, stacked):
+        assert canonical_form(from_string(variant)) == reference
+
+
+def test_canonical_equal_under_inner_row_col_swaps():
+    """Permuting rows inside a band or columns inside a stack keeps the key unchanged."""
+    reference = canonical_form(from_string(BASE))
+    row_variant = swap_rows_in_band_string(BASE, band=2, perm=(1, 2, 0))
+    col_variant = swap_cols_in_stack_string(BASE, stack=1, perm=(2, 0, 1))
+    for variant in (row_variant, col_variant):
+        assert canonical_form(from_string(variant)) == reference