
Commit ee181b3

Add CLI tools for format conversion, CNF export, and batch explain
1 parent 2d66152 commit ee181b3

8 files changed: 279 additions & 4 deletions

README.md

Lines changed: 16 additions & 0 deletions
@@ -109,6 +109,22 @@ sudoku-dlx solve --grid "<81chars>" --trace out.json
 # Open web/visualizer.html and load out.json
 ```
 
+## Formats & batch explain
+Convert between txt/csv/jsonl:
+```bash
+sudoku-dlx convert --in puzzles.txt --out puzzles.csv
+```
+
+Explain many puzzles to NDJSON:
+```bash
+sudoku-dlx explain-file --in puzzles.txt --out steps.ndjson --max-steps 200
+```
+
+Export to DIMACS CNF:
+```bash
+sudoku-dlx to-cnf --grid "<81chars>" --out puzzle.cnf
+```
+
 ## Cross-check with SAT (optional)
 Install the optional extra:
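The NDJSON written by `explain-file` is meant to be streamed a line at a time. A minimal consumer sketch in Python (output file name as in the example above; per the tests added in this commit, each object carries at least "grid", "steps", and "progress", while the exact step schema comes from `explain`):

```python
import json

with open("steps.ndjson", encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            continue
        record = json.loads(line)
        # "grid" is the 81-char input; "steps"/"progress" describe the explanation.
        print(record["grid"][:12], record["progress"])
```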

docs/batch.md

Lines changed: 18 additions & 0 deletions
@@ -21,3 +21,21 @@ sudoku-dlx stats-file --in puzzles.txt --limit 5000 --sample 1000 --json stats.json
 ```bash
 sudoku-dlx dedupe --in puzzles.txt --out unique.txt
 ```
+
+## Convert between formats
+Supported: txt (one 81-char grid per line), csv (a `grid` column), jsonl/ndjson (one {"grid": "..."} object per line).
+```bash
+sudoku-dlx convert --in puzzles.txt --out puzzles.csv
+sudoku-dlx convert --in puzzles.csv --out puzzles.jsonl
+```
+
+## Batch explain
+Produce one JSON object per line with steps and progress:
+```bash
+sudoku-dlx explain-file --in puzzles.txt --out steps.ndjson --max-steps 200
+```
+
+## Export to CNF
+```bash
+sudoku-dlx to-cnf --grid "<81chars>" --out puzzle.cnf
+```
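All three formats carry the same 81-character grid strings, so conversion is lossless. A small sketch using the new `formats` helpers directly (assumes the `sudoku_dlx` package is importable; the puzzle is the one used in this commit's tests):

```python
from sudoku_dlx.formats import read_grids, write_grids

PUZ = (
    "53..7...."
    "6..195..."
    ".98....6."
    "8...6...3"
    "4..8.3..1"
    "7...2...6"
    ".6....28."
    "...419..5"
    "....8..79"
)

write_grids("puzzles.txt", [PUZ])    # one 81-char grid per line
write_grids("puzzles.csv", [PUZ])    # header row "grid", one grid per data row
write_grids("puzzles.jsonl", [PUZ])  # one {"grid": "..."} object per line

# Any of the three reads back to the same list of strings.
assert read_grids("puzzles.jsonl") == [PUZ]
```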

docs/cli.md

Lines changed: 19 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 Run `sudoku-dlx --help` for a full list.
 
+<!-- core -->
 ## Solve
 ```bash
 sudoku-dlx solve --grid "<81chars>" [--pretty] [--stats] [--trace out.json] [--crosscheck sat]
@@ -44,3 +45,21 @@ sudoku-dlx rate-file --in puzzles.txt --json > scores.ndjson
 # Stats with sampling & histogram CSV
 sudoku-dlx stats-file --in puzzles.txt --limit 5000 --sample 1000 --json stats.json
 ```
+
+<!-- extras -->
+## Convert formats
+```bash
+# auto-detects txt/csv/jsonl by extension
+sudoku-dlx convert --in puzzles.txt --out puzzles.csv
+sudoku-dlx convert --in puzzles.csv --out puzzles.jsonl
+```
+
+## Explain (batch)
+```bash
+sudoku-dlx explain-file --in puzzles.txt --out steps.ndjson --max-steps 200
+```
+
+## Export to DIMACS CNF
+```bash
+sudoku-dlx to-cnf --grid "<81chars>" --out puzzle.cnf
+```

src/sudoku_dlx/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -16,7 +16,8 @@
 from .canonical import canonical_form
 from .generate import generate
 from .rating import rate
-from .crosscheck import sat_solve
+from .crosscheck import sat_solve, cnf_dimacs_lines
+from .formats import read_grids, write_grids, detect_format
 from .solver import (
     SOLVER,
     generate_minimal,
@@ -46,6 +47,10 @@
     "canonical_form",
     "generate",
     "sat_solve",
+    "cnf_dimacs_lines",
+    "read_grids",
+    "write_grids",
+    "detect_format",
     # Legacy exports
     "SOLVER",
     "generate_minimal",

src/sudoku_dlx/cli.py

Lines changed: 60 additions & 1 deletion
@@ -4,11 +4,12 @@
 from typing import Optional
 
 from .api import analyze, build_reveal_trace, from_string, is_valid, solve, to_string
-from .crosscheck import sat_solve
+from .crosscheck import sat_solve, cnf_dimacs_lines
 from .explain import explain
 from .canonical import canonical_form
 from .generate import generate
 from .rating import rate
+from .formats import detect_format, read_grids, write_grids
 from statistics import mean
 
 
@@ -122,6 +123,44 @@ def cmd_check(ns: argparse.Namespace) -> int:
     return 0
 
 
+def cmd_convert(ns: argparse.Namespace) -> int:
+    infmt = ns.in_format or detect_format(ns.in_path)
+    outfmt = ns.out_format or detect_format(ns.out_path)
+    grids = read_grids(ns.in_path, infmt)
+    write_grids(ns.out_path, grids, outfmt)
+    print(f"# converted {len(grids)} grids {infmt} -> {outfmt}", file=sys.stderr)
+    return 0
+
+
+def cmd_to_cnf(ns: argparse.Namespace) -> int:
+    grid = from_string(_read_grid_arg(ns))
+    outp = pathlib.Path(ns.out_path)
+    outp.parent.mkdir(parents=True, exist_ok=True)
+    with outp.open("w", encoding="utf-8") as handle:
+        for line in cnf_dimacs_lines(grid):
+            handle.write(line)
+            if not line.endswith("\n"):
+                handle.write("\n")
+    return 0
+
+
+def cmd_explain_file(ns: argparse.Namespace) -> int:
+    inp = pathlib.Path(ns.in_path)
+    outp = pathlib.Path(ns.out_path)
+    grids = read_grids(str(inp), ns.in_format)
+    outp.parent.mkdir(parents=True, exist_ok=True)
+    written = 0
+    with outp.open("w", encoding="utf-8") as handle:
+        for s in grids:
+            grid = from_string(s)
+            data = explain(grid, max_steps=ns.max_steps)
+            obj = {"grid": s, **data}
+            handle.write(json.dumps(obj, separators=(",", ":"), sort_keys=True) + "\n")
+            written += 1
+    print(f"# wrote {written} explanations to {outp}", file=sys.stderr)
+    return 0
+
+
 def cmd_explain(ns: argparse.Namespace) -> int:
     grid = from_string(_read_grid_arg(ns))
     data = explain(grid, max_steps=ns.max_steps)
@@ -383,6 +422,26 @@ def main(argv: Optional[list[str]] = None) -> int:
     check_parser.add_argument("--json", action="store_true", help="output JSON")
     check_parser.set_defaults(func=cmd_check)
 
+    convert_parser = sub.add_parser("convert", help="convert between txt/csv/jsonl formats")
+    convert_parser.add_argument("--in", dest="in_path", required=True)
+    convert_parser.add_argument("--out", dest="out_path", required=True)
+    convert_parser.add_argument("--in-format", dest="in_format", choices=["txt", "csv", "jsonl"])
+    convert_parser.add_argument("--out-format", dest="out_format", choices=["txt", "csv", "jsonl"])
+    convert_parser.set_defaults(func=cmd_convert)
+
+    tocnf_parser = sub.add_parser("to-cnf", help="export one puzzle to DIMACS CNF")
+    tocnf_parser.add_argument("--grid", help="81-char string; 0/./- for blanks")
+    tocnf_parser.add_argument("--file", help="path to a file with 9 lines of 9 chars")
+    tocnf_parser.add_argument("--out", dest="out_path", required=True)
+    tocnf_parser.set_defaults(func=cmd_to_cnf)
+
+    explainf_parser = sub.add_parser("explain-file", help="explain many puzzles into NDJSON")
+    explainf_parser.add_argument("--in", dest="in_path", required=True)
+    explainf_parser.add_argument("--out", dest="out_path", required=True)
+    explainf_parser.add_argument("--in-format", dest="in_format", choices=["txt", "csv", "jsonl"])
+    explainf_parser.add_argument("--max-steps", type=int, default=200)
+    explainf_parser.set_defaults(func=cmd_explain_file)
+
     explain_parser = sub.add_parser(
         "explain", help="human-style steps (naked/hidden single, locked candidates)"
     )
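The subparsers above can also be driven in-process through `cli.main`, which is how the new tests exercise them. A sketch with the optional flags spelled out (file names are placeholders):

```python
from sudoku_dlx import cli

# Explicit --in-format/--out-format override extension-based detection.
cli.main(["convert", "--in", "puzzles.dat", "--in-format", "csv",
          "--out", "puzzles.out", "--out-format", "jsonl"])

# Cap per-puzzle work when explaining a large batch.
cli.main(["explain-file", "--in", "puzzles.txt",
          "--out", "steps.ndjson", "--max-steps", "50"])
```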

src/sudoku_dlx/crosscheck.py

Lines changed: 14 additions & 2 deletions
@@ -2,7 +2,7 @@
 
 """SAT cross-check utilities using python-sat (optional extra)."""
 
-from typing import List, Optional
+from typing import Iterable, List, Optional
 
 Grid = List[List[int]]
 
@@ -59,6 +59,18 @@ def _encode_cnf(grid: Grid) -> list[list[int]]:
     return cnf
 
 
+def cnf_dimacs_lines(grid: Grid) -> Iterable[str]:
+    """Yield DIMACS CNF lines for ``grid`` using variables in ``[1, 729]``."""
+
+    cnf = _encode_cnf(grid)
+    num_vars = 9 * 9 * 9
+    num_clauses = len(cnf)
+    yield f"p cnf {num_vars} {num_clauses}"
+    for clause in cnf:
+        literals = " ".join(str(int(lit)) for lit in clause)
+        yield f"{literals} 0"
+
+
 def sat_solve(grid: Grid) -> Optional[Grid]:
     """Solve a Sudoku grid via SAT; returns the solved grid or ``None`` if unavailable."""
 
@@ -83,4 +95,4 @@ def sat_solve(grid: Grid) -> Optional[Grid]:
     return solved
 
 
-__all__ = ["sat_solve"]
+__all__ = ["sat_solve", "cnf_dimacs_lines"]

src/sudoku_dlx/formats.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+from typing import Iterable, List, Optional
+import csv
+import json
+import pathlib
+
+
+# All “grid strings” are 81 chars, dots for blanks.
+
+def _strip_grid_line(s: str) -> str:
+    return "".join(ch for ch in s.strip() if not ch.isspace())
+
+
+def _is_81(s: str) -> bool:
+    return len(s) == 81
+
+
+def detect_format(path: str) -> str:
+    p = pathlib.Path(path)
+    ext = p.suffix.lower().lstrip(".")
+    if ext in {"txt", "sdk"}:
+        return "txt"
+    if ext in {"csv"}:
+        return "csv"
+    if ext in {"jsonl", "ndjson"}:
+        return "jsonl"
+    # default: try txt
+    return "txt"
+
+
+def read_grids(path: str, fmt: Optional[str] = None) -> List[str]:
+    fmt = fmt or detect_format(path)
+    p = pathlib.Path(path)
+    if fmt == "txt":
+        out: List[str] = []
+        for line in p.read_text(encoding="utf-8").splitlines():
+            s = _strip_grid_line(line)
+            if not s:
+                continue
+            if not _is_81(s):
+                raise ValueError(f"bad grid length (expected 81): {s!r}")
+            out.append(s)
+        return out
+    if fmt == "csv":
+        out: List[str] = []
+        with p.open("r", encoding="utf-8", newline="") as f:
+            sniffer = csv.Sniffer()
+            text = f.read()
+            f.seek(0)
+            try:
+                dialect = sniffer.sniff(text)
+            except Exception:
+                dialect = csv.excel
+            reader = csv.DictReader(f, dialect=dialect)
+            if reader.fieldnames is None or len(reader.fieldnames) == 0:
+                raise ValueError("CSV missing header row")
+            field = "grid" if "grid" in reader.fieldnames else reader.fieldnames[0]
+            for row in reader:
+                cell = row.get(field, "")
+                s = _strip_grid_line(cell)
+                if _is_81(s):
+                    out.append(s)
+        return out
+    if fmt == "jsonl":
+        out: List[str] = []
+        with p.open("r", encoding="utf-8") as f:
+            for line in f:
+                if not line.strip():
+                    continue
+                obj = json.loads(line)
+                s = _strip_grid_line(obj.get("grid", ""))
+                if _is_81(s):
+                    out.append(s)
+        return out
+    raise ValueError(f"unknown format: {fmt}")
+
+
+def write_grids(path: str, grids: Iterable[str], fmt: Optional[str] = None) -> None:
+    fmt = fmt or detect_format(path)
+    p = pathlib.Path(path)
+    p.parent.mkdir(parents=True, exist_ok=True)
+    if fmt == "txt":
+        p.write_text("\n".join(grids) + "\n", encoding="utf-8")
+        return
+    if fmt == "csv":
+        with p.open("w", encoding="utf-8", newline="") as f:
+            writer = csv.writer(f)
+            writer.writerow(["grid"])
+            for s in grids:
+                writer.writerow([s])
+        return
+    if fmt == "jsonl":
+        with p.open("w", encoding="utf-8") as f:
+            for s in grids:
+                f.write(json.dumps({"grid": s}, separators=(",", ":")) + "\n")
+        return
+    raise ValueError(f"unknown format: {fmt}")
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import json
+
+from sudoku_dlx import cli
+
+PUZ = (
+    "53..7...."
+    "6..195..."
+    ".98....6."
+    "8...6...3"
+    "4..8.3..1"
+    "7...2...6"
+    ".6....28."
+    "...419..5"
+    "....8..79"
+)
+
+
+def test_convert_txt_csv_roundtrip(tmp_path):
+    ptxt = tmp_path / "p.txt"
+    ptxt.write_text(PUZ + "\n" + PUZ + "\n", encoding="utf-8")
+    pcsv = tmp_path / "p.csv"
+    rc = cli.main(["convert", "--in", str(ptxt), "--out", str(pcsv)])
+    assert rc == 0
+    ptxt2 = tmp_path / "q.txt"
+    rc = cli.main(["convert", "--in", str(pcsv), "--out", str(ptxt2)])
+    assert rc == 0
+    assert ptxt2.read_text(encoding="utf-8").strip().splitlines()[0] == PUZ
+
+
+def test_to_cnf_writes_dimacs(tmp_path):
+    out = tmp_path / "p.cnf"
+    rc = cli.main(["to-cnf", "--grid", PUZ, "--out", str(out)])
+    assert rc == 0
+    lines = out.read_text(encoding="utf-8").splitlines()
+    assert lines[0].startswith("p cnf ")
+    assert all(line.endswith(" 0") or line.startswith("p ") for line in lines)
+
+
+def test_explain_file_ndjson(tmp_path):
+    ptxt = tmp_path / "p.txt"
+    ptxt.write_text(PUZ + "\n", encoding="utf-8")
+    out = tmp_path / "steps.ndjson"
+    rc = cli.main(["explain-file", "--in", str(ptxt), "--out", str(out)])
+    assert rc == 0
+    data = [json.loads(x) for x in out.read_text(encoding="utf-8").splitlines() if x.strip()]
+    assert len(data) == 1
+    obj = data[0]
+    assert "grid" in obj and "steps" in obj and "progress" in obj
