Skip to content

Commit 551edc8

Browse files
authored
Merge pull request #133 from teslanika/feat/check-language-validator
feat: add content language validation for markdown artifacts
2 parents 0f3f2ba + 1349380 commit 551edc8

22 files changed

Lines changed: 2472 additions & 3 deletions

File tree

.bootstrap/.core/skills/cypilot/scripts/cypilot/cli.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ def _cmd_doctor(argv: List[str]) -> int:
154154
def _cmd_delegate(argv: List[str]) -> int:
155155
from .commands.delegate import cmd_delegate
156156
return cmd_delegate(argv)
157+
158+
def _cmd_check_language(argv: List[str]) -> int:
    """Lazily import and dispatch to the `check-language` subcommand."""
    from .commands import check_language

    return check_language.cmd_check_language(argv)
157161
# @cpt-end:cpt-cypilot-algo-core-infra-route-command:p1:inst-route-helpers
158162

159163
# =============================================================================
@@ -180,7 +184,7 @@ def main(argv: Optional[List[str]] = None) -> int:
180184
# Context may be None if Cypilot not initialized - that's OK for some commands like init
181185

182186
# Define all available commands
183-
analysis_commands = ["validate", "validate-kits", "validate-toc", "spec-coverage"]
187+
analysis_commands = ["validate", "validate-kits", "validate-toc", "spec-coverage", "check-language"]
184188
legacy_aliases = ["validate-code", "validate-rules"]
185189
kit_commands = ["kit"]
186190
utility_commands = ["toc", "chunk-input"]
@@ -359,6 +363,8 @@ def main(argv: Optional[List[str]] = None) -> int:
359363
return _cmd_delegate(rest)
360364
elif cmd == "doctor":
361365
return _cmd_doctor(rest)
366+
elif cmd == "check-language":
367+
return _cmd_check_language(rest)
362368
else:
363369
# @cpt-begin:cpt-cypilot-algo-core-infra-route-command:p1:inst-if-no-handler
364370
# @cpt-begin:cpt-cypilot-algo-core-infra-route-command:p1:inst-return-unknown
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
"""check-language command — scan Markdown artifacts for disallowed Unicode scripts."""
2+
3+
import argparse
4+
from pathlib import Path
5+
from typing import List
6+
7+
from ..utils import error_codes as EC
8+
from ..utils.ui import ui
9+
10+
11+
def cmd_check_language(argv: List[str]) -> int:
    """Scan Markdown files for characters outside the allowed language set.

    Language policy comes from ``--languages`` when given, otherwise from the
    workspace config (``[validation] allowed_content_languages``), defaulting
    to English. Scan roots default to the project's architecture/ folder.

    Exit codes:
      0 — all files pass
      1 — configuration / path error
      2 — one or more language violations found
    """
    p = argparse.ArgumentParser(
        prog="check-language",
        description=(
            "Scan Markdown artifacts for characters outside the allowed Unicode "
            "script set. Language policy is read from workspace config "
            "([validation] allowed_content_languages) or set via --languages."
        ),
    )
    p.add_argument(
        "paths",
        nargs="*",
        metavar="path",
        help="Files or directories to scan (default: project architecture/ folder)",
    )
    p.add_argument(
        "--languages",
        default=None,
        metavar="CODES",
        help="Comma-separated language codes to allow, e.g. 'en' or 'en,ru'. "
        "Overrides workspace config.",
    )
    p.add_argument(
        "--exclude",
        action="append",
        default=[],
        metavar="GLOB",
        dest="exclude",
        help=(
            "Glob pattern for paths to skip (relative to each scan root). "
            "Repeatable: --exclude 'translations/**' --exclude 'specs/i18n/*.md'. "
            "Merged with check_language_ignore_paths from workspace config."
        ),
    )
    p.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Suppress summary header; show violations only.",
    )
    args = p.parse_args(argv)

    # Deferred so plain `cypilot --help` does not pay the import cost.
    # LangScanError is pulled in here as well — previously it was fetched via
    # a second, later import of the same module.
    from ..utils.content_language import (
        LangScanError,
        SUPPORTED_LANGUAGES,
        build_allowed_ranges,
        scan_paths,
    )

    # ── Resolve allowed languages ────────────────────────────────────────────
    if args.languages is not None:
        raw_langs = [c.strip().lower() for c in args.languages.split(",") if c.strip()]
        unknown = [c for c in raw_langs if c not in SUPPORTED_LANGUAGES]
        if unknown:
            ui.result({
                "status": "ERROR",
                "message": (
                    f"Unknown language code(s): {', '.join(unknown)}. "
                    f"Supported: {', '.join(SUPPORTED_LANGUAGES)}"
                ),
            })
            return 1
        allowed_langs = raw_langs
    else:
        allowed_langs = _read_config_languages()

    # ── Resolve ignore globs ─────────────────────────────────────────────────
    # CLI --exclude patterns are merged with config-level ignore paths.
    ignore_globs: List[str] = list(args.exclude) + _read_config_ignore_paths()

    # ── Resolve scan roots ───────────────────────────────────────────────────
    roots = [Path(pth) for pth in args.paths] if args.paths else _default_roots()

    missing = [str(r) for r in roots if not r.exists()]
    if missing:
        ui.result({
            "status": "ERROR",
            "message": f"Path(s) not found: {', '.join(missing)}",
        })
        return 1

    # ── Scan ─────────────────────────────────────────────────────────────────
    allowed_ranges = build_allowed_ranges(allowed_langs)
    try:
        violations = scan_paths(roots, allowed_ranges, ignore_globs=ignore_globs or None)
    except LangScanError as exc:
        ui.result({
            "status": "ERROR",
            "message": str(exc),
        })
        return 1

    files_scanned = _count_md_files(roots)

    if not violations:
        result = {
            "status": "PASS",
            "allowed_languages": allowed_langs,
            "files_scanned": files_scanned,
            "violation_count": 0,
        }
        if ignore_globs:
            result["ignore_globs"] = ignore_globs
        ui.result(result, human_fn=lambda d: _human_result(d, quiet=args.quiet))
        return 0

    # Group violations by file: file_count reports distinct files, and the
    # item list is emitted file-by-file for stable, readable output.
    by_file: dict = {}
    for v in violations:
        by_file.setdefault(str(v.path), []).append(v)

    violation_items = [
        {
            "path": file_path,
            "line": v.lineno,
            "chars": v.bad_chars_preview(),
            "preview": v.line_preview(),
            "code": EC.CONTENT_LANGUAGE_VIOLATION,
        }
        for file_path, file_violations in by_file.items()
        for v in file_violations
    ]

    result = {
        "status": "FAIL",
        "allowed_languages": allowed_langs,
        "files_scanned": files_scanned,
        "violation_count": len(violations),
        "file_count": len(by_file),
        "violations": violation_items,
    }
    if ignore_globs:
        result["ignore_globs"] = ignore_globs
    ui.result(result, human_fn=lambda d: _human_result(d, quiet=args.quiet))
    return 2
154+
155+
156+
# ---------------------------------------------------------------------------
157+
# Helpers
158+
# ---------------------------------------------------------------------------
159+
160+
def _read_config_languages() -> List[str]:
161+
"""Read allowed_content_languages from workspace config; fall back to ['en']."""
162+
try:
163+
from ..utils.context import get_context
164+
from ..utils.workspace import find_workspace_config
165+
166+
ctx = get_context()
167+
if ctx is None:
168+
return ["en"]
169+
_ws_cfg, _ = find_workspace_config(ctx.project_root)
170+
if _ws_cfg is not None and _ws_cfg.validation is not None: # type: ignore[union-attr]
171+
langs = _ws_cfg.validation.allowed_content_languages # type: ignore[union-attr]
172+
if langs:
173+
return langs
174+
except Exception:
175+
pass
176+
return ["en"]
177+
178+
179+
def _read_config_ignore_paths() -> List[str]:
180+
"""Read check_language_ignore_paths from workspace config; fall back to []."""
181+
try:
182+
from ..utils.context import get_context
183+
from ..utils.workspace import find_workspace_config
184+
185+
ctx = get_context()
186+
if ctx is None:
187+
return []
188+
_ws_cfg, _ = find_workspace_config(ctx.project_root)
189+
if _ws_cfg is not None and _ws_cfg.validation is not None: # type: ignore[union-attr]
190+
paths = _ws_cfg.validation.check_language_ignore_paths # type: ignore[union-attr]
191+
if paths:
192+
return list(paths)
193+
except Exception:
194+
pass
195+
return []
196+
197+
198+
def _default_roots() -> List[Path]:
199+
"""Return the default scan root (architecture/ under project root)."""
200+
try:
201+
from ..utils.context import get_context
202+
203+
ctx = get_context()
204+
if ctx is not None:
205+
return [ctx.project_root / "architecture"]
206+
except (ImportError, AttributeError):
207+
pass
208+
return [Path.cwd() / "architecture"]
209+
210+
211+
def _count_md_files(roots: List[Path]) -> int:
212+
count = 0
213+
for root in roots:
214+
if root.is_file():
215+
if root.suffix.lower() == ".md":
216+
count += 1
217+
elif root.is_dir():
218+
count += sum(1 for _ in root.rglob("*.md"))
219+
return count
220+
221+
222+
# ---------------------------------------------------------------------------
223+
# Human formatter
224+
# ---------------------------------------------------------------------------
225+
226+
def _human_result(data: dict, quiet: bool = False) -> None:
    """Pretty-print a check-language result dict (PASS / ERROR / FAIL shapes).

    ``quiet`` suppresses the summary header; violations are always shown.
    """
    status = data.get("status", "")
    allowed = data.get("allowed_languages", [])

    if not quiet:
        ui.header("check-language")
        ui.detail("Allowed languages", ", ".join(allowed))
        ui.detail("Files scanned", str(data.get("files_scanned", 0)))
        ui.blank()

    if status == "PASS":
        ui.success("No language violations found.")
        ui.blank()
        return

    if status == "ERROR":
        ui.error(str(data.get("message", "Unknown error")))
        ui.blank()
        return

    # FAIL: headline count, then per-file sections.
    n_viol = data.get("violation_count", 0)
    n_files = data.get("file_count", 0)
    ui.warn(f"FAIL {n_viol} violation(s) in {n_files} file(s)")
    ui.blank()

    # Re-group the flat violation list by file so each file prints once.
    grouped: dict = {}
    for item in data.get("violations", []):
        grouped.setdefault(item["path"], []).append(item)

    for file_path, items in grouped.items():
        ui.substep(f" {ui.relpath(file_path)} ({len(items)} line(s))")
        for item in items:
            ui.substep(f" line {item['line']:>4} [{item['chars']}] {item['preview']}")
        ui.blank()

    ui.hint("Fix: rewrite flagged content in the allowed language(s).")
    ui.hint(
        "To allow additional scripts, add to .cypilot-workspace.toml:\n"
        " [validation]\n"
        " allowed_content_languages = [\"en\", \"ru\"]"
    )
    ui.hint(
        "To ignore specific paths (e.g. translation specs), use --exclude or add to config:\n"
        " [validation]\n"
        " check_language_ignore_paths = [\"translations/**\", \"specs/i18n/*.md\"]\n"
        "To ignore a single file, add <!-- cpt-lang: ignore --> anywhere in the file."
    )
    if data.get("ignore_globs"):
        ui.detail("Active ignore globs", ", ".join(data["ignore_globs"]))
    ui.blank()

.bootstrap/.core/skills/cypilot/scripts/cypilot/commands/validate.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,15 @@ def _attach_issue_to_artifact_report(issue: Dict[str, object], *, is_error: bool
390390
rep["warnings"].append(issue)
391391
# @cpt-end:cpt-cypilot-flow-traceability-validation-validate:p1:inst-validate-helpers
392392

393+
# Content language check — runs after per-artifact structure validation.
394+
# Skipped if structure has already failed (all_errors non-empty) so language
395+
# issues never obscure structural errors.
396+
if not all_errors:
397+
_lang_errs = _run_content_language_check(artifacts_to_validate, ws_ctx, project_root)
398+
for _le in _lang_errs:
399+
all_errors.append(_le)
400+
_attach_issue_to_artifact_report(_le, is_error=True)
401+
393402
# @cpt-begin:cpt-cypilot-flow-traceability-validation-validate:p1:inst-if-structure-fail
394403
# Stop early: cross-artifact reference checks and code traceability checks are run only
395404
# after per-artifact structure/content checks pass.
@@ -897,6 +906,74 @@ def _suggest_path_from_autodetect(node: object, target_kind: str) -> Optional[st
897906
return None
898907
# @cpt-end:cpt-cypilot-flow-traceability-validation-validate:p1:inst-validate-helpers
899908

909+
# ---------------------------------------------------------------------------
910+
# Content language check helper
911+
# ---------------------------------------------------------------------------
912+
913+
def _run_content_language_check(
    artifacts_to_validate: list,
    ws_ctx: object,
    project_root: "Path",
) -> list:
    """Return language-violation error dicts for all validated .md artifacts.

    The project root is taken from ``ws_ctx.project_root`` when present
    (workspace mode), otherwise from ``project_root`` (single-repo mode).
    An unset ``allowed_content_languages`` disables the check entirely and
    yields an empty list; a workspace config that exists but cannot be loaded
    yields a single error entry instead.
    """
    from ..utils.workspace import find_workspace_config as _find_ws
    from ..utils.content_language import (
        LangScanError as _LangScanError,
        build_allowed_ranges,
        scan_file as _scan_file,
    )
    from ..utils.constraints import error as _error
    from ..utils import error_codes as _EC

    # Prefer the workspace context's root; fall back to the caller's root.
    root = getattr(ws_ctx, "project_root", None) or project_root

    cfg, load_err = _find_ws(root)
    if load_err is not None:
        return [_error(
            "workspace",
            f"Failed to load workspace config for language check: {load_err}",
            path=root,
            code=_EC.FILE_LOAD_ERROR,
        )]
    if cfg is None or cfg.validation is None:
        return []
    languages = cfg.validation.allowed_content_languages
    if not languages:
        return []

    ranges = build_allowed_ranges(languages)
    errors: list = []
    for artifact_path, _tpl, _atype, _trace, _kit in artifacts_to_validate:
        # Only Markdown artifacts are subject to the language policy.
        if artifact_path.suffix.lower() != ".md":
            continue
        try:
            found = _scan_file(artifact_path, ranges)
        except _LangScanError as exc:
            errors.append(_error(
                "language",
                f"Cannot read file for language scan: {exc}",
                path=artifact_path,
                code=_EC.FILE_READ_ERROR,
            ))
            continue
        for violation in found:
            errors.append(_error(
                "language",
                f"Non-allowed characters [{violation.bad_chars_preview()}] — {violation.line_preview()}",
                path=artifact_path,
                line=violation.lineno,
                code=_EC.CONTENT_LANGUAGE_VIOLATION,
                allowed_languages=languages,
            ))
    return errors
975+
976+
900977
# ---------------------------------------------------------------------------
901978
# Human-friendly formatter
902979
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)