kreuzberg/.typos.toml at main · geyerandreas/kreuzberg · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Configuration for the typos spell checker.
# https://github.com/crate-ci/typos

[default]
# Additional regular expressions for spans of text the checker should skip.
# Patterns written as TOML basic ("...") strings double their backslashes,
# because TOML consumes one level of escaping before the regex is compiled.
extend-ignore-re = [
    # Hex strings of 32 or more digits (e.g. commit hashes, checksums)
    "[a-fA-F0-9]{32,}",
    # Base64 encoded strings: 40+ alphabet characters, then up to two `=` of padding
    "[A-Za-z0-9+/]{40,}={0,2}",
    # URLs (http or https), up to the next whitespace — avoid flagging path segments
    "https?://[^\\s]+",
    # ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.)
    "fo:[a-z-]+",
    "xsl-fo-compatible",
    # Unicode escapes wrapped in single quotes (e.g. '\u{0065}'); the pattern
    # requires the surrounding quotes, so an unquoted \u{0065} is not covered.
    "'\\\\u\\{[0-9a-fA-F]+\\}'",
    # Foreign language text in test strings and OCR backend language lists:
    # any double-quoted string that contains one of the listed words or phrases.
    # Written as a TOML literal ('...') string so the inner double quotes need
    # no escaping.
    '"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
]

[default.extend-words]
# Words accepted as correct. Every entry maps a word to itself; entries are
# grouped by where the word comes from and sorted alphabetically per group.

# Terms specific to this project and its toolchain (not misspellings)
docling = "docling"
flate = "flate"
kreuzberg = "kreuzberg"
markitdown = "markitdown"
mkdocs = "mkdocs"
mkdocstrings = "mkdocstrings"
onnx = "onnx"
openwebui = "openwebui"
pdfium = "pdfium"
pymupdf = "pymupdf"
rumdl = "rumdl"
surrealdb = "surrealdb"
tesseract = "tesseract"
tha = "tha"
wasm = "wasm"
webui = "webui"
zensical = "zensical"

# Domain-specific terms
hocr = "hocr"
odf = "odf"
opf = "opf"

# LaTeX environments
hom = "hom"
multline = "multline"

# RTF control words
headerr = "headerr"
pard = "pard"

# DOC format field names
edn = "edn"

# DOCX measurement units (twips, 50ths of percent, 240ths of line)
ths = "ths"

# OCR language codes (ISO 639)
bre = "bre"
inh = "inh"
yor = "yor"

# Tesseract upstream spellings (API and code)
extention = "extention"
splitted = "splitted"

# PDFium upstream constant/function naming
chlidren = "chlidren"
fith = "fith"
formated = "formated"
portait = "portait"
specifing = "specifing"
threed = "threed"

# PDFium upstream doc typos
execpt = "execpt"
faiure = "faiure"
similarily = "similarily"

# PaddleOCR upstream naming
charater = "charater"
cliper = "cliper"
substract = "substract"

# English variants / valid words flagged incorrectly
unparseable = "unparseable"

# English suffix patterns in semantic analysis
ment = "ment"

# Test data / examples
caf = "caf"
ges = "ges"
helo = "helo"
ove = "ove"

# Common short variable names / identifiers in code
ba = "ba"
fo = "fo"
iy = "iy"
nd = "nd"
pn = "pn"
siz = "siz"
thr = "thr"

# Short words flagged in code/data contexts
ein = "ein"
ist = "ist"
mis = "mis"
tre = "tre"

[default.extend-identifiers]
# Whole identifiers the checker should accept unchanged (each maps to itself).
MarkItDown = "MarkItDown"
PDFium = "PDFium"
PyMuPDF = "PyMuPDF"
SurrealDB = "SurrealDB"
WebUI = "WebUI"
traineddata = "traineddata"

[files]
# Additional glob patterns for paths the checker should not scan.
extend-exclude = [
    # Test fixtures and vendor files
    "test_documents/",
    "fixtures/",
    # Lock files (machine-generated dependency pins)
    "*.lock",
    "pnpm-lock.yaml",
    # Build artifacts
    "target/",
    "node_modules/",
    "dist/",
    "site/",
    # Code snippets (validated separately by snippet-runner)
    "docs/snippets/",
    # Generated files (e2e tests, bindgen)
    "e2e/",
    "**/bindgen/",
    # Stopwords files (foreign language words)
    "**/stopwords/",
    # Test data (hOCR samples, etc.)
    "**/test_data/",
    # Patch files (upstream diffs)
    "**/*.diff",
    "**/*.patch",
    # PDF text repair tests (intentionally broken text)
    "**/text_repair.rs",
    # Changelog (contains intentional examples of garbled text)
    "CHANGELOG.md",
    "docs/CHANGELOG.md",
    # Vendored / third-party code
    "**/vendor/",
    "**/vendored/",
    # Binary archives and image assets (note: *.svg is XML text, but it is
    # excluded here together with the other image formats)
    "*.whl",
    "*.tar.gz",
    "*.png",
    "*.ico",
    "*.svg",
]