Skip to content

Commit b5352b0

Browse files
Prevent snake_case identifiers from being detected as LaTeX math (#291)
Tighten the bare subscript/superscript patterns in UNDELIMITED_LATEX_RE and LATEX_EXPR_RE with lookaround assertions so identifiers like `variable_name` are no longer matched, while expressions like `X_i` and `X^2` still convert correctly.
1 parent f52543e commit b5352b0

File tree

2 files changed

+24
-3
lines changed

2 files changed

+24
-3
lines changed

packages/pipeline/src/__tests__/package-web.test.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,6 +1121,22 @@ describe("convertLatexToMathml (HTML)", () => {
11211121
const result = convertLatexToMathml(html)
11221122
expect(result).toContain("<math")
11231123
})
1124+
1125+
it("does not convert snake_case identifiers in prose as math", () => {
1126+
const html = '<p>Set the variable_name to the desired value and review snake_case style.</p>'
1127+
const result = convertLatexToMathml(html)
1128+
expect(result).toBe(html)
1129+
expect(result).not.toContain("<math")
1130+
})
1131+
1132+
it("does not mangle snake_case words embedded in math-containing prose", () => {
1133+
const html = '<p>Please configure the variable_name setting before running any experiments where X_i represents each independent measurement taken during the trial.</p>'
1134+
const result = convertLatexToMathml(html)
1135+
// snake_case word must be preserved verbatim
1136+
expect(result).toContain("variable_name")
1137+
// X_i should be converted to math
1138+
expect(result).toContain("<math")
1139+
})
11241140
})
11251141

11261142
describe("packageWebpub", () => {

packages/pipeline/src/package-web.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,9 +1331,11 @@ const MATH_INDICATORS = [
13311331
* Matches: \text{}, \hat{}, \frac{}{}, \sqrt{}, \vec{}, \bar{}, \overline{},
13321332
* \circ, ^\circ, _{...}, ^{...}, \times, \div, \pm, \leq, \geq, \neq,
13331333
* \mathcal{}, \mathbb{}, \in, \leftarrow, etc.
1334-
* Also matches bare subscript/superscript like X_i or X^2.
1334+
* Also matches bare subscript/superscript like X_i or X^2, but NOT snake_case
1335+
* identifiers like `variable_name` — the base letter must not be preceded by
1336+
* another letter, and the subscript char must not be followed by another letter.
13351337
*/
1336-
const UNDELIMITED_LATEX_RE = /\\(?:text|mbox|hat|frac|sqrt|vec|bar|overline|underline|mathbf|mathrm|mathit|mathcal|mathbb|mathfrak|mathscr|circ|times|div|pm|mp|leq|geq|neq|approx|equiv|sim|in|notin|subset|supset|cup|cap|leftarrow|rightarrow|Leftarrow|Rightarrow|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|pi|sigma|omega|phi|psi|infty|partial|nabla|sum|prod|int|lim|log|ln|sin|cos|tan|sec|csc|cot|left|right|cdot|ldots|cdots|quad|qquad|binom|tag)\b|[_^]\{|[A-Za-z][_^][A-Za-z0-9]/
1338+
// Detection gate for undelimited LaTeX. Three alternatives:
//   1. a known backslash command (\frac, \alpha, ...) ending at a word boundary,
//   2. a braced subscript/superscript opener (`_{` or `^{`),
//   3. a bare single-letter base with a single-character sub/superscript (X_i, X^2).
// Alternative 3 is guarded so snake_case identifiers are not mistaken for math:
//   - the base letter must not follow a letter OR an underscore, so `x_1` inside
//     `my_x_1` is rejected (a preceding digit is still allowed, e.g. `2X_i`);
//   - the sub/superscript char must not be followed by a letter, nor by `_` plus
//     two or more alphanumerics, so `x_i` inside `x_i_value` is rejected while
//     `X^2_i` is still accepted.
const UNDELIMITED_LATEX_RE = /\\(?:text|mbox|hat|frac|sqrt|vec|bar|overline|underline|mathbf|mathrm|mathit|mathcal|mathbb|mathfrak|mathscr|circ|times|div|pm|mp|leq|geq|neq|approx|equiv|sim|in|notin|subset|supset|cup|cap|leftarrow|rightarrow|Leftarrow|Rightarrow|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|pi|sigma|omega|phi|psi|infty|partial|nabla|sum|prod|int|lim|log|ln|sin|cos|tan|sec|csc|cot|left|right|cdot|ldots|cdots|quad|qquad|binom|tag)\b|[_^]\{|(?<![A-Za-z_])[A-Za-z][_^][A-Za-z0-9](?![A-Za-z]|_[A-Za-z0-9]{2})/
13371339

13381340
function containsMathContent(html: string): boolean {
13391341
if (MATH_INDICATORS.some((indicator) => html.includes(indicator))) return true
@@ -1377,7 +1379,10 @@ function convertInlineLatexFragments(text: string): string {
13771379
// Match LaTeX expressions: optional leading alphanumeric, one or more "atoms",
13781380
// with optional alphanumeric glue between atoms.
13791381
// Atoms: \command{...}, _\command{...}, _{...}, ^{...}, _x, ^x, \{ or \}
1380-
const LATEX_EXPR_RE = /[A-Za-z0-9]*(?:(?:\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\{(?:[^{}]|\{[^{}]*\})*\}|[_^][A-Za-z0-9]|\\[{}])[A-Za-z0-9]*)+/g
1382+
// The expression must start at a word boundary (not inside an identifier),
1383+
// and bare `_x`/`^x` subscripts cannot be followed by more letters — this
1384+
// prevents snake_case identifiers like `variable_name` from being matched.
1385+
// Global matcher for one inline LaTeX expression: optional alphanumeric prefix,
// then one or more atoms, each optionally followed by alphanumeric glue.
// Guards against snake_case identifiers:
//   - the match may not start right after a letter, digit, OR underscore, so no
//     match can begin in the middle of an identifier (`var_x` inside `my_var_x`);
//   - a bare `_x`/`^x` atom may not be followed by a letter, nor by `_` plus two
//     or more alphanumerics (`x_i` inside `x_i_value`); a following single-char
//     sub/superscript such as `x^2_i` is still accepted.
const LATEX_EXPR_RE = /(?<![A-Za-z0-9_])[A-Za-z0-9]*(?:(?:\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\{(?:[^{}]|\{[^{}]*\})*\}|[_^][A-Za-z0-9](?![A-Za-z]|_[A-Za-z0-9]{2})|\\[{}])[A-Za-z0-9]*)+/g
13811386

13821387
text = text.replace(LATEX_EXPR_RE, (expr) => {
13831388
const trimmed = expr.trim()

0 commit comments

Comments
 (0)