Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions packages/pipeline/src/__tests__/package-web.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,22 @@ describe("convertLatexToMathml (HTML)", () => {
const result = convertLatexToMathml(html)
expect(result).toContain("<math")
})

it("does not convert snake_case identifiers in prose as math", () => {
  // Prose whose only underscores sit inside identifiers, never in subscripts.
  const prose = '<p>Set the variable_name to the desired value and review snake_case style.</p>'
  const converted = convertLatexToMathml(prose)

  // The markup must come back unchanged, with no MathML injected.
  expect(converted).toBe(prose)
  expect(converted).not.toContain("<math")
})

it("does not mangle snake_case words embedded in math-containing prose", () => {
  // A single sentence mixing an identifier (variable_name) with a bare subscript (X_i).
  const mixedProse = '<p>Please configure the variable_name setting before running any experiments where X_i represents each independent measurement taken during the trial.</p>'
  const converted = convertLatexToMathml(mixedProse)

  // The identifier has to survive character-for-character...
  expect(converted).toContain("variable_name")
  // ...while the X_i subscript still produces MathML output.
  expect(converted).toContain("<math")
})
})

describe("packageWebpub", () => {
Expand Down
11 changes: 8 additions & 3 deletions packages/pipeline/src/package-web.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1331,9 +1331,11 @@ const MATH_INDICATORS = [
* Matches: \text{}, \hat{}, \frac{}{}, \sqrt{}, \vec{}, \bar{}, \overline{},
* \circ, ^\circ, _{...}, ^{...}, \times, \div, \pm, \leq, \geq, \neq,
* \mathcal{}, \mathbb{}, \in, \leftarrow, etc.
* Also matches bare subscript/superscript like X_i or X^2.
* Also matches bare subscript/superscript like X_i or X^2, but NOT snake_case
* identifiers like `variable_name` — the base letter must not be preceded by
* another letter, and the subscript char must not be followed by another letter.
*/
const UNDELIMITED_LATEX_RE = /\\(?:text|mbox|hat|frac|sqrt|vec|bar|overline|underline|mathbf|mathrm|mathit|mathcal|mathbb|mathfrak|mathscr|circ|times|div|pm|mp|leq|geq|neq|approx|equiv|sim|in|notin|subset|supset|cup|cap|leftarrow|rightarrow|Leftarrow|Rightarrow|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|pi|sigma|omega|phi|psi|infty|partial|nabla|sum|prod|int|lim|log|ln|sin|cos|tan|sec|csc|cot|left|right|cdot|ldots|cdots|quad|qquad|binom|tag)\b|[_^]\{|[A-Za-z][_^][A-Za-z0-9]/
// Bare subscript/superscript branch: the base letter must not be preceded by a
// letter OR an underscore. Rejecting `_` as well keeps multi-segment identifiers
// such as `point_x_y` or `max_x_2` from being detected as math via their
// trailing `x_y` / `x_2` segment, while `X_i`, `2x_i` and `x^2_i` still match.
const UNDELIMITED_LATEX_RE = /\\(?:text|mbox|hat|frac|sqrt|vec|bar|overline|underline|mathbf|mathrm|mathit|mathcal|mathbb|mathfrak|mathscr|circ|times|div|pm|mp|leq|geq|neq|approx|equiv|sim|in|notin|subset|supset|cup|cap|leftarrow|rightarrow|Leftarrow|Rightarrow|alpha|beta|gamma|delta|epsilon|theta|lambda|mu|pi|sigma|omega|phi|psi|infty|partial|nabla|sum|prod|int|lim|log|ln|sin|cos|tan|sec|csc|cot|left|right|cdot|ldots|cdots|quad|qquad|binom|tag)\b|[_^]\{|(?<![A-Za-z_])[A-Za-z][_^][A-Za-z0-9](?![A-Za-z])/

function containsMathContent(html: string): boolean {
if (MATH_INDICATORS.some((indicator) => html.includes(indicator))) return true
Expand Down Expand Up @@ -1377,7 +1379,10 @@ function convertInlineLatexFragments(text: string): string {
// Match LaTeX expressions: optional leading alphanumeric, one or more "atoms",
// with optional alphanumeric glue between atoms.
// Atoms: \command{...}, _\command{...}, _{...}, ^{...}, _x, ^x, \{ or \}
const LATEX_EXPR_RE = /[A-Za-z0-9]*(?:(?:\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\{(?:[^{}]|\{[^{}]*\})*\}|[_^][A-Za-z0-9]|\\[{}])[A-Za-z0-9]*)+/g
// The expression must start at a word boundary (not inside an identifier),
// and bare `_x`/`^x` subscripts cannot be followed by more letters — this
// prevents snake_case identifiers like `variable_name` from being matched.
const LATEX_EXPR_RE = /(?<![A-Za-z0-9])[A-Za-z0-9]*(?:(?:\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\\[a-zA-Z]+(?:\{(?:[^{}]|\{[^{}]*\})*\})*|[_^]\{(?:[^{}]|\{[^{}]*\})*\}|[_^][A-Za-z0-9](?![A-Za-z])|\\[{}])[A-Za-z0-9]*)+/g

text = text.replace(LATEX_EXPR_RE, (expr) => {
const trimmed = expr.trim()
Expand Down
Loading