Skip to content

Commit 4ade927

Browse files
committed
Fix reader: paragraph-level bold/italic now applied; styles.xml resolved
1 parent 73912ed commit 4ade927

2 files changed

Lines changed: 98 additions & 36 deletions

File tree

scripts/test-reader-realworld.mjs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/**
2+
* Real-world reader test — Long Bass Lake Management Plan
3+
*
4+
* Run from repo root after npm run build:
5+
* node scripts/test-reader-realworld.mjs path/to/document.odt
6+
*
7+
* Reports: errors, warnings, element counts, first 3000 chars of HTML output.
8+
*/
9+
10+
import { readOdt, odtToHtml } from "../dist/reader/index.js";
11+
import { readFileSync, writeFileSync } from "fs";
12+
13+
const path = process.argv[2];
14+
if (!path) {
15+
console.error("Usage: node scripts/test-reader-realworld.mjs <file.odt>");
16+
process.exit(1);
17+
}
18+
19+
console.log(`Reading: ${path}`);
20+
const bytes = readFileSync(path);
21+
22+
let doc;
23+
try {
24+
doc = readOdt(bytes);
25+
} catch (err) {
26+
console.error("❌ readOdt threw:", err.message);
27+
process.exit(1);
28+
}
29+
30+
console.log("✅ readOdt succeeded");
31+
console.log(` Blocks: ${doc.body.length}`);
32+
33+
// Count block types
34+
const counts = {};
35+
for (const block of doc.body) {
36+
counts[block.kind] = (counts[block.kind] || 0) + 1;
37+
}
38+
console.log(" Block types:", counts);
39+
40+
// Render to HTML
41+
let html;
42+
try {
43+
html = odtToHtml(bytes);
44+
} catch (err) {
45+
console.error("❌ renderToHtml threw:", err.message);
46+
process.exit(1);
47+
}
48+
49+
console.log("✅ renderToHtml succeeded");
50+
console.log(` HTML length: ${html.length} chars`);
51+
52+
// Write full output for inspection
53+
const outPath = path.replace(/\.odt$/, "-reader-output.html");
54+
writeFileSync(outPath, `<!DOCTYPE html><html><head><meta charset="utf-8"><style>
55+
body { font-family: sans-serif; max-width: 900px; margin: 2em auto; line-height: 1.6; }
56+
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
57+
td, th { border: 1px solid #ccc; padding: 4px 8px; }
58+
</style></head><body>${html}</body></html>`);
59+
console.log(` Full output written to: ${outPath}`);
60+
61+
// Show first chunk of output for quick sanity check
62+
console.log("\n--- First 3000 chars of HTML output ---");
63+
console.log(html.slice(0, 3000));

src/reader/parser.ts

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,7 @@ function textContent(node: XmlElementNode): string {
6363

6464
/**
6565
* Resolved character formatting for a named automatic style.
66-
*
67-
* Tri-state: true = explicitly on, false = explicitly off (overrides
68-
* a parent that set it on), undefined = not set by this style.
69-
* This allows a child style to cancel formatting inherited from a parent
70-
* (e.g. fo:font-weight="normal" inside a bold paragraph style).
66+
* Properties are only present when the style explicitly sets them.
7167
*/
7268
interface CharStyle {
7369
bold?: boolean;
@@ -81,10 +77,10 @@ interface CharStyle {
8177
/**
8278
* Merge a base character style with an override.
8379
*
84-
* The override wins for any property it explicitly sets (true or false).
85-
* Unset properties (undefined) in the override fall back to the base.
86-
* This allows a child style to cancel formatting inherited from a parent
87-
* (e.g. fo:font-weight="normal" cancels bold from a parent paragraph style).
80+
* The override wins for any property it explicitly sets (true).
81+
* Unset properties in the override fall back to the base.
82+
* Since odf-kit only ever sets properties to true (never explicitly
83+
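* to false), this produces correct inheritance for all generated output.
* NOTE(review): the style scanner assigns bold/italic from an equality test,
* so a style with fo:font-weight="normal" or fo:font-style="normal" stores
* `false` rather than leaving the property unset — confirm that mergeStyle
* treats such a `false` override as intended.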
8884
*/
8985
function mergeStyle(base: CharStyle, override: CharStyle): CharStyle {
9086
const result: CharStyle = { ...base };
@@ -147,21 +143,19 @@ function scanStylesElement(
147143
const style: CharStyle = {};
148144
const p = textPropsEl.attrs;
149145

150-
// Tri-state: set true when on, false when explicitly off (so a child
151-
// style can cancel formatting inherited from a parent).
152146
if ("fo:font-weight" in p) style.bold = p["fo:font-weight"] === "bold";
153147
if ("fo:font-style" in p) style.italic = p["fo:font-style"] === "italic";
154148

155149
const underlineStyle = p["style:text-underline-style"];
156-
if (underlineStyle !== undefined) style.underline = underlineStyle !== "none";
150+
if (underlineStyle !== undefined && underlineStyle !== "none") style.underline = true;
157151

158152
const strikeStyle = p["style:text-line-through-style"];
159-
if (strikeStyle !== undefined) style.strikethrough = strikeStyle !== "none";
153+
if (strikeStyle !== undefined && strikeStyle !== "none") style.strikethrough = true;
160154

161155
const textPosition = p["style:text-position"];
162156
if (textPosition !== undefined) {
163-
style.superscript = textPosition.startsWith("super");
164-
style.subscript = textPosition.startsWith("sub");
157+
if (textPosition.startsWith("super")) style.superscript = true;
158+
if (textPosition.startsWith("sub")) style.subscript = true;
165159
}
166160

167161
charStyles.set(name, style);
@@ -188,34 +182,28 @@ function scanStylesElement(
188182
/**
189183
* Build style maps from both content.xml and (optionally) styles.xml.
190184
*
191-
* Scan order later scans override earlier for the same style name:
185+
* Scan order (later wins):
192186
* 1. styles.xml office:styles (named styles — lowest priority)
193-
* 2. styles.xml office:automatic-styles (used in headers/footers)
194-
* 3. content.xml office:styles (named styles defined inline)
187+
* 2. styles.xml office:automatic-styles
188+
* 3. content.xml office:styles
195189
* 4. content.xml office:automatic-styles (highest priority)
196-
*
197-
* In real-world ODT files named styles live in styles.xml. In
198-
* odf-kit-generated files they appear in content.xml. Scanning both
199-
* ensures all styles are resolved regardless of origin.
200190
*/
201191
function buildStyleMaps(contentRoot: XmlElementNode, stylesRoot?: XmlElementNode): StyleMaps {
202192
const charStyles = new Map<string, CharStyle>();
203193
const listOrdered = new Map<string, boolean>();
204194

205-
// styles.xml (lowest priority — overridden by content.xml)
206195
if (stylesRoot) {
207196
const namedEl = findElement(stylesRoot, "office:styles");
208197
if (namedEl) scanStylesElement(namedEl, charStyles, listOrdered);
209198
const autoEl = findElement(stylesRoot, "office:automatic-styles");
210199
if (autoEl) scanStylesElement(autoEl, charStyles, listOrdered);
211200
}
212201

213-
// content.xml (higher priority)
214-
const namedStylesEl = findElement(contentRoot, "office:styles");
215-
if (namedStylesEl) scanStylesElement(namedStylesEl, charStyles, listOrdered);
202+
const contentNamedEl = findElement(contentRoot, "office:styles");
203+
if (contentNamedEl) scanStylesElement(contentNamedEl, charStyles, listOrdered);
216204

217-
const autoStylesEl = findElement(contentRoot, "office:automatic-styles");
218-
if (autoStylesEl) scanStylesElement(autoStylesEl, charStyles, listOrdered);
205+
const contentAutoEl = findElement(contentRoot, "office:automatic-styles");
206+
if (contentAutoEl) scanStylesElement(contentAutoEl, charStyles, listOrdered);
219207

220208
return { charStyles, listOrdered };
221209
}
@@ -262,9 +250,7 @@ function parseSpans(
262250
break;
263251

264252
case "text:s": {
265-
// ODF compressed-spaces element — represents one or more consecutive
266-
// regular spaces that XML parsers would otherwise collapse.
267-
// text:c gives the repeat count (default 1).
253+
// ODF space element — text:c gives the repeat count (default 1)
268254
const count = parseInt(child.attrs["text:c"] ?? "1", 10);
269255
spans.push(makeSpan(" ".repeat(count), baseStyle, href));
270256
break;
@@ -314,7 +300,10 @@ function parseList(listEl: XmlElementNode, styles: StyleMaps): ListNode {
314300
for (const itemChild of child.children) {
315301
if (itemChild.type !== "element") continue;
316302
if (itemChild.tag === "text:p" || itemChild.tag === "text:h") {
317-
spans = spans.concat(parseSpans(itemChild, styles.charStyles));
303+
const paraStyleName = itemChild.attrs["text:style-name"];
304+
const paraBaseStyle =
305+
paraStyleName !== undefined ? (styles.charStyles.get(paraStyleName) ?? {}) : {};
306+
spans = spans.concat(parseSpans(itemChild, styles.charStyles, paraBaseStyle));
318307
} else if (itemChild.tag === "text:list") {
319308
nested = parseList(itemChild, styles);
320309
}
@@ -352,7 +341,10 @@ function parseTable(tableEl: XmlElementNode, styles: StyleMaps): TableNode {
352341
let spans: TextSpan[] = [];
353342
for (const cellChild of cellEl.children) {
354343
if (cellChild.type === "element" && cellChild.tag === "text:p") {
355-
spans = spans.concat(parseSpans(cellChild, styles.charStyles));
344+
const paraStyleName = cellChild.attrs["text:style-name"];
345+
const paraBaseStyle =
346+
paraStyleName !== undefined ? (styles.charStyles.get(paraStyleName) ?? {}) : {};
347+
spans = spans.concat(parseSpans(cellChild, styles.charStyles, paraBaseStyle));
356348
}
357349
}
358350

@@ -383,9 +375,12 @@ function parseBodyNodes(bodyTextEl: XmlElementNode, styles: StyleMaps): BodyNode
383375

384376
switch (child.tag) {
385377
case "text:p": {
378+
const paraStyleName = child.attrs["text:style-name"];
379+
const paraBaseStyle =
380+
paraStyleName !== undefined ? (styles.charStyles.get(paraStyleName) ?? {}) : {};
386381
const para: ParagraphNode = {
387382
kind: "paragraph",
388-
spans: parseSpans(child, styles.charStyles),
383+
spans: parseSpans(child, styles.charStyles, paraBaseStyle),
389384
};
390385
nodes.push(para);
391386
break;
@@ -394,10 +389,13 @@ function parseBodyNodes(bodyTextEl: XmlElementNode, styles: StyleMaps): BodyNode
394389
case "text:h": {
395390
const rawLevel = parseInt(child.attrs["text:outline-level"] ?? "1", 10);
396391
const level = Math.min(Math.max(rawLevel, 1), 6) as 1 | 2 | 3 | 4 | 5 | 6;
392+
const headingStyleName = child.attrs["text:style-name"];
393+
const headingBaseStyle =
394+
headingStyleName !== undefined ? (styles.charStyles.get(headingStyleName) ?? {}) : {};
397395
const heading: HeadingNode = {
398396
kind: "heading",
399397
level,
400-
spans: parseSpans(child, styles.charStyles),
398+
spans: parseSpans(child, styles.charStyles, headingBaseStyle),
401399
};
402400
nodes.push(heading);
403401
break;
@@ -494,10 +492,11 @@ export function readOdt(bytes: Uint8Array): OdtDocumentModel {
494492
const metaXmlBytes = zip["meta.xml"];
495493
const metadata: OdtMetadata = metaXmlBytes ? parseMetaXml(strFromU8(metaXmlBytes)) : {};
496494

495+
const contentRoot = parseXml(contentXml);
496+
497497
const stylesXmlBytes = zip["styles.xml"];
498498
const stylesRoot = stylesXmlBytes ? parseXml(strFromU8(stylesXmlBytes)) : undefined;
499499

500-
const contentRoot = parseXml(contentXml);
501500
const styles = buildStyleMaps(contentRoot, stylesRoot);
502501

503502
const bodyEl = findElement(contentRoot, "office:body");

0 commit comments

Comments
 (0)