Skip to content

Commit 83c1fc6

Browse files
fix: claude pr review tests and code polish
1 parent 2505420 commit 83c1fc6

8 files changed

Lines changed: 142 additions & 22 deletions

File tree

packages/core/src/ast/uslm-builder.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,17 @@ export class ASTBuilder {
149149
return typeof at === "string" ? levelType === at : at.has(levelType);
150150
}
151151

152+
/**
153+
* True iff some level frame currently on the stack is itself an emit
154+
* target. Used by `closeLevel` to decide whether the closing node must be
155+
* attached to its parent so a higher emission sees the full subtree.
156+
*
157+
* This is the correct check for USLM's permissive level nesting (e.g. an
158+
* `<appendix>` inside a `<part>`) — reasoning via `LEVEL_TYPES` index
159+
* ordering would drop the appendix because its index is shallower than
160+
* the containing part's. Live stack membership handles anomalous nesting
161+
* correctly.
162+
*/
152163
private hasEmittingAncestorOnStack(): boolean {
153164
for (const f of this.stack) {
154165
if (f.kind === "level" && f.node?.type === "level") {

packages/ecfr/src/converter.test.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,4 +159,37 @@ describe("convertEcfrTitle multi-granularity parity", () => {
159159
const stats = await stat(singleDirs.section);
160160
expect(stats.isDirectory()).toBe(true);
161161
});
162+
163+
it("dry-run in multi-granularity mode reports counts without writing files", async () => {
164+
const results = await convertEcfrTitle({
165+
...BASE,
166+
input: resolve(FIXTURES_DIR, "title-structure.xml"),
167+
dryRun: true,
168+
granularities: [
169+
{ granularity: "section", output: multiDirs.section },
170+
{ granularity: "part", output: multiDirs.part },
171+
{ granularity: "title", output: multiDirs.title },
172+
],
173+
});
174+
175+
expect(results).toHaveLength(3);
176+
for (const r of results) {
177+
expect(r.dryRun).toBe(true);
178+
expect(r.files).toEqual([]);
179+
}
180+
181+
const byGranularity = Object.fromEntries(results.map((r) => [r.granularity, r]));
182+
// title-structure fixture has 3 sections, 2 parts, 1 title
183+
expect(byGranularity.section!.sectionsWritten).toBe(3);
184+
expect(byGranularity.part!.sectionsWritten).toBe(2);
185+
expect(byGranularity.title!.sectionsWritten).toBe(1);
186+
187+
// Verify no files were actually written to the multi output dirs.
188+
const sectionFiles = await listFiles(multiDirs.section);
189+
const partFiles = await listFiles(multiDirs.part);
190+
const titleFiles = await listFiles(multiDirs.title);
191+
expect(sectionFiles).toEqual([]);
192+
expect(partFiles).toEqual([]);
193+
expect(titleFiles).toEqual([]);
194+
});
162195
});

packages/ecfr/src/converter.ts

Lines changed: 53 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,24 @@ export interface BaseEcfrConvertOptions {
6969

7070
/** Single-granularity mode: one output directory, one granularity. */
7171
export interface SingleEcfrConvertOptions extends BaseEcfrConvertOptions {
72-
/** Output root directory */
72+
/** Output root directory. Required in single-granularity mode. */
7373
output: string;
74-
/** Output granularity. Defaults to "section" when omitted. */
74+
/** Output granularity. Defaults to `"section"` when omitted. */
7575
granularity?: EcfrGranularity | undefined;
7676
/** @internal — must not be set in single-granularity mode */
7777
granularities?: undefined;
7878
}
7979

80-
/** Multi-granularity mode: a set of `{granularity, output}` pairs emitted from one parse. */
80+
/**
81+
* Multi-granularity mode: a set of `{granularity, output}` pairs emitted from
82+
* one parse.
83+
*
84+
* The builder emits at the set of unique `LevelType`s needed to satisfy the
85+
* requested granularities. `section` and `chapter` both emit at the section
86+
* level — chapter output is synthesized from the section bucket at write
87+
* time (by grouping sections under their chapter ancestor). `part` and
88+
* `title` each emit at their own level.
89+
*/
8190
export interface MultiEcfrConvertOptions extends BaseEcfrConvertOptions {
8291
/** Multiple `{granularity, output}` pairs to produce in a single parse. */
8392
granularities: readonly EcfrGranularityOutput[];
@@ -259,14 +268,18 @@ export async function convertEcfrTitle(
259268
return first;
260269
}
261270

262-
/** Extract title number and name from the first available collected node. */
271+
/**
272+
* Extract title number and name from the first available collected node.
273+
*
274+
* Falls back to `{"0", ""}` when no emitted node has a title ancestor and no
275+
* title-level node was emitted. That path produces `/us/cfr/t0/...` canonical
276+
* identifiers, which is almost always a sign of malformed source XML — we
277+
* warn rather than silently corrupt downstream data.
278+
*/
263279
function extractTitleInfo(collectedByLevel: Map<LevelType, CollectedSection[]>): {
264280
titleNumber: string;
265281
titleName: string;
266282
} {
267-
let titleNumber = "0";
268-
let titleName = "";
269-
270283
// Prefer section emissions (richest ancestor chain), fall back to others.
271284
const probeOrder: LevelType[] = ["section", "part", "chapter", "title"];
272285
for (const lt of probeOrder) {
@@ -275,18 +288,24 @@ function extractTitleInfo(collectedByLevel: Map<LevelType, CollectedSection[]>):
275288
if (!first) continue;
276289
const titleAncestor = first.context.ancestors.find((a) => a.levelType === "title");
277290
if (titleAncestor) {
278-
titleNumber = titleAncestor.numValue ?? "0";
279-
titleName = titleAncestor.heading ?? first.context.documentMeta.dcTitle ?? "";
280-
return { titleNumber, titleName };
291+
return {
292+
titleNumber: titleAncestor.numValue ?? "0",
293+
titleName: titleAncestor.heading ?? first.context.documentMeta.dcTitle ?? "",
294+
};
281295
}
282296
if (first.node.levelType === "title") {
283-
titleNumber = first.node.numValue ?? "0";
284-
titleName = first.node.heading ?? first.context.documentMeta.dcTitle ?? "";
285-
return { titleNumber, titleName };
297+
return {
298+
titleNumber: first.node.numValue ?? "0",
299+
titleName: first.node.heading ?? first.context.documentMeta.dcTitle ?? "",
300+
};
286301
}
287302
}
288303

289-
return { titleNumber, titleName };
304+
console.warn(
305+
"[@lexbuild/ecfr] convertEcfrTitle: could not resolve title number from emitted nodes; " +
306+
"output will use `/us/cfr/t0/...` identifiers. Source XML likely missing a DIV1 TYPE=\"TITLE\".",
307+
);
308+
return { titleNumber: "0", titleName: "" };
290309
}
291310

292311
interface WriteGranularityArgs {
@@ -339,20 +358,32 @@ async function writeGranularity(args: WriteGranularityArgs): Promise<EcfrConvert
339358
{ sections: CollectedSection[]; chapterAncestor: AncestorInfo; firstContext: EmitContext }
340359
>();
341360

361+
let skippedRootless = 0;
342362
for (const item of collected) {
343363
const chapterAnc = item.context.ancestors.find((a) => a.levelType === "chapter");
344-
const chapterKey = chapterAnc?.numValue ?? "__root__";
345-
const existing = chapterMap.get(chapterKey);
364+
if (!chapterAnc?.numValue) {
365+
// Section without a chapter ancestor cannot be placed in a chapter
366+
// file. Rare in eCFR (e.g. parts directly under subtitle with no
367+
// surrounding chapter). Drop rather than synthesize a junk filename.
368+
skippedRootless++;
369+
continue;
370+
}
371+
const existing = chapterMap.get(chapterAnc.numValue);
346372
if (existing) {
347373
existing.sections.push(item);
348374
} else {
349-
chapterMap.set(chapterKey, {
375+
chapterMap.set(chapterAnc.numValue, {
350376
sections: [item],
351-
chapterAncestor: chapterAnc ?? { levelType: "chapter", numValue: chapterKey },
377+
chapterAncestor: chapterAnc,
352378
firstContext: item.context,
353379
});
354380
}
355381
}
382+
if (skippedRootless > 0) {
383+
console.warn(
384+
`[@lexbuild/ecfr] convertEcfrTitle: chapter granularity skipped ${skippedRootless} section(s) with no chapter ancestor`,
385+
);
386+
}
356387

357388
for (const [_chapterKey, { sections, chapterAncestor, firstContext }] of chapterMap) {
358389
const chapterNode: LevelNode = {
@@ -555,11 +586,13 @@ function buildDryRunResult(
555586
let count: number;
556587

557588
if (granularity === "chapter") {
589+
// Mirror the write-phase filter: sections with no chapter ancestor
590+
// would be dropped rather than grouped under a synthetic key.
558591
const chapterKeys = new Set<string>();
559592
for (const { node, context } of collected) {
560593
const chapterAnc = context.ancestors.find((a) => a.levelType === "chapter");
561-
const key = chapterAnc?.numValue ?? "__root__";
562-
chapterKeys.add(key);
594+
if (!chapterAnc?.numValue) continue;
595+
chapterKeys.add(chapterAnc.numValue);
563596
totalEstimate += estimateTokens(node);
564597
}
565598
count = chapterKeys.size;

packages/ecfr/src/ecfr-builder.test.ts

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,4 +257,29 @@ describe("EcfrASTBuilder", () => {
257257
expect(appendixChildren.length).toBe(1);
258258
expect((appendixChildren[0] as LevelNode).numValue).toBe("Appendix A");
259259
});
260+
261+
it("multi-emit {part,title} attaches appendix to part exactly once (stack-based attach rule)", async () => {
262+
// An appendix nested inside a part is the motivating case for
263+
// `hasEmittingAncestorOnStack` — appendix is a "big level" with a lower
264+
// hierarchy index than part, so naive index-based reasoning would drop
265+
// the appendix from the part's children. The live stack check keeps it
266+
// attached once.
267+
const collected = await parseFixture(
268+
"appendix.xml",
269+
new Set<"section" | "part" | "title">(["part", "title"]),
270+
);
271+
272+
const parts = collected.filter((c) => c.node.levelType === "part");
273+
const titles = collected.filter((c) => c.node.levelType === "title");
274+
expect(parts.length).toBe(1);
275+
expect(titles.length).toBe(1);
276+
277+
const partAppendixes = collectLevelsOfType(parts[0]!.node, "appendix");
278+
expect(partAppendixes.length).toBe(1);
279+
expect(partAppendixes[0]!.numValue).toBe("Appendix A");
280+
281+
// The same appendix instance is reachable from the title subtree (via part).
282+
const titleAppendixes = collectLevelsOfType(titles[0]!.node, "appendix");
283+
expect(titleAppendixes).toContain(partAppendixes[0]);
284+
});
260285
});

packages/ecfr/src/ecfr-builder.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,17 @@ export class EcfrASTBuilder {
106106
return typeof at === "string" ? levelType === at : at.has(levelType);
107107
}
108108

109+
/**
110+
* True iff some level frame currently on the stack is itself an emit
111+
* target. Used by `closeLevel` to decide whether the closing node must be
112+
* attached to its parent so a higher emission sees the full subtree.
113+
*
114+
* This is the correct check for USLM's permissive level nesting (e.g. an
115+
* `<appendix>` inside a `<part>`) — reasoning via `LEVEL_TYPES` index
116+
* ordering would drop the appendix because its index is shallower than
117+
* the containing part's. Live stack membership handles anomalous nesting
118+
* correctly.
119+
*/
109120
private hasEmittingAncestorOnStack(): boolean {
110121
for (const f of this.stack) {
111122
if (f.kind === "level" && f.node?.type === "level") {

packages/usc/src/converter.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,9 @@ export interface BaseConvertOptions {
6767

6868
/** Single-granularity mode: one output directory, one granularity. */
6969
export interface SingleConvertOptions extends BaseConvertOptions {
70-
/** Output directory root */
70+
/** Output directory root. Required in single-granularity mode. */
7171
output: string;
72-
/** Output granularity. Defaults to "section" when omitted. */
72+
/** Output granularity. Defaults to `"section"` when omitted. */
7373
granularity?: UscGranularity | undefined;
7474
/** @internal — must not be set in single-granularity mode */
7575
granularities?: undefined;

scripts/update-ecfr.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,10 @@ if [ "$DEPLOY_ONLY" = false ]; then
160160
fi
161161

162162
# Step 2: Download
163+
# $TITLE_ARG is intentionally unquoted so `--titles 1,17` word-splits into
164+
# two CLI args. shellcheck disable=SC2086
163165
echo "--- Step 2/7: Downloading eCFR titles ($TITLE_ARG)"
166+
# shellcheck disable=SC2086
164167
$CLI download-ecfr $TITLE_ARG
165168
echo ""
166169

@@ -170,6 +173,7 @@ if [ "$DEPLOY_ONLY" = false ]; then
170173
# emits section + title + chapter + part from one pass of the XML, writing
171174
# each to its own output dir. writeFileIfChanged preserves mtimes.
172175
echo "--- Step 3/7: Converting eCFR titles ($TITLE_ARG) at all granularities"
176+
# shellcheck disable=SC2086
173177
$CLI convert-ecfr $TITLE_ARG $CURRENCY_ARG \
174178
--granularities section,title,chapter,part \
175179
--output ./output \

scripts/update-usc.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,10 @@ if [ "$DEPLOY_ONLY" = false ]; then
152152
# Step 3: Convert at every granularity in a single parse. --granularities
153153
# emits section + title + chapter from one pass of the XML, writing each
154154
# to its own output dir. writeFileIfChanged preserves mtimes.
155+
# $CLI is intentionally unquoted so its embedded spaces word-split into
156+
# "node ... dist/index.js" args. shellcheck disable=SC2086
155157
echo "--- Step 3/7: Converting USC titles at all granularities"
158+
# shellcheck disable=SC2086
156159
$CLI convert-usc --all \
157160
--granularities section,title,chapter \
158161
--output ./output \

0 commit comments

Comments
 (0)