fix: claude pr review tests and code polish

chris-c-thomas · chris-c-thomas · commit 83c1fc6ea033 · 2026-04-22T18:41:00.000-05:00
diff --git a/packages/core/src/ast/uslm-builder.ts b/packages/core/src/ast/uslm-builder.ts
@@ -149,6 +149,17 @@ export class ASTBuilder {
     return typeof at === "string" ? levelType === at : at.has(levelType);
   }
 
+  /**
+   * True iff some level frame currently on the stack is itself an emit
+   * target. Used by `closeLevel` to decide whether the closing node must be
+   * attached to its parent so a higher emission sees the full subtree.
+   *
+   * This is the correct check for USLM's permissive level nesting (e.g. an
+   * `<appendix>` inside a `<part>`) — reasoning via `LEVEL_TYPES` index
+   * ordering would drop the appendix because its index is shallower than
+   * the containing part's. Live stack membership handles anomalous nesting
+   * correctly.
+   */
   private hasEmittingAncestorOnStack(): boolean {
     for (const f of this.stack) {
       if (f.kind === "level" && f.node?.type === "level") {
diff --git a/packages/ecfr/src/converter.test.ts b/packages/ecfr/src/converter.test.ts
@@ -159,4 +159,37 @@ describe("convertEcfrTitle multi-granularity parity", () => {
     const stats = await stat(singleDirs.section);
     expect(stats.isDirectory()).toBe(true);
   });
+
+  it("dry-run in multi-granularity mode reports counts without writing files", async () => {
+    const results = await convertEcfrTitle({
+      ...BASE,
+      input: resolve(FIXTURES_DIR, "title-structure.xml"),
+      dryRun: true,
+      granularities: [
+        { granularity: "section", output: multiDirs.section },
+        { granularity: "part", output: multiDirs.part },
+        { granularity: "title", output: multiDirs.title },
+      ],
+    });
+
+    expect(results).toHaveLength(3);
+    for (const r of results) {
+      expect(r.dryRun).toBe(true);
+      expect(r.files).toEqual([]);
+    }
+
+    const byGranularity = Object.fromEntries(results.map((r) => [r.granularity, r]));
+    // title-structure fixture has 3 sections, 2 parts, 1 title
+    expect(byGranularity.section!.sectionsWritten).toBe(3);
+    expect(byGranularity.part!.sectionsWritten).toBe(2);
+    expect(byGranularity.title!.sectionsWritten).toBe(1);
+
+    // Verify no files were actually written to the multi output dirs.
+    const sectionFiles = await listFiles(multiDirs.section);
+    const partFiles = await listFiles(multiDirs.part);
+    const titleFiles = await listFiles(multiDirs.title);
+    expect(sectionFiles).toEqual([]);
+    expect(partFiles).toEqual([]);
+    expect(titleFiles).toEqual([]);
+  });
 });
diff --git a/packages/ecfr/src/converter.ts b/packages/ecfr/src/converter.ts
@@ -69,15 +69,24 @@ export interface BaseEcfrConvertOptions {
 
 /** Single-granularity mode: one output directory, one granularity. */
 export interface SingleEcfrConvertOptions extends BaseEcfrConvertOptions {
-  /** Output root directory */
+  /** Output root directory. Required in single-granularity mode. */
   output: string;
-  /** Output granularity. Defaults to "section" when omitted. */
+  /** Output granularity. Defaults to `"section"` when omitted. */
   granularity?: EcfrGranularity | undefined;
   /** @internal — must not be set in single-granularity mode */
   granularities?: undefined;
 }
 
-/** Multi-granularity mode: a set of `{granularity, output}` pairs emitted from one parse. */
+/**
+ * Multi-granularity mode: a set of `{granularity, output}` pairs emitted from
+ * one parse.
+ *
+ * The builder emits at the set of unique `LevelType`s needed to satisfy the
+ * requested granularities. `section` and `chapter` both emit at the section
+ * level — chapter output is synthesized from the section bucket at write
+ * time (by grouping sections under their chapter ancestor). `part` and
+ * `title` each emit at their own level.
+ */
 export interface MultiEcfrConvertOptions extends BaseEcfrConvertOptions {
   /** Multiple `{granularity, output}` pairs to produce in a single parse. */
   granularities: readonly EcfrGranularityOutput[];
@@ -259,14 +268,18 @@ export async function convertEcfrTitle(
   return first;
 }
 
-/** Extract title number and name from the first available collected node. */
+/**
+ * Extract title number and name from the first available collected node.
+ *
+ * Falls back to `{ titleNumber: "0", titleName: "" }` when no emitted node
+ * has a title ancestor and no title-level node was emitted. That path yields
+ * `/us/cfr/t0/...` canonical identifiers, almost always a sign of malformed
+ * source XML — we log a warning so the bad identifiers are not silent.
+ */
 function extractTitleInfo(collectedByLevel: Map<LevelType, CollectedSection[]>): {
   titleNumber: string;
   titleName: string;
 } {
-  let titleNumber = "0";
-  let titleName = "";
-
   // Prefer section emissions (richest ancestor chain), fall back to others.
   const probeOrder: LevelType[] = ["section", "part", "chapter", "title"];
   for (const lt of probeOrder) {
@@ -275,18 +288,24 @@ function extractTitleInfo(collectedByLevel: Map<LevelType, CollectedSection[]>):
     if (!first) continue;
     const titleAncestor = first.context.ancestors.find((a) => a.levelType === "title");
     if (titleAncestor) {
-      titleNumber = titleAncestor.numValue ?? "0";
-      titleName = titleAncestor.heading ?? first.context.documentMeta.dcTitle ?? "";
-      return { titleNumber, titleName };
+      return {
+        titleNumber: titleAncestor.numValue ?? "0",
+        titleName: titleAncestor.heading ?? first.context.documentMeta.dcTitle ?? "",
+      };
     }
     if (first.node.levelType === "title") {
-      titleNumber = first.node.numValue ?? "0";
-      titleName = first.node.heading ?? first.context.documentMeta.dcTitle ?? "";
-      return { titleNumber, titleName };
+      return {
+        titleNumber: first.node.numValue ?? "0",
+        titleName: first.node.heading ?? first.context.documentMeta.dcTitle ?? "",
+      };
     }
   }
 
-  return { titleNumber, titleName };
+  console.warn(
+    "[@lexbuild/ecfr] convertEcfrTitle: could not resolve title number from emitted nodes; " +
+      "output will use `/us/cfr/t0/...` identifiers. Source XML likely missing a DIV1 TYPE=\"TITLE\".",
+  );
+  return { titleNumber: "0", titleName: "" };
 }
 
 interface WriteGranularityArgs {
@@ -339,20 +358,32 @@ async function writeGranularity(args: WriteGranularityArgs): Promise<EcfrConvert
       { sections: CollectedSection[]; chapterAncestor: AncestorInfo; firstContext: EmitContext }
     >();
 
+    let skippedRootless = 0;
     for (const item of collected) {
       const chapterAnc = item.context.ancestors.find((a) => a.levelType === "chapter");
-      const chapterKey = chapterAnc?.numValue ?? "__root__";
-      const existing = chapterMap.get(chapterKey);
+      if (!chapterAnc?.numValue) {
+        // A section with no chapter ancestor (or one lacking a number) cannot
+        // be placed in a chapter file. Rare in eCFR (e.g. parts directly under
+        // a subtitle). Drop rather than synthesize a junk filename.
+        skippedRootless++;
+        continue;
+      }
+      const existing = chapterMap.get(chapterAnc.numValue);
       if (existing) {
         existing.sections.push(item);
       } else {
-        chapterMap.set(chapterKey, {
+        chapterMap.set(chapterAnc.numValue, {
           sections: [item],
-          chapterAncestor: chapterAnc ?? { levelType: "chapter", numValue: chapterKey },
+          chapterAncestor: chapterAnc,
           firstContext: item.context,
         });
       }
     }
+    if (skippedRootless > 0) {
+      console.warn(
+        `[@lexbuild/ecfr] convertEcfrTitle: chapter granularity skipped ${skippedRootless} section(s) with no chapter ancestor`,
+      );
+    }
 
     for (const [_chapterKey, { sections, chapterAncestor, firstContext }] of chapterMap) {
       const chapterNode: LevelNode = {
@@ -555,11 +586,13 @@ function buildDryRunResult(
   let count: number;
 
   if (granularity === "chapter") {
+    // Mirror the write-phase filter: sections with no chapter ancestor
+    // would be dropped rather than grouped under a synthetic key.
     const chapterKeys = new Set<string>();
     for (const { node, context } of collected) {
       const chapterAnc = context.ancestors.find((a) => a.levelType === "chapter");
-      const key = chapterAnc?.numValue ?? "__root__";
-      chapterKeys.add(key);
+      if (!chapterAnc?.numValue) continue;
+      chapterKeys.add(chapterAnc.numValue);
       totalEstimate += estimateTokens(node);
     }
     count = chapterKeys.size;
diff --git a/packages/ecfr/src/ecfr-builder.test.ts b/packages/ecfr/src/ecfr-builder.test.ts
@@ -257,4 +257,29 @@ describe("EcfrASTBuilder", () => {
     expect(appendixChildren.length).toBe(1);
     expect((appendixChildren[0] as LevelNode).numValue).toBe("Appendix A");
   });
+
+  it("multi-emit {part,title} attaches appendix to part exactly once (stack-based attach rule)", async () => {
+    // An appendix nested inside a part is the motivating case for
+    // `hasEmittingAncestorOnStack` — appendix is a "big level" with a lower
+    // hierarchy index than part, so naive index-based reasoning would drop
+    // the appendix from the part's children. The live stack check keeps it
+    // attached once.
+    const collected = await parseFixture(
+      "appendix.xml",
+      new Set<"section" | "part" | "title">(["part", "title"]),
+    );
+
+    const parts = collected.filter((c) => c.node.levelType === "part");
+    const titles = collected.filter((c) => c.node.levelType === "title");
+    expect(parts.length).toBe(1);
+    expect(titles.length).toBe(1);
+
+    const partAppendixes = collectLevelsOfType(parts[0]!.node, "appendix");
+    expect(partAppendixes.length).toBe(1);
+    expect(partAppendixes[0]!.numValue).toBe("Appendix A");
+
+    // The same appendix instance is reachable from the title subtree (via part).
+    const titleAppendixes = collectLevelsOfType(titles[0]!.node, "appendix");
+    expect(titleAppendixes).toContain(partAppendixes[0]);
+  });
 });
diff --git a/packages/ecfr/src/ecfr-builder.ts b/packages/ecfr/src/ecfr-builder.ts
@@ -106,6 +106,17 @@ export class EcfrASTBuilder {
     return typeof at === "string" ? levelType === at : at.has(levelType);
   }
 
+  /**
+   * True iff some level frame currently on the stack is itself an emit
+   * target. Used by `closeLevel` to decide whether the closing node must be
+   * attached to its parent so a higher emission sees the full subtree.
+   *
+   * This is the correct check for eCFR's permissive level nesting (e.g. an
+   * appendix inside a part) — reasoning via `LEVEL_TYPES` index
+   * ordering would drop the appendix because its index is shallower than
+   * the containing part's. Live stack membership handles anomalous nesting
+   * correctly.
+   */
   private hasEmittingAncestorOnStack(): boolean {
     for (const f of this.stack) {
       if (f.kind === "level" && f.node?.type === "level") {
diff --git a/packages/usc/src/converter.ts b/packages/usc/src/converter.ts
@@ -67,9 +67,9 @@ export interface BaseConvertOptions {
 
 /** Single-granularity mode: one output directory, one granularity. */
 export interface SingleConvertOptions extends BaseConvertOptions {
-  /** Output directory root */
+  /** Output directory root. Required in single-granularity mode. */
   output: string;
-  /** Output granularity. Defaults to "section" when omitted. */
+  /** Output granularity. Defaults to `"section"` when omitted. */
   granularity?: UscGranularity | undefined;
   /** @internal — must not be set in single-granularity mode */
   granularities?: undefined;
diff --git a/scripts/update-ecfr.sh b/scripts/update-ecfr.sh
@@ -160,7 +160,10 @@ if [ "$DEPLOY_ONLY" = false ]; then
   fi
 
   # Step 2: Download
+  # $TITLE_ARG is intentionally unquoted so `--titles 1,17` word-splits into
+  # two CLI args; SC2086 is suppressed on the command below for that reason.
   echo "--- Step 2/7: Downloading eCFR titles ($TITLE_ARG)"
+  # shellcheck disable=SC2086
   $CLI download-ecfr $TITLE_ARG
   echo ""
 
@@ -170,6 +173,7 @@ if [ "$DEPLOY_ONLY" = false ]; then
   # emits section + title + chapter + part from one pass of the XML, writing
   # each to its own output dir. writeFileIfChanged preserves mtimes.
   echo "--- Step 3/7: Converting eCFR titles ($TITLE_ARG) at all granularities"
+  # shellcheck disable=SC2086
   $CLI convert-ecfr $TITLE_ARG $CURRENCY_ARG \
     --granularities section,title,chapter,part \
     --output ./output \
diff --git a/scripts/update-usc.sh b/scripts/update-usc.sh
@@ -152,7 +152,10 @@ if [ "$DEPLOY_ONLY" = false ]; then
   # Step 3: Convert at every granularity in a single parse. --granularities
   # emits section + title + chapter from one pass of the XML, writing each
   # to its own output dir. writeFileIfChanged preserves mtimes.
+  # $CLI is intentionally unquoted so its embedded spaces word-split into
+  # "node ... dist/index.js" args; SC2086 is suppressed on the command below.
   echo "--- Step 3/7: Converting USC titles at all granularities"
+  # shellcheck disable=SC2086
   $CLI convert-usc --all \
     --granularities section,title,chapter \
     --output ./output \