feat(#23): kb ingest accepts multiple ids and prints a batch summary (#27)

Vigtu · web-flow · commit 50c5786f857c · 2026-05-22T08:06:03.000-03:00
* feat(#23): kb ingest accepts multiple ids and prints a batch summary

  paper7 kb ingest used to take exactly one identifier. Running 10-15 ingests in a research session meant launching the command once per paper, then diffing paper7 kb list against the expected id list to find the failures, then retrying by hand.

  The argument is now variadic via Argument.variadic({ min: 1 }). Single-id behaviour is preserved exactly — the paper's markdown still streams to stdout, so existing pipes keep working. With two or more ids, a new runKbIngestBatch path takes over: it ingests serially (arxiv enforces a ~3s rate limit and S2 caps at ~1 req/s on the unauth tier; concurrency buys 429s rather than throughput), and prints one summary block:

      Ingested: N/M papers to <sources-dir>
      Failed:
        <id> — <reason>

  Parse failures, network errors, and cache errors all land in the Failed: list with a per-id reason. The batch exits 0 as long as at least one paper landed; if every id failed the new KbIngestBatchFailed error fires and the process exits 1 with 'error: all kb ingests failed' on stderr while the summary still goes to stdout.

  The renderer is intentionally terse — soft fallbacks from PR3 (ar5iv → abstract-only) print their own warnings via Effect.logWarning during ingest and count toward Ingested:, so the summary just reports the final tally.

  Closes #23.

* feat(#23): preserve effect boundaries in kb ingest batch

  Pull rendering and the final fail-decision out of src/kb.ts so the domain module returns data and the CLI adapter decides how to present it. runKbIngestBatch now returns KbIngestBatchResult (attempts + sourcesDir); src/commands/kb.ts logs the summary, formats per-id errors, and raises KbIngestBatchFailed when every id failed.

  Narrow the per-id error boundary with Effect.catchTags. Only the four external fetch failures (GetArxivError, GetAr5ivError, GetPubmedError, GetCrossrefError) are converted to per-id Failed entries; KbIoError and the rest of GetError stay in the typed error channel so a wiki write failure / disk-full / permission problem still fails the whole batch loudly instead of being silently reported as a skipped paper.

  BatchAttempt now carries the typed BatchIngestError payload (new KbInvalidIdentifier tag covers unparseable raw ids), and the CLI renderer is the only place that stringifies it. KbIngestBatchFailed is raised with bare 'yield* new KbIngestBatchFailed(...)' per repo convention.
diff --git a/src/cli.ts b/src/cli.ts
@@ -305,11 +305,18 @@ export const makeRootCommand = (loaders?: Partial<CommandLoaders>) => {
   )
 
   const kbIngestCommand = Command.make("ingest", {
-    id: Argument.string("id").pipe(Argument.withDescription("Paper identifier"))
-  }, (config) =>
-    parseIdentifierEffect("kb ingest", config.id).pipe(
-      Effect.flatMap((id) => runCommand({ tag: "kb-ingest", id }))
-    )).pipe(Command.withShortDescription("Fetch paper into wiki sources"))
+    ids: Argument.string("id").pipe(
+      Argument.withDescription("One or more paper identifiers"),
+      Argument.variadic({ min: 1 })
+    )
+  }, (config) => {
+    if (config.ids.length === 1) {
+      return parseIdentifierEffect("kb ingest", config.ids[0]).pipe(
+        Effect.flatMap((id) => runCommand({ tag: "kb-ingest", id }))
+      )
+    }
+    return runCommand({ tag: "kb-ingest-batch", rawIds: config.ids })
+  }).pipe(Command.withShortDescription("Fetch one or more papers into wiki sources"))
 
   const kbReadCommand = Command.make("read", {
     slug: Argument.string("slug").pipe(Argument.withDescription("Page slug, index, or log"))
@@ -476,6 +483,7 @@ function makeRunCommand(loaders: CommandLoaders) {
           })
         )
       case "kb-ingest":
+      case "kb-ingest-batch":
       case "kb-read":
       case "kb-write":
       case "kb-search":
@@ -506,6 +514,7 @@ const isGetError = (error: KbError | GetError): error is GetError => {
     case "KbIoError":
     case "KbInvalidSlug":
     case "KbGetError":
+    case "KbIngestBatchFailed":
       return false
   }
 }
@@ -518,6 +527,8 @@ const formatKbError = (error: KbError): string => {
       return `error: invalid wiki slug: ${error.slug}`
     case "KbGetError":
       return formatGetError(error.error)
+    case "KbIngestBatchFailed":
+      return `error: ${error.message}`
   }
 }
 
diff --git a/src/commands/kb.ts b/src/commands/kb.ts
@@ -1,9 +1,55 @@
 import { Console, Effect } from "effect"
-import { runKb, type KbEnvironment, type KbError } from "../kb.js"
 import type { GetError } from "../get.js"
+import {
+  type BatchAttempt,
+  type BatchIngestError,
+  type KbEnvironment,
+  type KbError,
+  KbIngestBatchFailed,
+  type KbIngestBatchResult,
+  runKb,
+  runKbIngestBatch,
+} from "../kb.js"
 import type { CliCommand } from "../parser.js"
 
 export const runKbCommand = (
   command: Extract<CliCommand, { readonly tag: `kb-${string}` }>
-): Effect.Effect<void, KbError | GetError, KbEnvironment> =>
-  runKb(command).pipe(Effect.flatMap((output) => Console.log(output)))
+): Effect.Effect<void, KbError | GetError, KbEnvironment> => {
+  if (command.tag === "kb-ingest-batch") {
+    return Effect.gen(function*() {
+      const result = yield* runKbIngestBatch(command.rawIds)
+      yield* Console.log(renderBatchSummary(result))
+      const ingested = result.attempts.filter((attempt) => attempt._tag === "Ingested").length
+      if (ingested === 0 && result.attempts.length > 0) {
+        return yield* new KbIngestBatchFailed({ message: "all kb ingests failed" })
+      }
+    })
+  }
+  return runKb(command).pipe(Effect.flatMap((output) => Console.log(output)))
+}
+
+const renderBatchSummary = (result: KbIngestBatchResult): string => {
+  const ingested = result.attempts.filter((attempt) => attempt._tag === "Ingested").length
+  const header = `Ingested: ${ingested}/${result.attempts.length} papers to ${result.sourcesDir}`
+  const failed = result.attempts.filter(
+    (attempt): attempt is Extract<BatchAttempt, { readonly _tag: "Failed" }> => attempt._tag === "Failed"
+  )
+  if (failed.length === 0) return header
+  return [
+    header,
+    "Failed:",
+    ...failed.map((attempt) => `  ${attempt.raw} — ${formatBatchError(attempt.error)}`),
+  ].join("\n")
+}
+
+const formatBatchError = (error: BatchIngestError): string => {
+  switch (error._tag) {
+    case "KbInvalidIdentifier":
+      return "invalid identifier"
+    case "GetArxivError":
+    case "GetAr5ivError":
+    case "GetPubmedError":
+    case "GetCrossrefError":
+      return error.error.message
+  }
+}
diff --git a/src/kb.ts b/src/kb.ts
@@ -4,8 +4,9 @@ import { join } from "node:path"
 import type { Ar5ivClient } from "./ar5iv.js"
 import type { ArxivClient } from "./arxiv.js"
 import type { CrossrefClient } from "./crossref.js"
-import { getPaper, type GetError } from "./get.js"
+import { type GetAr5ivError, type GetArxivError, type GetCrossrefError, type GetError, getPaper, type GetPubmedError } from "./get.js"
 import type { CliCommand, PaperIdentifier } from "./parser.js"
+import { parsePaperIdentifier } from "./parser.js"
 import type { PubmedClient } from "./pubmed.js"
 import type { SemanticScholarClient } from "./semanticScholar.js"
 
@@ -22,7 +23,31 @@ export class KbGetError extends Data.TaggedError("KbGetError")<{
   readonly error: GetError
 }> {}
 
-export type KbError = KbIoError | KbInvalidSlug | KbGetError
+export class KbInvalidIdentifier extends Data.TaggedError("KbInvalidIdentifier")<{
+  readonly raw: string
+}> {}
+
+export class KbIngestBatchFailed extends Data.TaggedError("KbIngestBatchFailed")<{
+  readonly message: string
+}> {}
+
+export type KbError = KbIoError | KbInvalidSlug | KbGetError | KbIngestBatchFailed
+
+export type BatchIngestError =
+  | KbInvalidIdentifier
+  | GetArxivError
+  | GetAr5ivError
+  | GetPubmedError
+  | GetCrossrefError
+
+export type BatchAttempt =
+  | { readonly _tag: "Ingested"; readonly raw: string }
+  | { readonly _tag: "Failed"; readonly raw: string; readonly error: BatchIngestError }
+
+export type KbIngestBatchResult = {
+  readonly attempts: ReadonlyArray<BatchAttempt>
+  readonly sourcesDir: string
+}
 
 type WikiPaths = {
   readonly root: string
@@ -34,7 +59,7 @@ type WikiPaths = {
 
 export type KbEnvironment = Ar5ivClient | ArxivClient | CrossrefClient | PubmedClient | SemanticScholarClient
 
-export const runKb = (command: Extract<CliCommand, { readonly tag: `kb-${string}` }>): Effect.Effect<string, KbError | GetError, KbEnvironment> => {
+export const runKb = (command: Exclude<Extract<CliCommand, { readonly tag: `kb-${string}` }>, { readonly tag: "kb-ingest-batch" }>): Effect.Effect<string, KbError | GetError, KbEnvironment> => {
   switch (command.tag) {
     case "kb-ingest":
       return ingest(command.id)
@@ -51,6 +76,30 @@ export const runKb = (command: Extract<CliCommand, { readonly tag: `kb-${string}
   }
 }
 
+export const runKbIngestBatch = (rawIds: ReadonlyArray<string>): Effect.Effect<KbIngestBatchResult, KbError | GetError, KbEnvironment> =>
+  Effect.gen(function*() {
+    const paths = wikiPaths()
+    yield* ensureWiki(paths)
+    const attempts = yield* Effect.forEach(rawIds, ingestOneForBatch)
+    return { attempts, sourcesDir: paths.sources }
+  })
+
+const ingestOneForBatch = (raw: string): Effect.Effect<BatchAttempt, KbError | GetError, KbEnvironment> => {
+  const id = parsePaperIdentifier(raw)
+  if (id === undefined) {
+    return Effect.succeed<BatchAttempt>({ _tag: "Failed", raw, error: new KbInvalidIdentifier({ raw }) })
+  }
+  return ingest(id).pipe(
+    Effect.map((): BatchAttempt => ({ _tag: "Ingested", raw })),
+    Effect.catchTags({
+      GetArxivError: (error) => Effect.succeed<BatchAttempt>({ _tag: "Failed", raw, error }),
+      GetAr5ivError: (error) => Effect.succeed<BatchAttempt>({ _tag: "Failed", raw, error }),
+      GetPubmedError: (error) => Effect.succeed<BatchAttempt>({ _tag: "Failed", raw, error }),
+      GetCrossrefError: (error) => Effect.succeed<BatchAttempt>({ _tag: "Failed", raw, error }),
+    }),
+  )
+}
+
 const ingest = (id: PaperIdentifier): Effect.Effect<string, KbError | GetError, KbEnvironment> => {
   const paths = wikiPaths()
   const sourceName = sourceFileName(id)
diff --git a/src/parser.ts b/src/parser.ts
@@ -50,6 +50,7 @@ export type CliCommand =
   | { readonly tag: "vault-all" }
   | { readonly tag: "browse" }
   | { readonly tag: "kb-ingest"; readonly id: PaperIdentifier }
+  | { readonly tag: "kb-ingest-batch"; readonly rawIds: ReadonlyArray<string> }
   | { readonly tag: "kb-read"; readonly slug: string }
   | { readonly tag: "kb-write"; readonly slug: string }
   | { readonly tag: "kb-search"; readonly pattern: string }
diff --git a/tests/kb.test.ts b/tests/kb.test.ts
@@ -96,4 +96,42 @@ describe("kb command", () => {
       expect(status.stdout).toContain("Pages: 1")
       expect(ingest.stdout).toContain("Attention Is All You Need")
     })))
+
+  it.effect("kb ingest with multiple ids prints a summary instead of paper markdown (issue #23)", () =>
+    withTempHome(Effect.gen(function*() {
+      const home = process.env.HOME ?? ""
+      const sources = join(home, ".paper7", "wiki", "sources")
+      const result = yield* run(["kb", "ingest", "1706.03762", "2401.04088"])
+
+      expect(result.exit._tag).toBe("Success")
+      expect(result.stderr).toBe("")
+      expect(result.stdout).toBe(`Ingested: 2/2 papers to ${sources}`)
+      expect(result.stdout).not.toContain("Attention Is All You Need")
+    })))
+
+  it.effect("kb ingest batch reports invalid identifiers in the Failed section (issue #23)", () =>
+    withTempHome(Effect.gen(function*() {
+      const home = process.env.HOME ?? ""
+      const sources = join(home, ".paper7", "wiki", "sources")
+      const result = yield* run(["kb", "ingest", "1706.03762", "bogus.id"])
+
+      expect(result.exit._tag).toBe("Success")
+      expect(result.stderr).toBe("")
+      expect(result.stdout).toContain(`Ingested: 1/2 papers to ${sources}`)
+      expect(result.stdout).toContain("Failed:")
+      expect(result.stdout).toContain("bogus.id — invalid identifier")
+    })))
+
+  it.effect("kb ingest batch exits 1 when every id fails (issue #23)", () =>
+    withTempHome(Effect.gen(function*() {
+      const home = process.env.HOME ?? ""
+      const sources = join(home, ".paper7", "wiki", "sources")
+      const result = yield* run(["kb", "ingest", "bogus.one", "bogus.two"])
+
+      expect(result.exit._tag).toBe("Failure")
+      expect(result.stdout).toContain(`Ingested: 0/2 papers to ${sources}`)
+      expect(result.stdout).toContain("bogus.one — invalid identifier")
+      expect(result.stdout).toContain("bogus.two — invalid identifier")
+      expect(result.stderr).toBe("error: all kb ingests failed")
+    })))
 })