diff --git a/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md b/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md index d973db0ad4b0..7fa5fafb809b 100644 --- a/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md +++ b/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md @@ -1,5 +1,15 @@ # Release History +## 1.2.0-beta.2 (Unreleased) + +### Bugs Fixed + +- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter. + +### Other Changes + +- Updated `toLlmInput` page markers from `` to `` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers. + ## 1.2.0-beta.1 (2026-04-30) ### Features Added diff --git a/sdk/contentunderstanding/ai-content-understanding/README.md b/sdk/contentunderstanding/ai-content-understanding/README.md index b4d3334b727c..b47a0eebaa51 100644 --- a/sdk/contentunderstanding/ai-content-understanding/README.md +++ b/sdk/contentunderstanding/ai-content-understanding/README.md @@ -364,7 +364,7 @@ console.log(text); // figure illustrating monthly values, and describes the AI Document // Intelligence service... // --- -// +// // # ==This is title== // ## 1. Text // [Latin](https://en.wikipedia.org/wiki/Latin) refers to an ancient Italic language... diff --git a/sdk/contentunderstanding/ai-content-understanding/package.json b/sdk/contentunderstanding/ai-content-understanding/package.json index a34d73a95555..8687c7b575b9 100644 --- a/sdk/contentunderstanding/ai-content-understanding/package.json +++ b/sdk/contentunderstanding/ai-content-understanding/package.json @@ -1,6 +1,6 @@ { "name": "@azure/ai-content-understanding", - "version": "1.2.0-beta.1", + "version": "1.2.0-beta.2", "description": "Azure Content Understanding Rest Client", "engines": { "node": ">=20.0.0" diff --git a/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts b/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts index 2a16c03b2fe0..8120156016ea 100644 --- a/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts +++ b/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts @@ -114,7 +114,7 @@ export async function main(): Promise { // Analyze specific pages using contentRange. // Page markers in the output will use the original document page numbers, // so even though we only requested pages 2-3 and 5, the markers will say - // , , (not 1, 2, 3). + // , , (not 1, 2, 3). console.log("Analyzing pages 2-3 and 5 of a multi-page PDF..."); console.log(` URL: ${multiPageUrl}`); console.log(" contentRange: '2-3,5'\n"); diff --git a/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/javascript/toLlmInput.js b/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/javascript/toLlmInput.js index bfe89512e678..d8c993bbc82d 100644 --- a/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/javascript/toLlmInput.js +++ b/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/javascript/toLlmInput.js @@ -112,7 +112,7 @@ async function main() { // Analyze specific pages using contentRange. // Page markers in the output will use the original document page numbers, // so even though we only requested pages 2-3 and 5, the markers will say - // , , (not 1, 2, 3). + // , , (not 1, 2, 3). console.log("Analyzing pages 2-3 and 5 of a multi-page PDF..."); console.log(` URL: ${multiPageUrl}`); console.log(" contentRange: '2-3,5'\n"); diff --git a/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts b/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts index a73f9a65c5a7..fe651484ac2b 100644 --- a/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts +++ b/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts @@ -112,7 +112,7 @@ export async function main(): Promise { // Analyze specific pages using contentRange. // Page markers in the output will use the original document page numbers, // so even though we only requested pages 2-3 and 5, the markers will say - // , , (not 1, 2, 3). + // , , (not 1, 2, 3). console.log("Analyzing pages 2-3 and 5 of a multi-page PDF..."); console.log(` URL: ${multiPageUrl}`); console.log(" contentRange: '2-3,5'\n"); diff --git a/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts b/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts index 2a0d89ba2286..2e2127a69ac1 100644 --- a/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts +++ b/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts @@ -53,6 +53,41 @@ const RESERVED_METADATA_KEYS: ReadonlySet = new Set([ "rai_warnings", ]); +/** + * Marker emitted by {@link toLlmInput} at each page boundary. Future + * Content Understanding service versions emit this same marker directly in + * the returned markdown (per ContentUnderstanding-Docs#249). When the helper + * sees any occurrence of this prefix in the input markdown it treats the + * service as having already paginated the content and skips its own + * injection to avoid duplicate markers. + * + * @internal + */ +const INPUT_PAGE_MARKER_PREFIX = "`) inserted at page boundaries so downstream consumers - * can locate content by page number. + * (``) inserted at page boundaries so downstream + * consumers can locate content by page number. When the service-provided + * markdown already contains `\n*/g; function addPageMarkers(content: DocumentContent, markdown: string): string { + // If the service already inserted InputPageNumber markers (per + // ContentUnderstanding-Docs#249) pass the markdown through unchanged + // to avoid emitting duplicate markers. + if (hasInputPageMarker(markdown)) { + return markdown; + } if (content.pages && content.pages.length > 0) { const fromSpans = pageMarkersFromSpans(markdown, content.pages); if (fromSpans !== markdown) { @@ -462,7 +505,7 @@ function pageMarkersFromSpans(markdown: string, pages: DocumentPage[]): string { if (adj > prev) { parts.push(cleaned.substring(prev, adj)); } - parts.push(`\n\n`); + parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${marker.pageNumber} -->\n\n`); prev = adj; } if (prev < cleaned.length) { @@ -479,7 +522,7 @@ function pageMarkersFromBreaks(markdown: string, content: DocumentContent): stri for (let i = 0; i < chunks.length; i++) { const text = chunks[i].trim(); if (text) { - parts.push(`\n\n${text}`); + parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${startPage + i} -->\n\n${text}`); } } return parts.join("\n\n"); @@ -566,12 +609,20 @@ function formatWarnings(warnings: ErrorModel[]): Record[] { if (!w) { continue; } + const message = w.message; + // Skip internal service telemetry strings (e.g. `LLMStats: ...`) that + // occasionally leak into the warnings collection. These are not + // Responsible-AI warnings and would otherwise be rendered into the + // LLM-facing `rai_warnings` block. + if (message && isTelemetryMessage(message)) { + continue; + } const entry: Record = {}; if (w.code) { entry.code = w.code; } - if (w.message) { - entry.message = w.message; + if (message) { + entry.message = message; } if (w.target) { entry.target = w.target; @@ -583,6 +634,16 @@ function formatWarnings(warnings: ErrorModel[]): Record[] { return items; } +function isTelemetryMessage(message: string): boolean { + const trimmed = message.replace(/^[\s]+/, ""); + for (const prefix of TELEMETRY_MESSAGE_PREFIXES) { + if (trimmed.startsWith(prefix)) { + return true; + } + } + return false; +} + // --------------------------------------------------------------------------- // Minimal YAML serializer (no external dependency) // --------------------------------------------------------------------------- diff --git a/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts b/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts index 0667f86a27db..9416e03a7e58 100644 --- a/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts +++ b/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts @@ -211,7 +211,7 @@ describe("toLlmInput - pages", () => { assert.equal(_compressPageNumbers([1]), 1); }); - it("renders markers from pages[].spans", () => { + it("renders markers from pages[].spans", () => { const markdown = "Page1Content\nPage2Content"; const pages: DocumentPage[] = [ { pageNumber: 1, spans: [{ offset: 0, length: 13 }] }, @@ -220,23 +220,99 @@ describe("toLlmInput - pages", () => { const text = toLlmInput( makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]), ); - assert.include(text, ""); - assert.include(text, ""); + assert.include(text, ""); + assert.include(text, ""); assert.include(text, "pages: 1-2"); }); + it("does not inject duplicate markers when service markdown already has them", () => { + const markdown = + "\n\nFirst page text.\n\n\n\nSecond page text."; + const pages: DocumentPage[] = [ + { pageNumber: 1, spans: [{ offset: 0, length: 47 }] }, + { pageNumber: 2, spans: [{ offset: 49, length: 48 }] }, + ]; + const text = toLlmInput( + makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]), + ); + const count1 = text.split("").length - 1; + const count2 = text.split("").length - 1; + assert.equal(count1, 1); + assert.equal(count2, 1); + }); + it("falls back to PageBreak splitting using startPageNumber", () => { const markdown = "First page text\n\nSecond page text"; const text = toLlmInput( makeResult([makeDocument({ markdown, startPageNumber: 3, endPageNumber: 4 })]), ); - assert.include(text, ""); - assert.include(text, ""); + assert.include(text, ""); + assert.include(text, ""); assert.include(text, "First page text"); assert.include(text, "Second page text"); }); }); +// --------------------------------------------------------------------------- +// rai_warnings (LLMStats telemetry filter) +// --------------------------------------------------------------------------- + +describe("toLlmInput - rai_warnings filter", () => { + it("drops LLMStats: telemetry warnings but keeps real warnings", () => { + const text = toLlmInput( + makeResult([makeDocument()], [ + { code: "Telemetry", message: "LLMStats: completion calls: 2; embedding calls: 1" }, + { code: "ContentWarning", message: "Potentially sensitive content." }, + ]), + ); + assert.include(text, "rai_warnings:"); + assert.notInclude(text, "LLMStats:"); + assert.include(text, "Potentially sensitive content."); + }); + + it("omits the rai_warnings block when only LLMStats: warnings exist", () => { + const text = toLlmInput( + makeResult([makeDocument()], [ + { code: "Telemetry", message: "LLMStats: completion latency: 7.71s" }, + ]), + ); + assert.notInclude(text, "rai_warnings:"); + assert.notInclude(text, "LLMStats:"); + }); + + it("is case-sensitive (lowercase llmstats: is preserved)", () => { + const text = toLlmInput( + makeResult([makeDocument()], [ + { code: "ContentWarning", message: "llmstats: keep as a real warning" }, + ]), + ); + assert.include(text, "rai_warnings:"); + assert.include(text, "llmstats: keep as a real warning"); + }); + + it("preserves LLMStats: text in the document markdown body", () => { + const bodyText = "A log excerpt:\n- LLMStats: keep this body text"; + const text = toLlmInput( + makeResult([makeDocument({ markdown: bodyText })], [ + { code: "Telemetry", message: "LLMStats: remove this warning text" }, + ]), + ); + assert.notInclude(text, "rai_warnings:"); + assert.include(text, "LLMStats: keep this body text"); + assert.notInclude(text, "LLMStats: remove this warning text"); + }); + + it("filters LLMStats: warnings with leading whitespace", () => { + const text = toLlmInput( + makeResult([makeDocument()], [ + { code: "Telemetry", message: " LLMStats: completion calls: 2" }, + ]), + ); + assert.notInclude(text, "rai_warnings:"); + assert.notInclude(text, "LLMStats:"); + }); +}); + // --------------------------------------------------------------------------- // Audio / video segments // --------------------------------------------------------------------------- diff --git a/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts b/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts index ebc844477fe1..96feeee335c4 100644 --- a/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts +++ b/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts @@ -158,18 +158,22 @@ describe("Sample: toLlmInput", () => { ); // Page markers in the markdown body should use the original page numbers - // (, , ), not renumbered (1, 2, 3). - assert.ok( - !text.includes(""), - "Page marker '' should not appear — we only requested pages 2-3, 5", + // Page markers in the markdown body should use the original page numbers + // (, , ), + // not renumbered (1, 2, 3). + assert( + !text.includes(""), + "Page marker '' should not appear \u2014 we only requested pages 2-3, 5", ); for (const expectedPage of [2, 3, 5]) { - assert.ok( - text.includes(``), - `Page marker '' should appear in the markdown body. Output:\n${text.slice(0, 800)}`, + assert( + text.includes(``), + `Page marker '' should appear in the markdown body. Output:\n${text.slice(0, 800)}`, ); } - console.log("[PASS] Page markers verified: , , "); + console.log( + "[PASS] Page markers verified: , , ", + ); console.log( `[PASS] toLlmInput output validated (${text.length} chars, pages='2-3, 5' preserved)`,