Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Release History

## 1.2.0-beta.2 (Unreleased)

### Bugs Fixed

- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter.

### Other Changes

- Updated `toLlmInput` page markers from `<!-- page N -->` to `<!-- InputPageNumber: N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers.

## 1.2.0-beta.1 (2026-04-30)

### Features Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ console.log(text);
// figure illustrating monthly values, and describes the AI Document
// Intelligence service...
// ---
// <!-- page 1 -->
// <!-- InputPageNumber: 1 -->
// # ==This is title==
// ## 1. Text
// [Latin](https://en.wikipedia.org/wiki/Latin) refers to an ancient Italic language...
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@azure/ai-content-understanding",
"version": "1.2.0-beta.1",
"version": "1.2.0-beta.2",
"description": "Azure Content Understanding Rest Client",
"engines": {
"node": ">=20.0.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ export async function main(): Promise<void> {
// Analyze specific pages using contentRange.
// Page markers in the output will use the original document page numbers,
// so even though we only requested pages 2-3 and 5, the markers will say
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
console.log(` URL: ${multiPageUrl}`);
console.log(" contentRange: '2-3,5'\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ async function main() {
// Analyze specific pages using contentRange.
// Page markers in the output will use the original document page numbers,
// so even though we only requested pages 2-3 and 5, the markers will say
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
console.log(` URL: ${multiPageUrl}`);
console.log(" contentRange: '2-3,5'\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ export async function main(): Promise<void> {
// Analyze specific pages using contentRange.
// Page markers in the output will use the original document page numbers,
// so even though we only requested pages 2-3 and 5, the markers will say
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
console.log(` URL: ${multiPageUrl}`);
console.log(" contentRange: '2-3,5'\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,41 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
"rai_warnings",
]);

/**
* Opening portion of the page marker that {@link toLlmInput} writes at each
* page boundary; a complete marker reads `<!-- InputPageNumber: N -->`.
* Only the prefix is stored so the same constant can be used both to build
* markers (the page number and closing `-->` are appended by the caller)
* and to detect markers that are already present in the markdown.
*
* Future Content Understanding service versions emit this same marker
* directly in the returned markdown (per ContentUnderstanding-Docs#249).
* When the helper sees any occurrence of this prefix in the input markdown
* it treats the service as having already paginated the content and skips
* its own injection to avoid duplicate markers.
*
* @internal
*/
const INPUT_PAGE_MARKER_PREFIX = "<!-- InputPageNumber:";

/**
* Message prefixes the Content Understanding service has been observed to
* emit into the `warnings` collection that are *not* real Responsible-AI
* warnings (they are internal telemetry counters). The helper drops any
* warning whose `message` starts with one of these prefixes before
* rendering the `rai_warnings` block, so the noise never reaches the LLM.
*
* Matching is case-sensitive and is performed after leading whitespace has
* been stripped from the message (see `isTelemetryMessage`).
*
* @internal
*/
const TELEMETRY_MESSAGE_PREFIXES: readonly string[] = ["LLMStats:"];

/**
 * Reports whether `markdown` already carries at least one
 * `InputPageNumber` marker.
 *
 * The check is a plain, case-sensitive substring search for the marker
 * prefix. A single hit is treated as sufficient because a service that
 * paginates the content places a marker at every page boundary.
 *
 * @param markdown - Markdown text to inspect.
 * @returns `true` when the marker prefix occurs anywhere in `markdown`.
 *
 * @internal
 */
function hasInputPageMarker(markdown: string): boolean {
  const firstMarkerIndex = markdown.indexOf(INPUT_PAGE_MARKER_PREFIX);
  return firstMarkerIndex !== -1;
}

/**
* Convert a Content Understanding analysis result into LLM-friendly text.
*
Expand All @@ -70,8 +105,10 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
* - any caller-supplied `metadata` entries
*
* The markdown body contains the extracted text with page-break markers
* (`<!-- page N -->`) inserted at page boundaries so downstream consumers
* can locate content by page number.
* (`<!-- InputPageNumber: N -->`) inserted at page boundaries so downstream
* consumers can locate content by page number. When the service-provided
* markdown already contains `<!-- InputPageNumber:` markers, the helper
* passes the markdown through unchanged to avoid duplicate markers.
*
* For single-content results (documents, images), the output is a flat
* text block. For multi-segment results (video, audio), each segment is
Expand Down Expand Up @@ -411,6 +448,12 @@ function renderContentBlock(
const PAGE_BREAK_PATTERN = /\n*<!-- PageBreak -->\n*/g;

function addPageMarkers(content: DocumentContent, markdown: string): string {
// If the service already inserted InputPageNumber markers (per
// ContentUnderstanding-Docs#249) pass the markdown through unchanged
// to avoid emitting duplicate markers.
if (hasInputPageMarker(markdown)) {
return markdown;
}
if (content.pages && content.pages.length > 0) {
const fromSpans = pageMarkersFromSpans(markdown, content.pages);
if (fromSpans !== markdown) {
Expand Down Expand Up @@ -462,7 +505,7 @@ function pageMarkersFromSpans(markdown: string, pages: DocumentPage[]): string {
if (adj > prev) {
parts.push(cleaned.substring(prev, adj));
}
parts.push(`<!-- page ${marker.pageNumber} -->\n\n`);
parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${marker.pageNumber} -->\n\n`);
prev = adj;
}
if (prev < cleaned.length) {
Expand All @@ -479,7 +522,7 @@ function pageMarkersFromBreaks(markdown: string, content: DocumentContent): stri
for (let i = 0; i < chunks.length; i++) {
const text = chunks[i].trim();
if (text) {
parts.push(`<!-- page ${startPage + i} -->\n\n${text}`);
parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${startPage + i} -->\n\n${text}`);
}
}
return parts.join("\n\n");
Expand Down Expand Up @@ -566,12 +609,20 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
if (!w) {
continue;
}
const message = w.message;
// Skip internal service telemetry strings (e.g. `LLMStats: ...`) that
// occasionally leak into the warnings collection. These are not
// Responsible-AI warnings and would otherwise be rendered into the
// LLM-facing `rai_warnings` block.
if (message && isTelemetryMessage(message)) {
continue;
}
const entry: Record<string, string> = {};
if (w.code) {
entry.code = w.code;
}
if (w.message) {
entry.message = w.message;
if (message) {
entry.message = message;
}
if (w.target) {
entry.target = w.target;
Expand All @@ -583,6 +634,16 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
return items;
}

/**
 * Decides whether a warning message is internal service telemetry rather
 * than a genuine warning.
 *
 * Leading whitespace is ignored; the remainder must begin (case-sensitively)
 * with one of the entries in `TELEMETRY_MESSAGE_PREFIXES`.
 *
 * @param message - The warning's `message` text.
 * @returns `true` when the message starts with a known telemetry prefix.
 */
function isTelemetryMessage(message: string): boolean {
  // Strip only the leading whitespace so the prefix comparison is anchored
  // at the first visible character.
  const withoutLeadingWhitespace = message.replace(/^[\s]+/, "");
  return TELEMETRY_MESSAGE_PREFIXES.some((telemetryPrefix) =>
    withoutLeadingWhitespace.startsWith(telemetryPrefix),
  );
}

// ---------------------------------------------------------------------------
// Minimal YAML serializer (no external dependency)
// ---------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ describe("toLlmInput - pages", () => {
assert.equal(_compressPageNumbers([1]), 1);
});

it("renders <!-- page N --> markers from pages[].spans", () => {
it("renders <!-- InputPageNumber: N --> markers from pages[].spans", () => {
const markdown = "Page1Content\nPage2Content";
const pages: DocumentPage[] = [
{ pageNumber: 1, spans: [{ offset: 0, length: 13 }] },
Expand All @@ -220,23 +220,99 @@ describe("toLlmInput - pages", () => {
const text = toLlmInput(
makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
);
assert.include(text, "<!-- page 1 -->");
assert.include(text, "<!-- page 2 -->");
assert.include(text, "<!-- InputPageNumber: 1 -->");
assert.include(text, "<!-- InputPageNumber: 2 -->");
assert.include(text, "pages: 1-2");
});

// Regression test for the pass-through path: when the service markdown
// already contains InputPageNumber markers, the rendered output must contain
// each marker exactly once (no helper-injected duplicates).
it("does not inject duplicate markers when service markdown already has them", () => {
const markdown =
"<!-- InputPageNumber: 1 -->\n\nFirst page text.\n\n<!-- InputPageNumber: 2 -->\n\nSecond page text.";
// NOTE(review): the page-2 span below (offset 49, length 48) does not line
// up with the string above (the page-2 marker starts at offset 47 and the
// string is 93 characters long). This looks harmless as long as the helper
// returns before consulting spans — confirm, or align the fixture.
const pages: DocumentPage[] = [
{ pageNumber: 1, spans: [{ offset: 0, length: 47 }] },
{ pageNumber: 2, spans: [{ offset: 49, length: 48 }] },
];
const text = toLlmInput(
makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
);
// Splitting on the marker yields (occurrences + 1) pieces, so
// `length - 1` is the number of times the marker appears in the output.
const count1 = text.split("<!-- InputPageNumber: 1 -->").length - 1;
const count2 = text.split("<!-- InputPageNumber: 2 -->").length - 1;
assert.equal(count1, 1);
assert.equal(count2, 1);
});

it("falls back to PageBreak splitting using startPageNumber", () => {
const markdown = "First page text\n<!-- PageBreak -->\nSecond page text";
const text = toLlmInput(
makeResult([makeDocument({ markdown, startPageNumber: 3, endPageNumber: 4 })]),
);
assert.include(text, "<!-- page 3 -->");
assert.include(text, "<!-- page 4 -->");
assert.include(text, "<!-- InputPageNumber: 3 -->");
assert.include(text, "<!-- InputPageNumber: 4 -->");
assert.include(text, "First page text");
assert.include(text, "Second page text");
});
});

// ---------------------------------------------------------------------------
// rai_warnings (LLMStats telemetry filter)
// ---------------------------------------------------------------------------

// Verifies that warnings whose message starts with `LLMStats:` are removed
// before the `rai_warnings` front matter is rendered, while genuine warnings
// and document body text are left untouched.
// (The second argument to `makeResult` is presumably the result's warnings
// list — confirm against the helper's definition.)
describe("toLlmInput - rai_warnings filter", () => {
// Mixed input: the telemetry entry is dropped, the real warning survives.
it("drops LLMStats: telemetry warnings but keeps real warnings", () => {
const text = toLlmInput(
makeResult([makeDocument()], [
{ code: "Telemetry", message: "LLMStats: completion calls: 2; embedding calls: 1" },
{ code: "ContentWarning", message: "Potentially sensitive content." },
]),
);
assert.include(text, "rai_warnings:");
assert.notInclude(text, "LLMStats:");
assert.include(text, "Potentially sensitive content.");
});

// When every warning is telemetry, the whole `rai_warnings` key is omitted
// rather than rendered empty.
it("omits the rai_warnings block when only LLMStats: warnings exist", () => {
const text = toLlmInput(
makeResult([makeDocument()], [
{ code: "Telemetry", message: "LLMStats: completion latency: 7.71s" },
]),
);
assert.notInclude(text, "rai_warnings:");
assert.notInclude(text, "LLMStats:");
});

// The prefix match is case-sensitive: a lowercase look-alike is treated as
// a real warning and kept.
it("is case-sensitive (lowercase llmstats: is preserved)", () => {
const text = toLlmInput(
makeResult([makeDocument()], [
{ code: "ContentWarning", message: "llmstats: keep as a real warning" },
]),
);
assert.include(text, "rai_warnings:");
assert.include(text, "llmstats: keep as a real warning");
});

// The filter applies to warnings only; identical text inside the document
// markdown must pass through to the output.
it("preserves LLMStats: text in the document markdown body", () => {
const bodyText = "A log excerpt:\n- LLMStats: keep this body text";
const text = toLlmInput(
makeResult([makeDocument({ markdown: bodyText })], [
{ code: "Telemetry", message: "LLMStats: remove this warning text" },
]),
);
assert.notInclude(text, "rai_warnings:");
assert.include(text, "LLMStats: keep this body text");
assert.notInclude(text, "LLMStats: remove this warning text");
});

// Leading whitespace before the prefix must not defeat the filter.
it("filters LLMStats: warnings with leading whitespace", () => {
const text = toLlmInput(
makeResult([makeDocument()], [
{ code: "Telemetry", message: " LLMStats: completion calls: 2" },
]),
);
assert.notInclude(text, "rai_warnings:");
assert.notInclude(text, "LLMStats:");
});
});

// ---------------------------------------------------------------------------
// Audio / video segments
// ---------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,18 +158,22 @@ describe("Sample: toLlmInput", () => {
);

// Page markers in the markdown body should use the original page numbers
// (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
assert.ok(
!text.includes("<!-- page 1 -->"),
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5",
// Page markers in the markdown body should use the original page numbers
// (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->),
// not renumbered (1, 2, 3).
assert(
!text.includes("<!-- InputPageNumber: 1 -->"),
"Page marker '<!-- InputPageNumber: 1 -->' should not appear \u2014 we only requested pages 2-3, 5",
);
for (const expectedPage of [2, 3, 5]) {
assert.ok(
text.includes(`<!-- page ${expectedPage} -->`),
`Page marker '<!-- page ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
assert(
text.includes(`<!-- InputPageNumber: ${expectedPage} -->`),
`Page marker '<!-- InputPageNumber: ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
);
}
console.log("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->");
console.log(
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->",
);

console.log(
`[PASS] toLlmInput output validated (${text.length} chars, pages='2-3, 5' preserved)`,
Expand Down
Loading