[Content Understanding] Update toLlmInput page markers and filter LLMStats telemetry

chienyuanchang · chienyuanchang · commit 72ed27f00a0a · 2026-06-05T16:18:52.000-07:00
diff --git a/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md b/sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Release History
 
+## 1.2.0-beta.2 (Unreleased)
+
+### Bugs Fixed
+
+- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter.
+
+### Other Changes
+
+- Updated `toLlmInput` page markers from `<!-- page N -->` to `<!-- InputPageNumber: N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers.
+
 ## 1.2.0-beta.1 (2026-04-30)
 
 ### Features Added
diff --git a/sdk/contentunderstanding/ai-content-understanding/package.json b/sdk/contentunderstanding/ai-content-understanding/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@azure/ai-content-understanding",
-  "version": "1.2.0-beta.1",
+  "version": "1.2.0-beta.2",
   "description": "Azure Content Understanding Rest Client",
   "engines": {
     "node": ">=20.0.0"
diff --git a/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts b/sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts
@@ -114,7 +114,7 @@ export async function main(): Promise<void> {
   // Analyze specific pages using contentRange.
   // Page markers in the output will use the original document page numbers,
   // so even though we only requested pages 2-3 and 5, the markers will say
-  // <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
+  // <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
   console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
   console.log(`  URL: ${multiPageUrl}`);
   console.log("  contentRange: '2-3,5'\n");
diff --git a/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts b/sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts
@@ -112,7 +112,7 @@ export async function main(): Promise<void> {
   // Analyze specific pages using contentRange.
   // Page markers in the output will use the original document page numbers,
   // so even though we only requested pages 2-3 and 5, the markers will say
-  // <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
+  // <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
   console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
   console.log(`  URL: ${multiPageUrl}`);
   console.log("  contentRange: '2-3,5'\n");
diff --git a/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts b/sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts
@@ -53,6 +53,41 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
   "rai_warnings",
 ]);
 
+/**
+ * Marker emitted by {@link toLlmInput} at each page boundary. Future
+ * Content Understanding service versions emit this same marker directly in
+ * the returned markdown (per ContentUnderstanding-Docs#249). When the helper
+ * sees any occurrence of this prefix in the input markdown it treats the
+ * service as having already paginated the content and skips its own
+ * injection to avoid duplicate markers.
+ *
+ * @internal
+ */
+const INPUT_PAGE_MARKER_PREFIX = "<!-- InputPageNumber:";
+
+/**
+ * Message prefixes the Content Understanding service has been observed to
+ * emit into the `warnings` collection that are *not* real Responsible-AI
+ * warnings (they are internal telemetry counters). The helper drops any
+ * warning whose `message` starts with one of these prefixes before
+ * rendering the `rai_warnings` block, so the noise never reaches the LLM.
+ *
+ * @internal
+ */
+const TELEMETRY_MESSAGE_PREFIXES: readonly string[] = ["LLMStats:"];
+
+/**
+ * Returns `true` if `markdown` contains the `InputPageNumber` marker prefix
+ * anywhere (case-sensitive substring check, so extracted text that quotes
+ * the prefix also matches). One occurrence is treated as sufficient on the
+ * assumption that the service marks every page boundary — TODO confirm.
+ *
+ * @internal
+ */
+function hasInputPageMarker(markdown: string): boolean {
+  return markdown.includes(INPUT_PAGE_MARKER_PREFIX);
+}
+
 /**
  * Convert a Content Understanding analysis result into LLM-friendly text.
  *
@@ -70,8 +105,10 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
  * - any caller-supplied `metadata` entries
  *
  * The markdown body contains the extracted text with page-break markers
- * (`<!-- page N -->`) inserted at page boundaries so downstream consumers
- * can locate content by page number.
+ * (`<!-- InputPageNumber: N -->`) inserted at page boundaries so downstream
+ * consumers can locate content by page number. When the service-provided
+ * markdown already contains `<!-- InputPageNumber:` markers, the helper
+ * passes the markdown through unchanged to avoid duplicate markers.
  *
  * For single-content results (documents, images), the output is a flat
  * text block. For multi-segment results (video, audio), each segment is
@@ -411,6 +448,12 @@ function renderContentBlock(
 const PAGE_BREAK_PATTERN = /\n*<!-- PageBreak -->\n*/g;
 
 function addPageMarkers(content: DocumentContent, markdown: string): string {
+  // If the service already inserted InputPageNumber markers (per
+  // ContentUnderstanding-Docs#249) pass the markdown through unchanged
+  // to avoid emitting duplicate markers.
+  if (hasInputPageMarker(markdown)) {
+    return markdown;
+  }
   if (content.pages && content.pages.length > 0) {
     const fromSpans = pageMarkersFromSpans(markdown, content.pages);
     if (fromSpans !== markdown) {
@@ -462,7 +505,7 @@ function pageMarkersFromSpans(markdown: string, pages: DocumentPage[]): string {
     if (adj > prev) {
       parts.push(cleaned.substring(prev, adj));
     }
-    parts.push(`<!-- page ${marker.pageNumber} -->\n\n`);
+    parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${marker.pageNumber} -->\n\n`);
     prev = adj;
   }
   if (prev < cleaned.length) {
@@ -479,7 +522,7 @@ function pageMarkersFromBreaks(markdown: string, content: DocumentContent): stri
   for (let i = 0; i < chunks.length; i++) {
     const text = chunks[i].trim();
     if (text) {
-      parts.push(`<!-- page ${startPage + i} -->\n\n${text}`);
+      parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${startPage + i} -->\n\n${text}`);
     }
   }
   return parts.join("\n\n");
@@ -566,12 +609,20 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
     if (!w) {
       continue;
     }
+    const message = w.message;
+    // Skip internal service telemetry strings (e.g. `LLMStats: ...`) that
+    // occasionally leak into the warnings collection. These are not
+    // Responsible-AI warnings and would otherwise be rendered into the
+    // LLM-facing `rai_warnings` block.
+    if (message && isTelemetryMessage(message)) {
+      continue;
+    }
     const entry: Record<string, string> = {};
     if (w.code) {
       entry.code = w.code;
     }
-    if (w.message) {
-      entry.message = w.message;
+    if (message) {
+      entry.message = message;
     }
     if (w.target) {
       entry.target = w.target;
@@ -583,6 +634,16 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
   return items;
 }
 
+/**
+ * Returns `true` when `message`, with leading whitespace ignored, starts
+ * with one of the `TELEMETRY_MESSAGE_PREFIXES` entries (case-sensitive).
+ * @internal
+ */
+function isTelemetryMessage(message: string): boolean {
+  const candidate = message.trimStart();
+  return TELEMETRY_MESSAGE_PREFIXES.some((prefix) => candidate.startsWith(prefix));
+}
+
 // ---------------------------------------------------------------------------
 // Minimal YAML serializer (no external dependency)
 // ---------------------------------------------------------------------------
diff --git a/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts b/sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts
@@ -211,7 +211,7 @@ describe("toLlmInput - pages", () => {
     assert.equal(_compressPageNumbers([1]), 1);
   });
 
-  it("renders <!-- page N --> markers from pages[].spans", () => {
+  it("renders <!-- InputPageNumber: N --> markers from pages[].spans", () => {
     const markdown = "Page1Content\nPage2Content";
     const pages: DocumentPage[] = [
       { pageNumber: 1, spans: [{ offset: 0, length: 13 }] },
@@ -220,23 +220,99 @@ describe("toLlmInput - pages", () => {
     const text = toLlmInput(
       makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
     );
-    assert.include(text, "<!-- page 1 -->");
-    assert.include(text, "<!-- page 2 -->");
+    assert.include(text, "<!-- InputPageNumber: 1 -->");
+    assert.include(text, "<!-- InputPageNumber: 2 -->");
     assert.include(text, "pages: 1-2");
   });
 
+  it("does not inject duplicate markers when service markdown already has them", () => {
+    const markdown =
+      "<!-- InputPageNumber: 1 -->\n\nFirst page text.\n\n<!-- InputPageNumber: 2 -->\n\nSecond page text.";
+    // Spans tile the markdown exactly: page 1 is [0, 47), page 2 is [47, 93).
+    const pages: DocumentPage[] = [
+      { pageNumber: 1, spans: [{ offset: 0, length: 47 }] },
+      { pageNumber: 2, spans: [{ offset: 47, length: 46 }] },
+    ];
+    const text = toLlmInput(
+      makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
+    );
+    for (const page of [1, 2]) {
+      assert.equal(text.split(`<!-- InputPageNumber: ${page} -->`).length - 1, 1);
+    }
+  });
+
   it("falls back to PageBreak splitting using startPageNumber", () => {
     const markdown = "First page text\n<!-- PageBreak -->\nSecond page text";
     const text = toLlmInput(
       makeResult([makeDocument({ markdown, startPageNumber: 3, endPageNumber: 4 })]),
     );
-    assert.include(text, "<!-- page 3 -->");
-    assert.include(text, "<!-- page 4 -->");
+    assert.include(text, "<!-- InputPageNumber: 3 -->");
+    assert.include(text, "<!-- InputPageNumber: 4 -->");
     assert.include(text, "First page text");
     assert.include(text, "Second page text");
   });
 });
 
+// ---------------------------------------------------------------------------
+// rai_warnings (LLMStats telemetry filter)
+// ---------------------------------------------------------------------------
+
+describe("toLlmInput - rai_warnings filter", () => {
+  it("drops LLMStats: telemetry warnings but keeps real warnings", () => {
+    const text = toLlmInput(
+      makeResult([makeDocument()], [
+        { code: "Telemetry", message: "LLMStats: completion calls: 2; embedding calls: 1" },
+        { code: "ContentWarning", message: "Potentially sensitive content." },
+      ]),
+    );
+    assert.include(text, "rai_warnings:");
+    assert.notInclude(text, "LLMStats:");
+    assert.include(text, "Potentially sensitive content.");
+  });
+
+  it("omits the rai_warnings block when only LLMStats: warnings exist", () => {
+    const text = toLlmInput(
+      makeResult([makeDocument()], [
+        { code: "Telemetry", message: "LLMStats: completion latency: 7.71s" },
+      ]),
+    );
+    assert.notInclude(text, "rai_warnings:");
+    assert.notInclude(text, "LLMStats:");
+  });
+
+  it("is case-sensitive (lowercase llmstats: is preserved)", () => {
+    const text = toLlmInput(
+      makeResult([makeDocument()], [
+        { code: "ContentWarning", message: "llmstats: keep as a real warning" },
+      ]),
+    );
+    assert.include(text, "rai_warnings:");
+    assert.include(text, "llmstats: keep as a real warning");
+  });
+
+  it("preserves LLMStats: text in the document markdown body", () => {
+    const bodyText = "A log excerpt:\n- LLMStats: keep this body text";
+    const text = toLlmInput(
+      makeResult([makeDocument({ markdown: bodyText })], [
+        { code: "Telemetry", message: "LLMStats: remove this warning text" },
+      ]),
+    );
+    assert.notInclude(text, "rai_warnings:");
+    assert.include(text, "LLMStats: keep this body text");
+    assert.notInclude(text, "LLMStats: remove this warning text");
+  });
+
+  it("filters LLMStats: warnings with leading whitespace", () => {
+    const text = toLlmInput(
+      makeResult([makeDocument()], [
+        { code: "Telemetry", message: "  LLMStats: completion calls: 2" },
+      ]),
+    );
+    assert.notInclude(text, "rai_warnings:");
+    assert.notInclude(text, "LLMStats:");
+  });
+});
+
 // ---------------------------------------------------------------------------
 // Audio / video segments
 // ---------------------------------------------------------------------------
diff --git a/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts b/sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts
@@ -158,18 +158,22 @@ describe("Sample: toLlmInput", () => {
     );
 
     // Page markers in the markdown body should use the original page numbers
-    // (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
-    assert.ok(
-      !text.includes("<!-- page 1 -->"),
-      "Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5",
+    // (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->,
+    // <!-- InputPageNumber: 5 -->), not the renumbered positions (1, 2, 3)
+    // within the requested range.
+    assert(
+      !text.includes("<!-- InputPageNumber: 1 -->"),
+      "Page marker '<!-- InputPageNumber: 1 -->' should not appear \u2014 we only requested pages 2-3, 5",
     );
     for (const expectedPage of [2, 3, 5]) {
-      assert.ok(
-        text.includes(`<!-- page ${expectedPage} -->`),
-        `Page marker '<!-- page ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
+      assert(
+        text.includes(`<!-- InputPageNumber: ${expectedPage} -->`),
+        `Page marker '<!-- InputPageNumber: ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
       );
     }
-    console.log("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->");
+    console.log(
+      "[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->",
+    );
 
     console.log(
       `[PASS] toLlmInput output validated (${text.length} chars, pages='2-3, 5' preserved)`,

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@azure/ai-content-understanding",`
`3`		`- "version": "1.2.0-beta.1",`
	`3`	`+ "version": "1.2.0-beta.2",`
`4`	`4`	`"description": "Azure Content Understanding Rest Client",`
`5`	`5`	`"engines": {`
`6`	`6`	`"node": ">=20.0.0"`