Skip to content

Commit 72ed27f

Browse files
[Content Understanding] Update toLlmInput page markers and filter LLMStats telemetry
1 parent 2d3b618 commit 72ed27f

7 files changed

Lines changed: 173 additions & 22 deletions

File tree

sdk/contentunderstanding/ai-content-understanding/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# Release History
22

3+
## 1.2.0-beta.2 (Unreleased)
4+
5+
### Bugs Fixed
6+
7+
- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter.
8+
9+
### Other Changes
10+
11+
- Updated `toLlmInput` page markers from `<!-- page N -->` to `<!-- InputPageNumber: N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers.
12+
313
## 1.2.0-beta.1 (2026-04-30)
414

515
### Features Added

sdk/contentunderstanding/ai-content-understanding/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@azure/ai-content-understanding",
3-
"version": "1.2.0-beta.1",
3+
"version": "1.2.0-beta.2",
44
"description": "Azure Content Understanding Rest Client",
55
"engines": {
66
"node": ">=20.0.0"

sdk/contentunderstanding/ai-content-understanding/samples-dev/toLlmInput.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ export async function main(): Promise<void> {
114114
// Analyze specific pages using contentRange.
115115
// Page markers in the output will use the original document page numbers,
116116
// so even though we only requested pages 2-3 and 5, the markers will say
117-
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
117+
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
118118
console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
119119
console.log(` URL: ${multiPageUrl}`);
120120
console.log(" contentRange: '2-3,5'\n");

sdk/contentunderstanding/ai-content-understanding/samples/v1-beta/typescript/src/toLlmInput.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ export async function main(): Promise<void> {
112112
// Analyze specific pages using contentRange.
113113
// Page markers in the output will use the original document page numbers,
114114
// so even though we only requested pages 2-3 and 5, the markers will say
115-
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
115+
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
116116
console.log("Analyzing pages 2-3 and 5 of a multi-page PDF...");
117117
console.log(` URL: ${multiPageUrl}`);
118118
console.log(" contentRange: '2-3,5'\n");

sdk/contentunderstanding/ai-content-understanding/src/static-helpers/llmInputHelper.ts

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,41 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
5353
"rai_warnings",
5454
]);
5555

56+
/**
57+
* Marker emitted by {@link toLlmInput} at each page boundary. Future
58+
* Content Understanding service versions are expected to emit this same marker directly in
59+
* the returned markdown (per ContentUnderstanding-Docs#249). When the helper
60+
* sees any occurrence of this prefix in the input markdown it treats the
61+
* service as having already paginated the content and skips its own
62+
* injection to avoid duplicate markers.
63+
*
64+
* @internal
65+
*/
66+
const INPUT_PAGE_MARKER_PREFIX = "<!-- InputPageNumber:";
67+
68+
/**
69+
* Message prefixes the Content Understanding service has been observed to
70+
* emit into the `warnings` collection that are *not* real Responsible-AI
71+
* warnings (they are internal telemetry counters). The helper drops any
72+
* warning whose `message` starts with one of these prefixes before
73+
* rendering the `rai_warnings` block, so the noise never reaches the LLM.
74+
*
75+
* @internal
76+
*/
77+
const TELEMETRY_MESSAGE_PREFIXES: readonly string[] = ["LLMStats:"];
78+
79+
/**
 * Reports whether `markdown` already carries at least one page marker
 * beginning with `INPUT_PAGE_MARKER_PREFIX`.
 *
 * The match is a plain, case-sensitive substring search. The first hit is
 * taken as proof that the content is already paginated, on the expectation
 * that the service places a marker at every page boundary whenever it
 * emits any.
 *
 * @param markdown - Markdown text to inspect.
 * @returns `true` when the marker prefix occurs anywhere in `markdown`.
 *
 * @internal
 */
function hasInputPageMarker(markdown: string): boolean {
  const alreadyPaginated = markdown.includes(INPUT_PAGE_MARKER_PREFIX);
  return alreadyPaginated;
}
90+
5691
/**
5792
* Convert a Content Understanding analysis result into LLM-friendly text.
5893
*
@@ -70,8 +105,10 @@ const RESERVED_METADATA_KEYS: ReadonlySet<string> = new Set([
70105
* - any caller-supplied `metadata` entries
71106
*
72107
* The markdown body contains the extracted text with page-break markers
73-
* (`<!-- page N -->`) inserted at page boundaries so downstream consumers
74-
* can locate content by page number.
108+
* (`<!-- InputPageNumber: N -->`) inserted at page boundaries so downstream
109+
* consumers can locate content by page number. When the service-provided
110+
* markdown already contains `<!-- InputPageNumber:` markers, the helper
111+
* passes the markdown through unchanged to avoid duplicate markers.
75112
*
76113
* For single-content results (documents, images), the output is a flat
77114
* text block. For multi-segment results (video, audio), each segment is
@@ -411,6 +448,12 @@ function renderContentBlock(
411448
const PAGE_BREAK_PATTERN = /\n*<!-- PageBreak -->\n*/g;
412449

413450
function addPageMarkers(content: DocumentContent, markdown: string): string {
451+
// If the service already inserted InputPageNumber markers (per
452+
// ContentUnderstanding-Docs#249) pass the markdown through unchanged
453+
// to avoid emitting duplicate markers.
454+
if (hasInputPageMarker(markdown)) {
455+
return markdown;
456+
}
414457
if (content.pages && content.pages.length > 0) {
415458
const fromSpans = pageMarkersFromSpans(markdown, content.pages);
416459
if (fromSpans !== markdown) {
@@ -462,7 +505,7 @@ function pageMarkersFromSpans(markdown: string, pages: DocumentPage[]): string {
462505
if (adj > prev) {
463506
parts.push(cleaned.substring(prev, adj));
464507
}
465-
parts.push(`<!-- page ${marker.pageNumber} -->\n\n`);
508+
parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${marker.pageNumber} -->\n\n`);
466509
prev = adj;
467510
}
468511
if (prev < cleaned.length) {
@@ -479,7 +522,7 @@ function pageMarkersFromBreaks(markdown: string, content: DocumentContent): stri
479522
for (let i = 0; i < chunks.length; i++) {
480523
const text = chunks[i].trim();
481524
if (text) {
482-
parts.push(`<!-- page ${startPage + i} -->\n\n${text}`);
525+
parts.push(`${INPUT_PAGE_MARKER_PREFIX} ${startPage + i} -->\n\n${text}`);
483526
}
484527
}
485528
return parts.join("\n\n");
@@ -566,12 +609,20 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
566609
if (!w) {
567610
continue;
568611
}
612+
const message = w.message;
613+
// Skip internal service telemetry strings (e.g. `LLMStats: ...`) that
614+
// occasionally leak into the warnings collection. These are not
615+
// Responsible-AI warnings and would otherwise be rendered into the
616+
// LLM-facing `rai_warnings` block.
617+
if (message && isTelemetryMessage(message)) {
618+
continue;
619+
}
569620
const entry: Record<string, string> = {};
570621
if (w.code) {
571622
entry.code = w.code;
572623
}
573-
if (w.message) {
574-
entry.message = w.message;
624+
if (message) {
625+
entry.message = message;
575626
}
576627
if (w.target) {
577628
entry.target = w.target;
@@ -583,6 +634,16 @@ function formatWarnings(warnings: ErrorModel[]): Record<string, string>[] {
583634
return items;
584635
}
585636

637+
/**
 * Reports whether a warning `message` begins with one of the prefixes
 * listed in `TELEMETRY_MESSAGE_PREFIXES`.
 *
 * Leading whitespace is ignored; the prefix comparison itself is
 * case-sensitive.
 *
 * @param message - Warning message text to classify.
 * @returns `true` when the message starts with a listed telemetry prefix.
 *
 * @internal
 */
function isTelemetryMessage(message: string): boolean {
  // Tolerate whitespace ahead of the prefix.
  const body = message.replace(/^[\s]+/, "");
  return TELEMETRY_MESSAGE_PREFIXES.some((prefix) => body.startsWith(prefix));
}
646+
586647
// ---------------------------------------------------------------------------
587648
// Minimal YAML serializer (no external dependency)
588649
// ---------------------------------------------------------------------------

sdk/contentunderstanding/ai-content-understanding/test/public/node/llmInputHelper.spec.ts

Lines changed: 81 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ describe("toLlmInput - pages", () => {
211211
assert.equal(_compressPageNumbers([1]), 1);
212212
});
213213

214-
it("renders <!-- page N --> markers from pages[].spans", () => {
214+
it("renders <!-- InputPageNumber: N --> markers from pages[].spans", () => {
215215
const markdown = "Page1Content\nPage2Content";
216216
const pages: DocumentPage[] = [
217217
{ pageNumber: 1, spans: [{ offset: 0, length: 13 }] },
@@ -220,23 +220,99 @@ describe("toLlmInput - pages", () => {
220220
const text = toLlmInput(
221221
makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
222222
);
223-
assert.include(text, "<!-- page 1 -->");
224-
assert.include(text, "<!-- page 2 -->");
223+
assert.include(text, "<!-- InputPageNumber: 1 -->");
224+
assert.include(text, "<!-- InputPageNumber: 2 -->");
225225
assert.include(text, "pages: 1-2");
226226
});
227227

228+
it("does not inject duplicate markers when service markdown already has them", () => {
229+
const markdown =
230+
"<!-- InputPageNumber: 1 -->\n\nFirst page text.\n\n<!-- InputPageNumber: 2 -->\n\nSecond page text.";
231+
const pages: DocumentPage[] = [
232+
{ pageNumber: 1, spans: [{ offset: 0, length: 47 }] },
233+
{ pageNumber: 2, spans: [{ offset: 49, length: 48 }] },
234+
];
235+
const text = toLlmInput(
236+
makeResult([makeDocument({ markdown, pages, startPageNumber: 1, endPageNumber: 2 })]),
237+
);
238+
const count1 = text.split("<!-- InputPageNumber: 1 -->").length - 1;
239+
const count2 = text.split("<!-- InputPageNumber: 2 -->").length - 1;
240+
assert.equal(count1, 1);
241+
assert.equal(count2, 1);
242+
});
243+
228244
it("falls back to PageBreak splitting using startPageNumber", () => {
229245
const markdown = "First page text\n<!-- PageBreak -->\nSecond page text";
230246
const text = toLlmInput(
231247
makeResult([makeDocument({ markdown, startPageNumber: 3, endPageNumber: 4 })]),
232248
);
233-
assert.include(text, "<!-- page 3 -->");
234-
assert.include(text, "<!-- page 4 -->");
249+
assert.include(text, "<!-- InputPageNumber: 3 -->");
250+
assert.include(text, "<!-- InputPageNumber: 4 -->");
235251
assert.include(text, "First page text");
236252
assert.include(text, "Second page text");
237253
});
238254
});
239255

256+
// ---------------------------------------------------------------------------
257+
// rai_warnings (LLMStats telemetry filter)
258+
// ---------------------------------------------------------------------------
259+
260+
describe("toLlmInput - rai_warnings filter", () => {
  // A telemetry entry is removed while a genuine warning in the same list is kept.
  it("drops LLMStats: telemetry warnings but keeps real warnings", () => {
    const result = makeResult([makeDocument()], [
      { code: "Telemetry", message: "LLMStats: completion calls: 2; embedding calls: 1" },
      { code: "ContentWarning", message: "Potentially sensitive content." },
    ]);
    const rendered = toLlmInput(result);
    assert.include(rendered, "rai_warnings:");
    assert.notInclude(rendered, "LLMStats:");
    assert.include(rendered, "Potentially sensitive content.");
  });

  // With nothing left after filtering, no `rai_warnings:` key is rendered.
  it("omits the rai_warnings block when only LLMStats: warnings exist", () => {
    const result = makeResult([makeDocument()], [
      { code: "Telemetry", message: "LLMStats: completion latency: 7.71s" },
    ]);
    const rendered = toLlmInput(result);
    assert.notInclude(rendered, "rai_warnings:");
    assert.notInclude(rendered, "LLMStats:");
  });

  // The prefix comparison must not match a differently-cased prefix.
  it("is case-sensitive (lowercase llmstats: is preserved)", () => {
    const result = makeResult([makeDocument()], [
      { code: "ContentWarning", message: "llmstats: keep as a real warning" },
    ]);
    const rendered = toLlmInput(result);
    assert.include(rendered, "rai_warnings:");
    assert.include(rendered, "llmstats: keep as a real warning");
  });

  // Filtering applies to warnings only; the markdown body is left untouched.
  it("preserves LLMStats: text in the document markdown body", () => {
    const bodyText = "A log excerpt:\n- LLMStats: keep this body text";
    const result = makeResult([makeDocument({ markdown: bodyText })], [
      { code: "Telemetry", message: "LLMStats: remove this warning text" },
    ]);
    const rendered = toLlmInput(result);
    assert.notInclude(rendered, "rai_warnings:");
    assert.include(rendered, "LLMStats: keep this body text");
    assert.notInclude(rendered, "LLMStats: remove this warning text");
  });

  // Whitespace ahead of the prefix does not defeat the filter.
  it("filters LLMStats: warnings with leading whitespace", () => {
    const result = makeResult([makeDocument()], [
      { code: "Telemetry", message: " LLMStats: completion calls: 2" },
    ]);
    const rendered = toLlmInput(result);
    assert.notInclude(rendered, "rai_warnings:");
    assert.notInclude(rendered, "LLMStats:");
  });
});
315+
240316
// ---------------------------------------------------------------------------
241317
// Audio / video segments
242318
// ---------------------------------------------------------------------------

sdk/contentunderstanding/ai-content-understanding/test/public/node/samples/toLlmInput.spec.ts

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -158,18 +158,22 @@ describe("Sample: toLlmInput", () => {
158158
);
159159

160160
// Page markers in the markdown body should use the original page numbers
161-
// (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
162-
assert.ok(
163-
!text.includes("<!-- page 1 -->"),
164-
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5",
161+
// of the source document for the pages that were requested
162+
// (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->),
163+
// not renumbered (1, 2, 3).
164+
assert(
165+
!text.includes("<!-- InputPageNumber: 1 -->"),
166+
"Page marker '<!-- InputPageNumber: 1 -->' should not appear \u2014 we only requested pages 2-3, 5",
165167
);
166168
for (const expectedPage of [2, 3, 5]) {
167-
assert.ok(
168-
text.includes(`<!-- page ${expectedPage} -->`),
169-
`Page marker '<!-- page ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
169+
assert(
170+
text.includes(`<!-- InputPageNumber: ${expectedPage} -->`),
171+
`Page marker '<!-- InputPageNumber: ${expectedPage} -->' should appear in the markdown body. Output:\n${text.slice(0, 800)}`,
170172
);
171173
}
172-
console.log("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->");
174+
console.log(
175+
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->",
176+
);
173177

174178
console.log(
175179
`[PASS] toLlmInput output validated (${text.length} chars, pages='2-3, 5' preserved)`,

0 commit comments

Comments
 (0)