Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

### Bugs Fixed

- Filtered service-emitted `LLMStats:` telemetry entries from the rendered `rai_warnings` front matter in `LlmInputHelper.toLlmInput`.

### Other Changes

- Updated `LlmInputHelper.toLlmInput` page markers from `<!-- page N -->` to `<!-- InputPageNumber: N -->` and avoided duplicate marker injection when the service markdown already includes `InputPageNumber` markers.

## 1.1.0-beta.1 (2026-05-01)

### Features Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ If you encounter errors:
<dependency>
<groupId>com.azure</groupId>
<artifactId>azure-ai-contentunderstanding</artifactId>
<version>1.0.0</version>
<version>1.1.0-beta.2</version>
</dependency>
```
[//]: # ({x-version-update-end})
Expand Down Expand Up @@ -439,7 +439,7 @@ fields:
figure illustrating monthly values, and describes the AI Document
Intelligence service...
---
<!-- page 1 -->
<!-- InputPageNumber: 1 -->
# ==This is title==
## 1. Text
[Latin](https://en.wikipedia.org/wiki/Latin) refers to an ancient Italic language...
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,20 @@ public final class LlmInputHelper {

private static final Pattern PAGE_BREAK_PATTERN = Pattern.compile("\\n*<!-- PageBreak -->\\n*");

// Marker emitted by toLlmInput at each page boundary. Future Content Understanding
// service versions are expected to emit this same marker directly in the returned markdown (per
// ContentUnderstanding-Docs#249). When the helper sees any occurrence of this
// prefix in the input markdown it treats the service as having already paginated
// the content and skips its own injection to avoid duplicate markers.
private static final String INPUT_PAGE_MARKER_PREFIX = "<!-- InputPageNumber:";

// Message prefixes the Content Understanding service has been observed to emit
// into the warnings collection that are *not* real Responsible-AI warnings (they
// are internal telemetry counters). The helper drops any warning whose message
// starts with one of these prefixes before rendering the rai_warnings: block, so
// the noise never reaches the LLM.
private static final String[] TELEMETRY_MESSAGE_PREFIXES = { "LLMStats:" };

// YAML quoting patterns
private static final Pattern YAML_SPECIAL_START = Pattern.compile("^[-?:,\\[\\]{}#&*!|>'\"%@`]");
private static final Pattern YAML_SPECIAL_INSIDE = Pattern.compile("[:#] |[\\n\\r]");
Expand All @@ -84,8 +98,10 @@ private LlmInputHelper() {
* and any caller-supplied metadata entries.
*
* <p>The markdown body contains the extracted text with page-break markers
* ({@code <!-- page N -->}) inserted at page boundaries so downstream consumers
* can locate content by page number.
* ({@code <!-- InputPageNumber: N -->}) inserted at page boundaries so downstream
* consumers can locate content by page number. If the service markdown already
* contains {@code <!-- InputPageNumber:} markers, the helper passes them through
* unchanged to avoid duplicates.
*
* @param result the {@link AnalysisResult} from a Content Understanding analyze operation.
* @return a formatted string with YAML front matter followed by markdown content.
Expand Down Expand Up @@ -498,6 +514,9 @@ private static String renderContentBlock(RenderableContent content, AnalysisResu
// -----------------------------------------------------------------------

private static String addPageMarkers(RenderableContent content, String markdown) {
if (hasInputPageMarker(markdown)) {
return markdown;
}
if (content.pages != null && !content.pages.isEmpty()) {
String result = pageMarkersFromSpans(markdown, content.pages);
// Identity check: if spans were found, result differs from input
Expand All @@ -508,6 +527,10 @@ private static String addPageMarkers(RenderableContent content, String markdown)
return pageMarkersFromBreaks(markdown, content);
}

/**
 * Reports whether the given markdown already contains an input page marker.
 *
 * @param markdown the markdown body to inspect; may be {@code null}.
 * @return {@code true} when {@code markdown} is non-null and contains
 * {@code INPUT_PAGE_MARKER_PREFIX} anywhere in its text; {@code false} otherwise.
 */
private static boolean hasInputPageMarker(String markdown) {
    if (markdown == null) {
        return false;
    }
    return markdown.indexOf(INPUT_PAGE_MARKER_PREFIX) >= 0;
}

private static String pageMarkersFromSpans(String markdown, List<DocumentPage> pages) {
List<int[]> markers = new ArrayList<>(); // [offset, pageNumber]
for (DocumentPage page : pages) {
Expand Down Expand Up @@ -539,7 +562,7 @@ private static String pageMarkersFromSpans(String markdown, List<DocumentPage> p
if (adj > prev) {
sb.append(cleaned, prev, adj);
}
sb.append("<!-- page ").append(marker[1]).append(" -->\n\n");
sb.append(INPUT_PAGE_MARKER_PREFIX).append(' ').append(marker[1]).append(" -->\n\n");
prev = adj;
}
if (prev < cleaned.length()) {
Expand All @@ -565,7 +588,7 @@ private static String pageMarkersFromBreaks(String markdown, RenderableContent c
for (int i = 0; i < chunks.length; i++) {
String text = chunks[i].trim();
if (!text.isEmpty()) {
parts.add("<!-- page " + (startPage + i) + " -->\n\n" + text);
parts.add(INPUT_PAGE_MARKER_PREFIX + " " + (startPage + i) + " -->\n\n" + text);
}
}
return String.join("\n\n", parts);
Expand Down Expand Up @@ -646,12 +669,20 @@ private static List<Map<String, String>> formatWarnings(List<ResponseError> warn
if (w == null) {
continue;
}
String message = w.getMessage();
// Skip internal service telemetry strings (e.g. "LLMStats: ...") that
// occasionally leak into the warnings collection. These are not
// Responsible-AI warnings and would otherwise be rendered into the
// LLM-facing rai_warnings: block.
if (message != null && isTelemetryMessage(message)) {
continue;
}
Map<String, String> entry = new LinkedHashMap<>();
if (w.getCode() != null && !w.getCode().isEmpty()) {
entry.put("code", w.getCode());
}
if (w.getMessage() != null && !w.getMessage().isEmpty()) {
entry.put("message", w.getMessage());
if (message != null && !message.isEmpty()) {
entry.put("message", message);
}
if (!entry.isEmpty()) {
items.add(entry);
Expand All @@ -660,6 +691,20 @@ private static List<Map<String, String>> formatWarnings(List<ResponseError> warn
return items;
}

/**
 * Decides whether a warning message begins with one of the telemetry prefixes.
 *
 * <p>Leading space and tab characters are skipped first; the remainder is then
 * compared case-sensitively against each entry of {@code TELEMETRY_MESSAGE_PREFIXES}.
 *
 * @param message the warning message to test; must not be {@code null} because it is
 * dereferenced immediately.
 * @return {@code true} if the message, after any leading spaces/tabs, starts with a
 * telemetry prefix; {@code false} otherwise.
 */
private static boolean isTelemetryMessage(String message) {
    final int length = message.length();
    int start = 0;
    // Only ' ' and '\t' are skipped; any other character ends the scan.
    for (; start < length; start++) {
        char current = message.charAt(start);
        if (current != ' ' && current != '\t') {
            break;
        }
    }
    for (String candidate : TELEMETRY_MESSAGE_PREFIXES) {
        // startsWith(prefix, offset) performs a case-sensitive comparison at the offset.
        if (message.startsWith(candidate, start)) {
            return true;
        }
    }
    return false;
}

// -----------------------------------------------------------------------
// Minimal YAML serializer (no external dependency)
// -----------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ public static void main(String[] args) {
// Analyze specific pages using ContentRange.
// Page markers in the output will use the original document page numbers,
// so even though we only requested pages 2-3 and 5, the markers will say
// <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 --> (not 1, 2, 3).
// <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 --> (not 1, 2, 3).
System.out.println("Analyzing pages 2-3 and 5 of a multi-page PDF...");
System.out.println(" URL: " + multiPageUrl);
System.out.println(" contentRange: '2-3,5'\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public void toLlmInputSingleDocumentDefaultOptions() {
assertTrue(output.contains("Amount: 165"));
assertTrue(output.contains("CurrencyCode: USD"));
assertTrue(output.contains("Hello world"));
assertTrue(output.contains("<!-- page 1 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 1 -->"));
}

@Test
Expand Down Expand Up @@ -183,10 +183,110 @@ public void toLlmInputWithWarnings() {
assertTrue(output.contains("message: 'latency: 2s'"));
}

@Test
public void llmStatsWarningFilteredFromRaiWarnings() {
    // One telemetry entry next to one genuine warning: the block must remain,
    // but without the telemetry text.
    String telemetryWarning
        = "{\"code\":\"Telemetry\",\"message\":\"LLMStats: completion calls: 2; embedding calls: 1\"}";
    String contentWarning = "{\"code\":\"ContentWarning\",\"message\":\"Potentially sensitive content.\"}";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"warnings\":[" + telemetryWarning + "," + contentWarning + "],"
        + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    assertTrue(rendered.contains("rai_warnings:"));
    assertFalse(rendered.contains("LLMStats:"));
    assertTrue(rendered.contains("Potentially sensitive content."));
}

@Test
public void llmStatsWarningOnlyOmitsRaiWarningsBlock() {
    // The only warning is telemetry, so no rai_warnings block should be rendered at all.
    String telemetryWarning = "{\"code\":\"Telemetry\",\"message\":\"LLMStats: completion latency: 7.71s\"}";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"warnings\":[" + telemetryWarning + "],"
        + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    assertFalse(rendered.contains("rai_warnings:"));
    assertFalse(rendered.contains("LLMStats:"));
}

@Test
public void llmStatsFilterIsCaseSensitive() {
    // A lower-case "llmstats:" message must be kept as an ordinary warning.
    String lowerCaseWarning = "{\"code\":\"ContentWarning\",\"message\":\"llmstats: keep as a real warning\"}";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"warnings\":[" + lowerCaseWarning + "],"
        + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    assertTrue(rendered.contains("rai_warnings:"));
    assertTrue(rendered.contains("llmstats: keep as a real warning"));
}

@Test
public void llmStatsTextInMarkdownBodyIsPreserved() {
    // The same "LLMStats:" text appears in a warning and in the document body;
    // only the warning occurrence should disappear from the output.
    String telemetryWarning = "{\"code\":\"Telemetry\",\"message\":\"LLMStats: remove this warning text\"}";
    String bodyMarkdown = "A log excerpt:\\n- LLMStats: keep this body text";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"warnings\":[" + telemetryWarning + "],"
        + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + "\"startPageNumber\":1,\"endPageNumber\":1,"
        + "\"markdown\":\"" + bodyMarkdown + "\"}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    assertFalse(rendered.contains("rai_warnings:"));
    assertTrue(rendered.contains("LLMStats: keep this body text"));
    assertFalse(rendered.contains("LLMStats: remove this warning text"));
}

@Test
public void llmStatsWarningFilteredWithLeadingWhitespace() {
    // The telemetry message is preceded by a space and must still be filtered out.
    String paddedTelemetryWarning = "{\"code\":\"Telemetry\",\"message\":\" LLMStats: completion calls: 2\"}";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"warnings\":[" + paddedTelemetryWarning + "],"
        + "\"contents\":[{\"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + "\"startPageNumber\":1,\"endPageNumber\":1,\"markdown\":\"text\"}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    assertFalse(rendered.contains("rai_warnings:"));
    assertFalse(rendered.contains("LLMStats:"));
}

// -----------------------------------------------------------------------
// Page markers
// -----------------------------------------------------------------------

@Test
public void pageMarkersNotDuplicatedWhenServiceProvidesMarkers() {
    // The markdown already contains InputPageNumber markers for pages 1 and 2;
    // each marker must appear exactly once in the rendered output.
    String serviceMarkdown
        = "<!-- InputPageNumber: 1 -->\\n\\nFirst page text.\\n\\n<!-- InputPageNumber: 2 -->\\n\\nSecond page text.";
    String pagesJson = " \"pages\":[ {\"pageNumber\":1,\"spans\":[{\"offset\":0,\"length\":47}]},"
        + " {\"pageNumber\":2,\"spans\":[{\"offset\":49,\"length\":48}]} ]";
    String payload = "{\"analyzerId\":\"a\",\"apiVersion\":\"v\","
        + "\"createdAt\":\"2026-01-01T00:00:00Z\",\"stringEncoding\":\"utf16\","
        + "\"contents\":[{ \"kind\":\"document\",\"mimeType\":\"application/pdf\","
        + " \"startPageNumber\":1,\"endPageNumber\":2,"
        + " \"markdown\":\"" + serviceMarkdown + "\","
        + pagesJson + "}]}";

    String rendered = LlmInputHelper.toLlmInput(parseResult(payload));

    for (int page = 1; page <= 2; page++) {
        assertEquals(1, countOccurrences(rendered, "<!-- InputPageNumber: " + page + " -->"));
    }
}

/**
 * Counts the non-overlapping occurrences of {@code needle} in {@code text},
 * scanning left to right.
 *
 * @param text the string to search; must not be {@code null}.
 * @param needle the substring to look for; must not be {@code null}.
 * @return the number of non-overlapping matches, or {@code 0} when {@code needle} is empty.
 */
private static int countOccurrences(String text, String needle) {
    // An empty needle matches at every index without advancing the cursor,
    // so the scan below would never terminate; treat it as "no occurrences".
    if (needle.isEmpty()) {
        return 0;
    }
    int count = 0;
    int idx = 0;
    while ((idx = text.indexOf(needle, idx)) != -1) {
        count++;
        // Jump past the whole match so overlapping occurrences are not counted twice.
        idx += needle.length();
    }
    return count;
}

@Test
public void toLlmInputMultiPageWithSpans() {
String json = "{" + "\"analyzerId\":\"a\",\"apiVersion\":\"v\","
Expand All @@ -201,9 +301,9 @@ public void toLlmInputMultiPageWithSpans() {
String output = LlmInputHelper.toLlmInput(result);

assertTrue(output.contains("pages: 2-4"));
assertTrue(output.contains("<!-- page 2 -->"));
assertTrue(output.contains("<!-- page 3 -->"));
assertTrue(output.contains("<!-- page 4 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 2 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 3 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 4 -->"));
}

@Test
Expand All @@ -216,9 +316,9 @@ public void toLlmInputMultiPageWithPageBreaks() {
AnalysisResult result = parseResult(json);
String output = LlmInputHelper.toLlmInput(result);

assertTrue(output.contains("<!-- page 3 -->"));
assertTrue(output.contains("<!-- page 4 -->"));
assertTrue(output.contains("<!-- page 5 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 3 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 4 -->"));
assertTrue(output.contains("<!-- InputPageNumber: 5 -->"));
assertFalse(output.contains("<!-- PageBreak -->"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,16 @@ public void testToLlmInputMultiPageContentRangeAsync() {
"'pages' value should be '2-3, 5' (original page numbers preserved)");

// Page markers in the markdown body should use the original page numbers
assertFalse(text.contains("<!-- page 1 -->"),
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5");
assertTrue(text.contains("<!-- page 2 -->"),
"Page marker '<!-- page 2 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- page 3 -->"),
"Page marker '<!-- page 3 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- page 5 -->"),
"Page marker '<!-- page 5 -->' should appear in the markdown body");
System.out.println("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->");
assertFalse(text.contains("<!-- InputPageNumber: 1 -->"),
"Page marker '<!-- InputPageNumber: 1 -->' should not appear — we only requested pages 2-3, 5");
assertTrue(text.contains("<!-- InputPageNumber: 2 -->"),
"Page marker '<!-- InputPageNumber: 2 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- InputPageNumber: 3 -->"),
"Page marker '<!-- InputPageNumber: 3 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- InputPageNumber: 5 -->"),
"Page marker '<!-- InputPageNumber: 5 -->' should appear in the markdown body");
System.out.println(
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->");

System.out
.println("[PASS] toLlmInput output validated (" + text.length() + " chars, pages='2-3, 5' preserved)");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,18 @@ public void testToLlmInputMultiPageContentRange() {
"'pages' value should be '2-3, 5' (original page numbers preserved)");

// Page markers in the markdown body should use the original page numbers
// (<!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->), not renumbered (1, 2, 3).
assertFalse(text.contains("<!-- page 1 -->"),
"Page marker '<!-- page 1 -->' should not appear — we only requested pages 2-3, 5");
assertTrue(text.contains("<!-- page 2 -->"),
"Page marker '<!-- page 2 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- page 3 -->"),
"Page marker '<!-- page 3 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- page 5 -->"),
"Page marker '<!-- page 5 -->' should appear in the markdown body");
System.out.println("[PASS] Page markers verified: <!-- page 2 -->, <!-- page 3 -->, <!-- page 5 -->");
// (<!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->),
// not renumbered (1, 2, 3).
assertFalse(text.contains("<!-- InputPageNumber: 1 -->"),
"Page marker '<!-- InputPageNumber: 1 -->' should not appear — we only requested pages 2-3, 5");
assertTrue(text.contains("<!-- InputPageNumber: 2 -->"),
"Page marker '<!-- InputPageNumber: 2 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- InputPageNumber: 3 -->"),
"Page marker '<!-- InputPageNumber: 3 -->' should appear in the markdown body");
assertTrue(text.contains("<!-- InputPageNumber: 5 -->"),
"Page marker '<!-- InputPageNumber: 5 -->' should appear in the markdown body");
System.out.println(
"[PASS] Page markers verified: <!-- InputPageNumber: 2 -->, <!-- InputPageNumber: 3 -->, <!-- InputPageNumber: 5 -->");

System.out
.println("[PASS] toLlmInput output validated (" + text.length() + " chars, pages='2-3, 5' preserved)");
Expand Down