Skip to content

Commit 4b2336c

Browse files
Feat/sanitize strings (#19)
* feat: add sanitizeString function and improve CTRF report serialization
1 parent 1af5455 commit 4b2336c

File tree

6 files changed

+411
-17
lines changed

6 files changed

+411
-17
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## 0.0.11
4+
5+
- Sanitize strings in JUnit report more thoroughly to avoid invalid characters in CTRF report
6+
37
## 0.0.10
48

59
- Add support for maven surefire retries rerunFailingTestsCount

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "junit-to-ctrf",
3-
"version": "0.0.10",
3+
"version": "0.0.11",
44
"description": "Convert JUnit XML reports to CTRF JSON",
55
"type": "module",
66
"main": "dist/index.js",
@@ -21,6 +21,7 @@
2121
"e2e:glob": "node dist/cli.js \"reports/*.xml\" --output reports/test-glob-ctrf.json",
2222
"e2e:surefire-flaky": "node dist/cli.js reports/test-surefire-flaky.xml --output reports/test-surefire-flaky-ctrf.json",
2323
"e2e:surefire-retry": "node dist/cli.js reports/test-surefire-retry.xml --output reports/test-surefire-retry-ctrf.json",
24+
"e2e:junit-problem-string": "node dist/cli.js reports/test-junit-problem-string.xml --output reports/test-junit-problem-string-ctrf.json",
2425
"docs": "typedoc",
2526
"docs:watch": "typedoc --watch",
2627
"all": "npm run build:check && npm run test && npm run lint && npm run format && npm run docs && npm run build"
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
This file contains PROBLEMATIC CHARACTERS that would break JSON serialization
4+
without proper sanitization. These include:
5+
- BOM (Byte Order Mark, U+FEFF) at start of strings
6+
- Unicode surrogate pairs and isolated surrogates
7+
- Strings with only whitespace that should become undefined
8+
- ANSI escape sequences embedded as literal characters in CDATA
9+
-->
10+
<testsuites>
11+
<testsuite name="com.example.CalculatorTests" tests="5" failures="1" errors="1" skipped="1" time="0.456">
12+
<testcase name="addsNumbers" classname="com.example.CalculatorTests" time="0.012" file="CalculatorTests.java" lineno="23"/>
13+
14+
<testcase name="subtractsNumbers" classname="com.example.CalculatorTests" time="0.003">
15+
<skipped/>
16+
</testcase>
17+
18+
<testcase name="multipliesNumbers" classname="com.example.CalculatorTests" time="0.025">
19+
<!-- Failure message contains: BOM at start -->
20+
<failure message="Expected 10 but was 9" type="org.opentest4j.AssertionFailedError"><![CDATA[org.opentest4j.AssertionFailedError: Expected 10 but was 9
21+
at com.example.CalculatorTests.multipliesNumbers(CalculatorTests.java:45)
22+
Stack trace with ANSI colors: [31mRED[0m
23+
]]></failure>
24+
<!-- System output contains: ANSI escape sequences -->
25+
<system-out><![CDATA[[32mstdout line 1[0m
26+
stdout line 2]]></system-out>
27+
<!-- System error contains: ANSI escape sequences -->
28+
<system-err><![CDATA[stderr warns...[31mERROR[0m]]></system-err>
29+
</testcase>
30+
31+
<!-- Test name contains: Unicode surrogate -->
32+
<testcase name="dividesNumbers𝐀WithSurrogate" classname="com.example.CalculatorTests" time="0.100">
33+
<!-- Error message contains: BOM at start -->
34+
<error message="java.lang.ArithmeticException: / by zero" type="java.lang.ArithmeticException"><![CDATA[java.lang.ArithmeticException: / by zero
35+
at com.example.CalculatorTests.dividesNumbers(CalculatorTests.java:60)
36+
Error with whitespace only message follows
37+
]]></error>
38+
</testcase>
39+
40+
<!-- Test name contains: trailing whitespace; the whitespace-only flakyFailure message below should become undefined -->
41+
<testcase name="flakyThenPasses " classname="com.example.CalculatorTests" time="0.200">
42+
<!-- Flaky failure message contains: empty after sanitization -->
43+
<flakyFailure message=" " type="org.opentest4j.AssertionFailedError"><![CDATA[intermittent failure stack
44+
at line...]]></flakyFailure>
45+
<!-- Flaky error message contains: BOM -->
46+
<flakyError message="java.lang.RuntimeException" type="java.lang.RuntimeException"><![CDATA[some transient error]]></flakyError>
47+
<!-- System output contains: ANSI escape sequences -->
48+
<system-out><![CDATA[flaky test stdout[32mGREEN[0m]]></system-out>
49+
</testcase>
50+
</testsuite>
51+
52+
<!-- Suite name contains: BOM -->
53+
<testsuite name="com.example.RetryTests" tests="2" failures="0" errors="0" skipped="0" time="0.123">
54+
<!-- Test name contains: isolated surrogate (will be replaced) -->
55+
<testcase name="retriesThenFails�" classname="com.example.RetryTests" time="0.050">
56+
<!-- Rerun failure message contains: BOM -->
57+
<rerunFailure message="AssertionError" type="org.opentest4j.AssertionFailedError"><![CDATA[first attempt fail]]></rerunFailure>
58+
<!-- Rerun error message contains: BOM -->
59+
<rerunError message="java.lang.IllegalStateException" type="java.lang.IllegalStateException"><![CDATA[second attempt error]]></rerunError>
60+
</testcase>
61+
62+
<!-- Clean test with no problematic characters -->
63+
<testcase name="alwaysPasses" classname="com.example.RetryTests" time="0.010"/>
64+
</testsuite>
65+
</testsuites>

src/convert.test.ts

Lines changed: 241 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, it, expect } from 'vitest'
2-
import { createCTRFReport } from './convert.js'
2+
import { createCTRFReport, sanitizeString } from './convert.js'
33
import type { JUnitTestCase } from '../types/junit.js'
44

55
describe('createCTRFReport', () => {
@@ -334,3 +334,243 @@ describe('createCTRFReport', () => {
334334
})
335335
})
336336
})
337+
338+
describe('sanitizeString', () => {
339+
describe('null and undefined handling', () => {
340+
it('should return undefined for undefined input', () => {
341+
expect(sanitizeString(undefined)).toBeUndefined()
342+
})
343+
344+
it('should return undefined for empty string', () => {
345+
expect(sanitizeString('')).toBeUndefined()
346+
})
347+
348+
it('should return undefined for whitespace-only string', () => {
349+
expect(sanitizeString(' \n\t\r ')).toBeUndefined()
350+
})
351+
})
352+
353+
describe('normal strings', () => {
354+
it('should pass through normal ASCII strings unchanged', () => {
355+
expect(sanitizeString('hello world')).toBe('hello world')
356+
})
357+
358+
it('should preserve safe whitespace characters', () => {
359+
expect(sanitizeString('line1\nline2\tindented\rcarriage')).toBe(
360+
'line1\nline2\tindented\rcarriage'
361+
)
362+
})
363+
364+
it('should preserve quotes and special characters', () => {
365+
expect(sanitizeString('He said "hello" & goodbye!')).toBe(
366+
'He said "hello" & goodbye!'
367+
)
368+
})
369+
370+
it('should preserve Unicode characters (except emojis/surrogates)', () => {
371+
expect(sanitizeString('café naïve résumé 测试')).toBe(
372+
'café naïve résumé 测试'
373+
)
374+
})
375+
376+
it('should replace emoji characters (which use surrogates)', () => {
377+
expect(sanitizeString('hello 🎉 world')).toBe('hello �� world')
378+
})
379+
})
380+
381+
describe('BOM removal', () => {
382+
it('should remove BOM at start of string', () => {
383+
expect(sanitizeString('\uFEFFhello world')).toBe('hello world')
384+
})
385+
386+
it('should remove BOM in middle of string', () => {
387+
expect(sanitizeString('hello\uFEFF world')).toBe('hello world')
388+
})
389+
390+
it('should remove multiple BOMs', () => {
391+
expect(sanitizeString('\uFEFFhello\uFEFF world\uFEFF')).toBe(
392+
'hello world'
393+
)
394+
})
395+
})
396+
397+
describe('control character replacement', () => {
398+
it('should replace null bytes with spaces', () => {
399+
expect(sanitizeString('hello\x00world')).toBe('hello world')
400+
})
401+
402+
it('should replace backspace with space', () => {
403+
expect(sanitizeString('hello\x08world')).toBe('hello world')
404+
})
405+
406+
it('should replace bell character with space', () => {
407+
expect(sanitizeString('hello\x07world')).toBe('hello world')
408+
})
409+
410+
it('should replace vertical tab with space', () => {
411+
expect(sanitizeString('hello\x0Bworld')).toBe('hello world')
412+
})
413+
414+
it('should replace form feed with space', () => {
415+
expect(sanitizeString('hello\x0Cworld')).toBe('hello world')
416+
})
417+
418+
it('should replace escape sequences with space', () => {
419+
expect(sanitizeString('hello\x1Bworld')).toBe('hello world')
420+
})
421+
422+
it('should replace DEL character with space', () => {
423+
expect(sanitizeString('hello\x7Fworld')).toBe('hello world')
424+
})
425+
426+
it('should replace unit separator with space', () => {
427+
expect(sanitizeString('hello\x1Fworld')).toBe('hello world')
428+
})
429+
430+
it('should handle multiple control characters', () => {
431+
expect(sanitizeString('hello\x00\x07\x08\x0B\x0C\x1B\x7Fworld')).toBe(
432+
'hello world'
433+
)
434+
})
435+
})
436+
437+
describe('ANSI escape sequences', () => {
438+
it('should clean ANSI color codes', () => {
439+
expect(sanitizeString('\x1B[31mred text\x1B[0m')).toBe(
440+
' [31mred text [0m'
441+
)
442+
})
443+
444+
it('should clean complex ANSI sequences', () => {
445+
expect(
446+
sanitizeString('\x1B[32mGREEN\x1B[0m normal \x1B[31mRED\x1B[0m')
447+
).toBe(' [32mGREEN [0m normal [31mRED [0m')
448+
})
449+
})
450+
451+
describe('surrogate handling', () => {
452+
it('should replace isolated high surrogate', () => {
453+
expect(sanitizeString('hello\uD800world')).toBe('hello�world')
454+
})
455+
456+
it('should replace isolated low surrogate', () => {
457+
expect(sanitizeString('hello\uDFFFworld')).toBe('hello�world')
458+
})
459+
460+
it('should handle multiple isolated surrogates', () => {
461+
// \uD800\uDC00 forms a valid surrogate pair, but each of its two code units is still replaced, and the isolated \uDFFF is replaced too = 3 replacements
462+
expect(sanitizeString('test\uD800\uDC00\uDFFFend')).toBe('test���end')
463+
})
464+
})
465+
466+
describe('Unicode normalization', () => {
467+
it('should normalize composed characters', () => {
468+
// Composed 'é' vs decomposed 'e' + combining acute
469+
const composed = 'café'
470+
const decomposed = 'cafe\u0301'
471+
expect(sanitizeString(decomposed)).toBe(composed)
472+
})
473+
474+
it('should normalize multiple combining characters', () => {
475+
// Test with combining characters that should normalize
476+
const input = 'a\u0300\u0301' // a + grave + acute
477+
const result = sanitizeString(input)
478+
expect(result).toBeDefined()
479+
expect(result?.normalize('NFC')).toBe(result)
480+
})
481+
})
482+
483+
describe('complex real-world scenarios', () => {
484+
it('should handle JUnit error messages with control chars', () => {
485+
const input = 'Expected 10 but was 9\x00\x08\x1B[31m'
486+
expect(sanitizeString(input)).toBe('Expected 10 but was 9 [31m')
487+
})
488+
489+
it('should handle stack traces with mixed problems', () => {
490+
const input = '\uFEFFjava.lang.Exception\x00\x0B\x0C\n at line 42\x07'
491+
expect(sanitizeString(input)).toBe(
492+
'java.lang.Exception \n at line 42 '
493+
)
494+
})
495+
496+
it('should handle system output with ANSI and control chars', () => {
497+
const input = '\x1B[32mGREEN\x1B[0m\x00test\x08output'
498+
expect(sanitizeString(input)).toBe(' [32mGREEN [0m test output')
499+
})
500+
501+
it('should handle test names with problematic chars', () => {
502+
const input = 'testMethod\x1F\x7F\uD800WithProblems'
503+
expect(sanitizeString(input)).toBe('testMethod �WithProblems')
504+
})
505+
})
506+
507+
describe('edge cases', () => {
508+
it('should handle string with only control characters', () => {
509+
expect(sanitizeString('\x00\x07\x08\x1B')).toBeUndefined()
510+
})
511+
512+
it('should handle string with only BOM', () => {
513+
expect(sanitizeString('\uFEFF')).toBeUndefined()
514+
})
515+
516+
it('should handle string with only surrogates', () => {
517+
// Surrogates get replaced with replacement chars, not removed
518+
expect(sanitizeString('\uD800\uDFFF')).toBe('��')
519+
})
520+
521+
it('should preserve single valid character', () => {
522+
expect(sanitizeString('a')).toBe('a')
523+
})
524+
525+
it('should handle very long strings with scattered problems', () => {
526+
const input =
527+
'a'.repeat(1000) +
528+
'\x00' +
529+
'b'.repeat(1000) +
530+
'\x1B[31m' +
531+
'c'.repeat(1000)
532+
const result = sanitizeString(input)
533+
expect(result).toBeDefined()
534+
// 3000 good chars + 1 space (null) + 5 chars from ANSI sequence (\x1B becomes space, [31m stays) = 3006
535+
expect(result?.length).toBe(3006)
536+
})
537+
})
538+
539+
describe('JSON serialization safety', () => {
540+
it('should produce strings that can be JSON serialized', () => {
541+
const problematicInputs = [
542+
'test\x00with\x07null\x08bytes',
543+
'\uFEFFBOM\x1B[31mANSI\x0Bvertical\x0Cform',
544+
'surrogate\uD800\uDFFFpairs',
545+
'\x00\x01\x02\x03\x04\x05\x06\x07\x08',
546+
'\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F',
547+
]
548+
549+
problematicInputs.forEach(input => {
550+
const sanitized = sanitizeString(input)
551+
if (sanitized) {
552+
expect(() => JSON.stringify({ test: sanitized })).not.toThrow()
553+
}
554+
})
555+
})
556+
557+
it('should handle real Gradle JUnit output patterns', () => {
558+
const gradleOutputs = [
559+
'Test failed: Expected <10> but was: <9>\x00',
560+
'\x1B[31mFAILED\x1B[0m com.example.Test.method\x07',
561+
'java.lang.AssertionError\x0B\x0C\n at Assert.fail(Assert.java:42)',
562+
'\uFEFFCaused by: java.lang.NullPointerException\x08',
563+
]
564+
565+
gradleOutputs.forEach(output => {
566+
const sanitized = sanitizeString(output)
567+
if (sanitized) {
568+
expect(() => JSON.stringify(sanitized)).not.toThrow()
569+
expect(() =>
570+
JSON.parse(JSON.stringify({ msg: sanitized }))
571+
).not.toThrow()
572+
}
573+
})
574+
})
575+
})
576+
})

0 commit comments

Comments
 (0)