Feat/sanitize strings (#19)

Ma11hewThomas · web-flow · commit 4b2336c75465 · 2025-10-06T19:17:07.000+01:00
* feat: add sanitizeString function and improve CTRF report serialization
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 0.0.11
+
+- Sanitize strings from JUnit reports more thoroughly to avoid invalid characters in the CTRF report
+
 ## 0.0.10
 
 - Add support for maven surefire retries rerunFailingTestsCount
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "junit-to-ctrf",
-  "version": "0.0.10",
+  "version": "0.0.11",
   "description": "Convert JUnit XML reports to CTRF JSON",
   "type": "module",
   "main": "dist/index.js",
@@ -21,6 +21,7 @@
     "e2e:glob": "node dist/cli.js \"reports/*.xml\" --output reports/test-glob-ctrf.json",
     "e2e:surefire-flaky": "node dist/cli.js reports/test-surefire-flaky.xml --output reports/test-surefire-flaky-ctrf.json",
     "e2e:surefire-retry": "node dist/cli.js reports/test-surefire-retry.xml --output reports/test-surefire-retry-ctrf.json",
+    "e2e:junit-problem-string": "node dist/cli.js reports/test-junit-problem-string.xml --output reports/test-junit-problem-string-ctrf.json",
     "docs": "typedoc",
     "docs:watch": "typedoc --watch",
     "all": "npm run build:check && npm run test && npm run lint && npm run format && npm run docs && npm run build"
diff --git a/reports/test-junit-problem-string.xml b/reports/test-junit-problem-string.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--  
+  This file contains PROBLEMATIC CHARACTERS that would break JSON serialization
+  without proper sanitization. These include:
+  - BOM (Byte Order Mark): ﻿ at start of strings
+  - Unicode surrogate pairs and isolated surrogates
+  - Strings with only whitespace that should become undefined
+  - ANSI escape sequences embedded as literal characters in CDATA
+-->
+<testsuites>
+  <testsuite name="com.example.CalculatorTests" tests="5" failures="1" errors="1" skipped="1" time="0.456">
+    <testcase name="addsNumbers" classname="com.example.CalculatorTests" time="0.012" file="CalculatorTests.java" lineno="23"/>
+
+    <testcase name="subtractsNumbers" classname="com.example.CalculatorTests" time="0.003">
+      <skipped/>
+    </testcase>
+
+    <testcase name="multipliesNumbers" classname="com.example.CalculatorTests" time="0.025">
+      <!-- Failure message contains: BOM at start -->
+      <failure message="﻿Expected 10 but was 9" type="org.opentest4j.AssertionFailedError"><![CDATA[org.opentest4j.AssertionFailedError: Expected 10 but was 9
+    at com.example.CalculatorTests.multipliesNumbers(CalculatorTests.java:45)
+Stack trace with ANSI colors: [31mRED[0m
+]]></failure>
+      <!-- System output contains: ANSI escape sequences -->
+      <system-out><![CDATA[[32mstdout line 1[0m
+stdout line 2]]></system-out>
+      <!-- System error contains: ANSI escape sequences -->
+      <system-err><![CDATA[stderr warns...[31mERROR[0m]]></system-err>
+    </testcase>
+
+    <!-- Test name contains: Unicode surrogate -->
+    <testcase name="dividesNumbers𝐀WithSurrogate" classname="com.example.CalculatorTests" time="0.100">
+      <!-- Error message contains: BOM at start -->
+      <error message="﻿java.lang.ArithmeticException: / by zero" type="java.lang.ArithmeticException"><![CDATA[java.lang.ArithmeticException: / by zero
+    at com.example.CalculatorTests.dividesNumbers(CalculatorTests.java:60)
+Error with whitespace only message follows
+]]></error>
+    </testcase>
+
+    <!-- Test name contains: trailing whitespace (name is not whitespace-only, so it stays defined) -->
+    <testcase name="flakyThenPasses   " classname="com.example.CalculatorTests" time="0.200">
+      <!-- Flaky failure message contains: whitespace only (becomes undefined after sanitization) -->
+      <flakyFailure message="   " type="org.opentest4j.AssertionFailedError"><![CDATA[intermittent failure stack
+at line...]]></flakyFailure>
+      <!-- Flaky error message contains: BOM -->
+      <flakyError message="﻿java.lang.RuntimeException" type="java.lang.RuntimeException"><![CDATA[some transient error]]></flakyError>
+      <!-- System output contains: ANSI escape sequences -->
+      <system-out><![CDATA[flaky test stdout[32mGREEN[0m]]></system-out>
+    </testcase>
+  </testsuite>
+
+  <!-- Suite name contains: BOM -->
+  <testsuite name="﻿com.example.RetryTests" tests="2" failures="0" errors="0" skipped="0" time="0.123">
+    <!-- Test name contains: isolated surrogate (will be replaced) -->
+    <testcase name="retriesThenFails�" classname="com.example.RetryTests" time="0.050">
+      <!-- Rerun failure message contains: BOM -->
+      <rerunFailure message="﻿AssertionError" type="org.opentest4j.AssertionFailedError"><![CDATA[first attempt fail]]></rerunFailure>
+      <!-- Rerun error message contains: BOM -->
+      <rerunError message="﻿java.lang.IllegalStateException" type="java.lang.IllegalStateException"><![CDATA[second attempt error]]></rerunError>
+    </testcase>
+
+    <!-- Clean test with no problematic characters -->
+    <testcase name="alwaysPasses" classname="com.example.RetryTests" time="0.010"/>
+  </testsuite>
+</testsuites>
diff --git a/src/convert.test.ts b/src/convert.test.ts
@@ -1,5 +1,5 @@
 import { describe, it, expect } from 'vitest'
-import { createCTRFReport } from './convert.js'
+import { createCTRFReport, sanitizeString } from './convert.js'
 import type { JUnitTestCase } from '../types/junit.js'
 
 describe('createCTRFReport', () => {
@@ -334,3 +334,243 @@ describe('createCTRFReport', () => {
     })
   })
 })
+
+describe('sanitizeString', () => {
+  describe('null and undefined handling', () => {
+    it('should return undefined for undefined input', () => {
+      expect(sanitizeString(undefined)).toBeUndefined()
+    })
+
+    it('should return undefined for empty string', () => {
+      expect(sanitizeString('')).toBeUndefined()
+    })
+
+    it('should return undefined for whitespace-only string', () => {
+      expect(sanitizeString('   \n\t\r   ')).toBeUndefined()
+    })
+  })
+
+  describe('normal strings', () => {
+    it('should pass through normal ASCII strings unchanged', () => {
+      expect(sanitizeString('hello world')).toBe('hello world')
+    })
+
+    it('should preserve safe whitespace characters', () => {
+      expect(sanitizeString('line1\nline2\tindented\rcarriage')).toBe(
+        'line1\nline2\tindented\rcarriage'
+      )
+    })
+
+    it('should preserve quotes and special characters', () => {
+      expect(sanitizeString('He said "hello" & goodbye!')).toBe(
+        'He said "hello" & goodbye!'
+      )
+    })
+
+    it('should preserve Unicode characters (except emojis/surrogates)', () => {
+      expect(sanitizeString('café naïve résumé 测试')).toBe(
+        'café naïve résumé 测试'
+      )
+    })
+
+    it('should replace emoji characters (which use surrogates)', () => {
+      expect(sanitizeString('hello 🎉 world')).toBe('hello �� world')
+    })
+  })
+
+  describe('BOM removal', () => {
+    it('should remove BOM at start of string', () => {
+      expect(sanitizeString('\uFEFFhello world')).toBe('hello world')
+    })
+
+    it('should remove BOM in middle of string', () => {
+      expect(sanitizeString('hello\uFEFF world')).toBe('hello world')
+    })
+
+    it('should remove multiple BOMs', () => {
+      expect(sanitizeString('\uFEFFhello\uFEFF world\uFEFF')).toBe(
+        'hello world'
+      )
+    })
+  })
+
+  describe('control character replacement', () => {
+    it('should replace null bytes with spaces', () => {
+      expect(sanitizeString('hello\x00world')).toBe('hello world')
+    })
+
+    it('should replace backspace with space', () => {
+      expect(sanitizeString('hello\x08world')).toBe('hello world')
+    })
+
+    it('should replace bell character with space', () => {
+      expect(sanitizeString('hello\x07world')).toBe('hello world')
+    })
+
+    it('should replace vertical tab with space', () => {
+      expect(sanitizeString('hello\x0Bworld')).toBe('hello world')
+    })
+
+    it('should replace form feed with space', () => {
+      expect(sanitizeString('hello\x0Cworld')).toBe('hello world')
+    })
+
+    it('should replace escape sequences with space', () => {
+      expect(sanitizeString('hello\x1Bworld')).toBe('hello world')
+    })
+
+    it('should replace DEL character with space', () => {
+      expect(sanitizeString('hello\x7Fworld')).toBe('hello world')
+    })
+
+    it('should replace unit separator with space', () => {
+      expect(sanitizeString('hello\x1Fworld')).toBe('hello world')
+    })
+
+    it('should handle multiple control characters', () => {
+      expect(sanitizeString('hello\x00\x07\x08\x0B\x0C\x1B\x7Fworld')).toBe(
+        'hello       world'
+      )
+    })
+  })
+
+  describe('ANSI escape sequences', () => {
+    it('should clean ANSI color codes', () => {
+      expect(sanitizeString('\x1B[31mred text\x1B[0m')).toBe(
+        ' [31mred text [0m'
+      )
+    })
+
+    it('should clean complex ANSI sequences', () => {
+      expect(
+        sanitizeString('\x1B[32mGREEN\x1B[0m normal \x1B[31mRED\x1B[0m')
+      ).toBe(' [32mGREEN [0m normal  [31mRED [0m')
+    })
+  })
+
+  describe('surrogate handling', () => {
+    it('should replace isolated high surrogate', () => {
+      expect(sanitizeString('hello\uD800world')).toBe('hello�world')
+    })
+
+    it('should replace isolated low surrogate', () => {
+      expect(sanitizeString('hello\uDFFFworld')).toBe('hello�world')
+    })
+
+    it('should handle multiple isolated surrogates', () => {
+      // \uD800\uDC00 is a valid surrogate pair, but each half is still replaced individually; the isolated \uDFFF is replaced too = 3 replacement chars
+      expect(sanitizeString('test\uD800\uDC00\uDFFFend')).toBe('test���end')
+    })
+  })
+
+  describe('Unicode normalization', () => {
+    it('should normalize composed characters', () => {
+      // Composed 'é' vs decomposed 'e' + combining acute
+      const composed = 'café'
+      const decomposed = 'cafe\u0301'
+      expect(sanitizeString(decomposed)).toBe(composed)
+    })
+
+    it('should normalize multiple combining characters', () => {
+      // Test with combining characters that should normalize
+      const input = 'a\u0300\u0301' // a + grave + acute
+      const result = sanitizeString(input)
+      expect(result).toBeDefined()
+      expect(result?.normalize('NFC')).toBe(result)
+    })
+  })
+
+  describe('complex real-world scenarios', () => {
+    it('should handle JUnit error messages with control chars', () => {
+      const input = 'Expected 10 but was 9\x00\x08\x1B[31m'
+      expect(sanitizeString(input)).toBe('Expected 10 but was 9   [31m')
+    })
+
+    it('should handle stack traces with mixed problems', () => {
+      const input = '\uFEFFjava.lang.Exception\x00\x0B\x0C\n    at line 42\x07'
+      expect(sanitizeString(input)).toBe(
+        'java.lang.Exception   \n    at line 42 '
+      )
+    })
+
+    it('should handle system output with ANSI and control chars', () => {
+      const input = '\x1B[32mGREEN\x1B[0m\x00test\x08output'
+      expect(sanitizeString(input)).toBe(' [32mGREEN [0m test output')
+    })
+
+    it('should handle test names with problematic chars', () => {
+      const input = 'testMethod\x1F\x7F\uD800WithProblems'
+      expect(sanitizeString(input)).toBe('testMethod  �WithProblems')
+    })
+  })
+
+  describe('edge cases', () => {
+    it('should handle string with only control characters', () => {
+      expect(sanitizeString('\x00\x07\x08\x1B')).toBeUndefined()
+    })
+
+    it('should handle string with only BOM', () => {
+      expect(sanitizeString('\uFEFF')).toBeUndefined()
+    })
+
+    it('should handle string with only surrogates', () => {
+      // Surrogates get replaced with replacement chars, not removed
+      expect(sanitizeString('\uD800\uDFFF')).toBe('��')
+    })
+
+    it('should preserve single valid character', () => {
+      expect(sanitizeString('a')).toBe('a')
+    })
+
+    it('should handle very long strings with scattered problems', () => {
+      const input =
+        'a'.repeat(1000) +
+        '\x00' +
+        'b'.repeat(1000) +
+        '\x1B[31m' +
+        'c'.repeat(1000)
+      const result = sanitizeString(input)
+      expect(result).toBeDefined()
+      // 3000 good chars + 1 space (null) + 5 chars from ANSI sequence (\x1B becomes space, [31m stays)
+      expect(result?.length).toBe(3006)
+    })
+  })
+
+  describe('JSON serialization safety', () => {
+    it('should produce strings that can be JSON serialized', () => {
+      const problematicInputs = [
+        'test\x00with\x07null\x08bytes',
+        '\uFEFFBOM\x1B[31mANSI\x0Bvertical\x0Cform',
+        'surrogate\uD800\uDFFFpairs',
+        '\x00\x01\x02\x03\x04\x05\x06\x07\x08',
+        '\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F',
+      ]
+
+      problematicInputs.forEach(input => {
+        const sanitized = sanitizeString(input)
+        if (sanitized) {
+          expect(() => JSON.stringify({ test: sanitized })).not.toThrow()
+        }
+      })
+    })
+
+    it('should handle real Gradle JUnit output patterns', () => {
+      const gradleOutputs = [
+        'Test failed: Expected <10> but was: <9>\x00',
+        '\x1B[31mFAILED\x1B[0m com.example.Test.method\x07',
+        'java.lang.AssertionError\x0B\x0C\n    at Assert.fail(Assert.java:42)',
+        '\uFEFFCaused by: java.lang.NullPointerException\x08',
+      ]
+
+      gradleOutputs.forEach(output => {
+        const sanitized = sanitizeString(output)
+        if (sanitized) {
+          expect(() => JSON.stringify(sanitized)).not.toThrow()
+          expect(() =>
+            JSON.parse(JSON.stringify({ msg: sanitized }))
+          ).not.toThrow()
+        }
+      })
+    })
+  })
+})
diff --git a/src/convert.ts b/src/convert.ts