diff --git a/internal/impl/xml/bloblang.go b/internal/impl/xml/bloblang.go
index e7b836c8a3..de517a9de5 100644
--- a/internal/impl/xml/bloblang.go
+++ b/internal/impl/xml/bloblang.go
@@ -42,8 +42,15 @@ func init() {
`{"doc":"This is a title123True"}`,
`{"doc":{"root":{"bool":true,"number":{"#text":123,"-id":99},"title":"This is a title"}}}`,
}).
+ Example("Parse XML preserving namespace prefixes so the original document is reconstructable", `root.doc = this.doc.parse_xml(preserve_namespaces: true)`, [2]string{
+ `{"doc":"Hello"}`,
+ `{"doc":{"root":{"-xmlns:dc":"http://my.namespace/dc","dc:title":"Hello"}}}`,
+ }).
Param(bloblang.NewBoolParam("cast").
Description("Whether to automatically cast numeric and boolean string values to their proper types. When false, all values remain as strings.").
+ Optional().Default(false)).
+ Param(bloblang.NewBoolParam("preserve_namespaces").
+ Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped.").
Optional().Default(false)),
func(args *bloblang.ParsedParams) (bloblang.Method, error) {
castOpt, err := args.GetOptionalBool("cast")
@@ -54,8 +61,24 @@ func init() {
if castOpt != nil {
cast = *castOpt
}
+ preserveOpt, err := args.GetOptionalBool("preserve_namespaces")
+ if err != nil {
+ return nil, err
+ }
+ preserveNS := false
+ if preserveOpt != nil {
+ preserveNS = *preserveOpt
+ }
return bloblang.BytesMethod(func(xmlBytes []byte) (any, error) {
- xmlObj, err := ToMap(xmlBytes, cast)
+ var (
+ xmlObj map[string]any
+ err error
+ )
+ if preserveNS {
+ xmlObj, err = ToMapPreserveNS(xmlBytes, cast)
+ } else {
+ xmlObj, err = ToMap(xmlBytes, cast)
+ }
if err != nil {
return nil, fmt.Errorf("parsing value as XML: %w", err)
}
diff --git a/internal/impl/xml/bloblang_test.go b/internal/impl/xml/bloblang_test.go
index 3d247dc78c..80eaa9b8c1 100644
--- a/internal/impl/xml/bloblang_test.go
+++ b/internal/impl/xml/bloblang_test.go
@@ -48,6 +48,23 @@ func TestParseXML(t *testing.T) {
args: `true`,
exp: map[string]any{"root": map[string]any{"bool": true, "number": map[string]any{"#text": float64(123), "-id": float64(99)}, "title": "This is a title"}},
},
+ {
+ name: "preserve namespaces for issue 3928",
+ target: `Hello`,
+ args: `preserve_namespaces: true`,
+ exp: map[string]any{"root": map[string]any{
+ "-xmlns:dc": "http://my.namespace/dc",
+ "dc:title": "Hello",
+ }},
+ },
+ {
+ name: "preserve namespaces is opt-in",
+ target: `Hello`,
+ exp: map[string]any{"root": map[string]any{
+ "-dc": "http://my.namespace/dc",
+ "title": "Hello",
+ }},
+ },
}
for _, test := range testCases {
diff --git a/internal/impl/xml/package.go b/internal/impl/xml/package.go
index e7a454f087..9398efae06 100644
--- a/internal/impl/xml/package.go
+++ b/internal/impl/xml/package.go
@@ -19,7 +19,12 @@
package xml
import (
+ "bytes"
"encoding/xml"
+ "fmt"
+ "io"
+ "strconv"
+ "strings"
"github.com/clbanning/mxj/v2"
"golang.org/x/net/html/charset"
@@ -41,3 +46,150 @@ func ToMap(xmlBytes []byte, cast bool) (map[string]any, error) {
}
return map[string]any(root), nil
}
+
+// ToMapPreserveNS converts an XML document into a generic map while keeping
+// namespace prefixes on element and attribute keys (e.g. "<dc:title>" becomes
+// the key "dc:title") and keeping xmlns declarations as attributes, so that
+// the original XML can be rebuilt from the resulting structure. Apart from
+// that the layout follows [ToMap]: attribute keys start with "-", text that
+// sits next to attributes or child elements is stored under "#text", and
+// repeated sibling elements are gathered into an array.
+//
+// The returned map has a single entry keyed by the root element name. An
+// error is returned when the decoder fails or when the input contains no
+// element at all. When cast is true, attribute and text values are passed
+// through castString for numeric/boolean conversion.
+func ToMapPreserveNS(xmlBytes []byte, cast bool) (map[string]any, error) {
+	decoder := xml.NewDecoder(bytes.NewReader(xmlBytes))
+	decoder.Strict = false
+	decoder.CharsetReader = charset.NewReaderLabel
+
+	// Anything ahead of the first start tag (prolog, comments, whitespace) is
+	// skipped; the first start element is treated as the document root.
+	for {
+		token, err := decoder.Token()
+		switch {
+		case err == io.EOF:
+			return nil, fmt.Errorf("xml: no root element found")
+		case err != nil:
+			return nil, err
+		}
+
+		root, isStart := token.(xml.StartElement)
+		if !isStart {
+			continue
+		}
+
+		name, value, err := parseElementNS(decoder, root, map[string]string{}, cast)
+		if err != nil {
+			return nil, err
+		}
+		return map[string]any{name: value}, nil
+	}
+}
+
+// collectPrefixDecls records URI→prefix mappings from xmlns:* attributes on an
+// element into the provided scope map.
+func collectPrefixDecls(se xml.StartElement, scope map[string]string) {
+ for _, a := range se.Attr {
+ if a.Name.Space == "xmlns" {
+ scope[a.Value] = a.Name.Local
+ }
+ }
+}
+
+// qnameWithPrefix returns "prefix:local" when a prefix is known for the
+// namespace URI; when the namespace was never bound via xmlns, Go's decoder
+// leaves Name.Space as the raw prefix string which is used directly.
+func qnameWithPrefix(n xml.Name, scope map[string]string) string {
+ if n.Space == "" {
+ return n.Local
+ }
+ if p, ok := scope[n.Space]; ok {
+ return p + ":" + n.Local
+ }
+ return n.Space + ":" + n.Local
+}
+
+// parseElementNS consumes tokens from dec until the end tag that closes se and
+// returns the element's qualified name together with its converted value.
+//
+// parent holds the namespace bindings inherited from the enclosing element. It
+// is copied before this element's own declarations are added, so sibling
+// elements never see each other's bindings.
+//
+// The value is a plain (optionally cast) string when the element has neither
+// attributes nor child elements. Otherwise it is a map holding attributes
+// under "-name" keys, child elements under their qualified names (repeated
+// names are collected into a []any) and any non-blank text under "#text".
+func parseElementNS(dec *xml.Decoder, se xml.StartElement, parent map[string]string, cast bool) (string, any, error) {
+	scope := make(map[string]string, len(parent)+len(se.Attr))
+	for key, val := range parent {
+		scope[key] = val
+	}
+	collectPrefixDecls(se, scope)
+
+	node := map[string]any{}
+	for _, attr := range se.Attr {
+		// Namespace declarations are kept verbatim and never cast: their
+		// values are URIs, not data.
+		if attr.Name.Space == "xmlns" {
+			node["-xmlns:"+attr.Name.Local] = attr.Value
+			continue
+		}
+		if attr.Name.Space == "" && attr.Name.Local == "xmlns" {
+			node["-xmlns"] = attr.Value
+			continue
+		}
+		node["-"+qnameWithPrefix(attr.Name, scope)] = castString(attr.Value, cast)
+	}
+
+	var content strings.Builder
+	for {
+		tok, err := dec.Token()
+		if err != nil {
+			return "", nil, err
+		}
+
+		switch t := tok.(type) {
+		case xml.CharData:
+			content.Write(t)
+
+		case xml.StartElement:
+			childKey, childVal, err := parseElementNS(dec, t, scope, cast)
+			if err != nil {
+				return "", nil, err
+			}
+			if prev, seen := node[childKey]; !seen {
+				node[childKey] = childVal
+			} else if list, isList := prev.([]any); isList {
+				node[childKey] = append(list, childVal)
+			} else {
+				// Second occurrence of this name: promote to an array.
+				node[childKey] = []any{prev, childVal}
+			}
+
+		case xml.EndElement:
+			name := qnameWithPrefix(se.Name, scope)
+			trimmed := strings.TrimSpace(content.String())
+			if len(node) == 0 {
+				return name, castString(trimmed, cast), nil
+			}
+			if trimmed != "" {
+				node["#text"] = castString(trimmed, cast)
+			}
+			return name, node, nil
+		}
+	}
+}
+
+// castString converts s to a typed value when cast is true and returns s
+// unchanged otherwise. Conversions are attempted in this order: int64, uint64,
+// float64, bool. The empty string is never converted.
+//
+// Spellings of NaN and infinity are always left as strings. strconv.ParseFloat
+// accepts "nan" as well as optionally signed "inf"/"infinity" in any letter
+// case, and the resulting non-finite float cannot be encoded by encoding/json,
+// which would make the whole message fail to serialise.
+//
+// Booleans are only recognised for short values starting with t/T/f/F so that
+// "1" and "0" stay numeric rather than becoming true/false.
+//
+// NOTE(review): the existing parse_xml test expects float64 for integers in
+// the default (non-preserving) mode, whereas this returns int64/uint64 —
+// confirm the difference is intended.
+func castString(s string, cast bool) any {
+	if !cast || s == "" {
+		return s
+	}
+	switch strings.TrimLeft(strings.ToLower(s), "+-") {
+	case "nan", "inf", "infinity":
+		return s
+	}
+	if i, err := strconv.ParseInt(s, 10, 64); err == nil {
+		return i
+	}
+	if u, err := strconv.ParseUint(s, 10, 64); err == nil {
+		return u
+	}
+	if f, err := strconv.ParseFloat(s, 64); err == nil {
+		return f
+	}
+	if len(s) < 6 {
+		switch s[:1] {
+		case "t", "T", "f", "F":
+			if b, err := strconv.ParseBool(s); err == nil {
+				return b
+			}
+		}
+	}
+	return s
+}
diff --git a/internal/impl/xml/processor.go b/internal/impl/xml/processor.go
index 824792d9da..5fc05e9bf0 100644
--- a/internal/impl/xml/processor.go
+++ b/internal/impl/xml/processor.go
@@ -22,8 +22,9 @@ import (
)
const (
- pFieldOperator = "operator"
- pFieldCast = "cast"
+ pFieldOperator = "operator"
+ pFieldCast = "cast"
+ pFieldPreserveNamespace = "preserve_namespaces"
)
func xmlProcSpec() *service.ConfigSpec {
@@ -91,6 +92,43 @@ With cast set to true, the resulting JSON structure would look like this:
]
}
}
+`+"```"+`
+
+== Preserving XML namespaces
+
+By default namespace prefixes on elements and attributes are dropped during conversion (e.g. `+"`<dc:title>`"+` becomes the key `+"`title`"+`), which makes the original XML impossible to reconstruct from the resulting JSON. Set `+"`preserve_namespaces`"+` to `+"`true`"+` to retain prefixes on element and attribute keys and to keep `+"`xmlns:*`"+` declarations as attributes.
+
+For example, given the following XML:
+
+`+"```xml"+`
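+<root xmlns:dc="http://my.namespace/dc" xmlns:ot="http://my.namespace/ot">
+  <dc:title>This is a title</dc:title>
+  <dc:description tone="boring">This is a description</dc:description>
+  <ot:elements id="1">foo1</ot:elements>
+  <ot:elements id="2">foo2</ot:elements>
+  <ot:elements>foo3</ot:elements>
+</root>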
+`+"```"+`
+
+With `+"`preserve_namespaces: true`"+` the resulting JSON structure would look like this:
+
+`+"```json"+`
+{
+ "root":{
+ "-xmlns:dc":"http://my.namespace/dc",
+ "-xmlns:ot":"http://my.namespace/ot",
+ "dc:title":"This is a title",
+ "dc:description":{
+ "#text":"This is a description",
+ "-tone":"boring"
+ },
+ "ot:elements":[
+ {"#text":"foo1","-id":"1"},
+ {"#text":"foo2","-id":"2"},
+ "foo3"
+ ]
+ }
+}
`+"```").
Fields(
service.NewStringEnumField(pFieldOperator, "to_json").
@@ -99,6 +137,9 @@ With cast set to true, the resulting JSON structure would look like this:
service.NewBoolField(pFieldCast).
Description("Whether to try to cast values that are numbers and booleans to the right type. Default: all values are strings.").
Default(false),
+ service.NewBoolField(pFieldPreserveNamespace).
+ Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped.").
+ Default(false),
)
}
@@ -111,8 +152,9 @@ func init() {
}
type xmlProc struct {
- log *service.Logger
- cast bool
+ log *service.Logger
+ cast bool
+ preserveNSPrefix bool
}
func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xmlProc, error) {
@@ -129,9 +171,15 @@ func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xm
return nil, err
}
+ preserveNS, err := pConf.FieldBool(pFieldPreserveNamespace)
+ if err != nil {
+ return nil, err
+ }
+
j := &xmlProc{
- log: mgr.Logger(),
- cast: cast,
+ log: mgr.Logger(),
+ cast: cast,
+ preserveNSPrefix: preserveNS,
}
return j, nil
}
@@ -142,7 +190,12 @@ func (p *xmlProc) Process(_ context.Context, msg *service.Message) (service.Mess
return nil, err
}
- root, err := ToMap(mBytes, p.cast)
+ var root map[string]any
+ if p.preserveNSPrefix {
+ root, err = ToMapPreserveNS(mBytes, p.cast)
+ } else {
+ root, err = ToMap(mBytes, p.cast)
+ }
if err != nil {
p.log.Debugf("Failed to parse part as XML: %v", err)
return nil, err
diff --git a/internal/impl/xml/processor_test.go b/internal/impl/xml/processor_test.go
index 3b87e5a96a..c17b747162 100644
--- a/internal/impl/xml/processor_test.go
+++ b/internal/impl/xml/processor_test.go
@@ -146,3 +146,117 @@ cast: true
assert.Equal(t, `{"root":{"bool":true,"number":{"#text":123,"-id":99},"title":"This is a title"}}`, string(mBytes))
}
+
+// TestXMLPreserveNamespaces runs the xml processor with
+// preserve_namespaces enabled over a table of documents and compares the
+// serialised JSON result with the expected string for each case.
+func TestXMLPreserveNamespaces(t *testing.T) {
+	type testCase struct {
+		name   string
+		input  string
+		output string
+	}
+	tests := []testCase{
+		{
+			name: "issue 3928 example",
+			input: `<root xmlns:dc="http://my.namespace/dc" xmlns:ot="http://my.namespace/ot">
+  <dc:title>This is a title</dc:title>
+  <dc:description tone="boring">This is a description</dc:description>
+  <ot:elements id="1">foo1</ot:elements>
+  <ot:elements id="2">foo2</ot:elements>
+  <ot:elements>foo3</ot:elements>
+</root>`,
+			output: `{"root":{"-xmlns:dc":"http://my.namespace/dc","-xmlns:ot":"http://my.namespace/ot","dc:description":{"#text":"This is a description","-tone":"boring"},"dc:title":"This is a title","ot:elements":[{"#text":"foo1","-id":"1"},{"#text":"foo2","-id":"2"},"foo3"]}}`,
+		},
+		{
+			name: "no namespaces behaves like default mode",
+			input: `<root>
+  <next>foo1</next>
+</root>`,
+			output: `{"root":{"next":"foo1"}}`,
+		},
+		{
+			// The inner element rebinds prefix "a" to a different URI; both
+			// levels must still be rendered with the prefix that was written.
+			name: "nested element redeclares prefix",
+			input: `<root xmlns:a="urn:outer">
+  <a:item>outer</a:item>
+  <inner xmlns:a="urn:inner">
+    <a:item>inner</a:item>
+  </inner>
+</root>`,
+			output: `{"root":{"-xmlns:a":"urn:outer","a:item":"outer","inner":{"-xmlns:a":"urn:inner","a:item":"inner"}}}`,
+		},
+		{
+			name: "attribute with namespace prefix",
+			input: `<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><item xsi:type="string">foo</item>
+</root>`,
+			output: `{"root":{"-xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance","item":{"#text":"foo","-xsi:type":"string"}}}`,
+		},
+		{
+			name:   "prefix used without xmlns declaration stays literal",
+			input:  `<root><dc:title>Hello</dc:title></root>`,
+			output: `{"root":{"dc:title":"Hello"}}`,
+		},
+	}
+
+	pConf, err := xmlProcSpec().ParseYAML(`
+operator: to_json
+preserve_namespaces: true
+`, nil)
+	require.NoError(t, err)
+
+	proc, err := xmlProcFromParsed(pConf, service.MockResources())
+	require.NoError(t, err)
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(test.input)))
+			require.NoError(t, err)
+			require.Len(t, msgsOut, 1)
+
+			mBytes, err := msgsOut[0].AsBytes()
+			require.NoError(t, err)
+
+			assert.Equal(t, test.output, string(mBytes))
+		})
+	}
+}
+
+// TestXMLPreserveNamespacesWithCast checks that cast and preserve_namespaces
+// work together: prefixed element and attribute values are converted to
+// numbers/booleans while the xmlns declaration itself stays a string.
+func TestXMLPreserveNamespacesWithCast(t *testing.T) {
+	pConf, err := xmlProcSpec().ParseYAML(`
+operator: to_json
+cast: true
+preserve_namespaces: true
+`, nil)
+	require.NoError(t, err)
+
+	proc, err := xmlProcFromParsed(pConf, service.MockResources())
+	require.NoError(t, err)
+
+	testString := `<root xmlns:n="urn:num"><n:value id="99">123</n:value><n:flag>True</n:flag></root>`
+
+	msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(testString)))
+	require.NoError(t, err)
+	require.Len(t, msgsOut, 1)
+
+	mBytes, err := msgsOut[0].AsBytes()
+	require.NoError(t, err)
+
+	assert.Equal(t, `{"root":{"-xmlns:n":"urn:num","n:flag":true,"n:value":{"#text":123,"-id":99}}}`, string(mBytes))
+}
+
+// TestXMLDefaultStripsNamespacesUnchanged runs the processor without
+// preserve_namespaces on a namespaced document and asserts the prefix-less
+// output.
+func TestXMLDefaultStripsNamespacesUnchanged(t *testing.T) {
+	// Regression guard: the default (preserve_namespaces omitted) must keep
+	// the previous lossy-but-backwards-compatible behaviour.
+	pConf, err := xmlProcSpec().ParseYAML(`operator: to_json`, nil)
+	require.NoError(t, err)
+
+	proc, err := xmlProcFromParsed(pConf, service.MockResources())
+	require.NoError(t, err)
+
+	testString := `<root xmlns:dc="http://my.namespace/dc"><dc:title>Hello</dc:title></root>`
+
+	msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(testString)))
+	require.NoError(t, err)
+	require.Len(t, msgsOut, 1)
+
+	mBytes, err := msgsOut[0].AsBytes()
+	require.NoError(t, err)
+
+	assert.Equal(t, `{"root":{"-dc":"http://my.namespace/dc","title":"Hello"}}`, string(mBytes))
+}