diff --git a/internal/impl/xml/bloblang.go b/internal/impl/xml/bloblang.go index e7b836c8a3..de517a9de5 100644 --- a/internal/impl/xml/bloblang.go +++ b/internal/impl/xml/bloblang.go @@ -42,8 +42,15 @@ func init() { `{"doc":"This is a title123True"}`, `{"doc":{"root":{"bool":true,"number":{"#text":123,"-id":99},"title":"This is a title"}}}`, }). + Example("Parse XML preserving namespace prefixes so the original document is reconstructable", `root.doc = this.doc.parse_xml(preserve_namespaces: true)`, [2]string{ + `{"doc":"Hello"}`, + `{"doc":{"root":{"-xmlns:dc":"http://my.namespace/dc","dc:title":"Hello"}}}`, + }). Param(bloblang.NewBoolParam("cast"). Description("Whether to automatically cast numeric and boolean string values to their proper types. When false, all values remain as strings."). + Optional().Default(false)). + Param(bloblang.NewBoolParam("preserve_namespaces"). + Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped."). 
Optional().Default(false)), func(args *bloblang.ParsedParams) (bloblang.Method, error) { castOpt, err := args.GetOptionalBool("cast") @@ -54,8 +61,24 @@ func init() { if castOpt != nil { cast = *castOpt } + preserveOpt, err := args.GetOptionalBool("preserve_namespaces") + if err != nil { + return nil, err + } + preserveNS := false + if preserveOpt != nil { + preserveNS = *preserveOpt + } return bloblang.BytesMethod(func(xmlBytes []byte) (any, error) { - xmlObj, err := ToMap(xmlBytes, cast) + var ( + xmlObj map[string]any + err error + ) + if preserveNS { + xmlObj, err = ToMapPreserveNS(xmlBytes, cast) + } else { + xmlObj, err = ToMap(xmlBytes, cast) + } if err != nil { return nil, fmt.Errorf("parsing value as XML: %w", err) } diff --git a/internal/impl/xml/bloblang_test.go b/internal/impl/xml/bloblang_test.go index 3d247dc78c..80eaa9b8c1 100644 --- a/internal/impl/xml/bloblang_test.go +++ b/internal/impl/xml/bloblang_test.go @@ -48,6 +48,23 @@ func TestParseXML(t *testing.T) { args: `true`, exp: map[string]any{"root": map[string]any{"bool": true, "number": map[string]any{"#text": float64(123), "-id": float64(99)}, "title": "This is a title"}}, }, + { + name: "preserve namespaces for issue 3928", + target: `Hello`, + args: `preserve_namespaces: true`, + exp: map[string]any{"root": map[string]any{ + "-xmlns:dc": "http://my.namespace/dc", + "dc:title": "Hello", + }}, + }, + { + name: "preserve namespaces is opt-in", + target: `Hello`, + exp: map[string]any{"root": map[string]any{ + "-dc": "http://my.namespace/dc", + "title": "Hello", + }}, + }, } for _, test := range testCases { diff --git a/internal/impl/xml/package.go b/internal/impl/xml/package.go index e7a454f087..9398efae06 100644 --- a/internal/impl/xml/package.go +++ b/internal/impl/xml/package.go @@ -19,7 +19,12 @@ package xml import ( + "bytes" "encoding/xml" + "fmt" + "io" + "strconv" + "strings" "github.com/clbanning/mxj/v2" "golang.org/x/net/html/charset" @@ -41,3 +46,150 @@ func ToMap(xmlBytes []byte, 
cast bool) (map[string]any, error) { } return map[string]any(root), nil } + +// ToMapPreserveNS parses a byte slice as XML with namespace prefixes preserved +// on element and attribute keys (e.g. "" becomes the key "dc:title") +// and with xmlns declarations retained as attributes, so the original XML is +// reconstructable from the resulting JSON. The output shape otherwise matches +// [ToMap]: attributes are prefixed with "-", mixed text content uses the +// "#text" key, and repeated elements are collected into arrays. +func ToMapPreserveNS(xmlBytes []byte, cast bool) (map[string]any, error) { + dec := xml.NewDecoder(bytes.NewReader(xmlBytes)) + dec.Strict = false + dec.CharsetReader = charset.NewReaderLabel + + for { + tok, err := dec.Token() + if err == io.EOF { + return nil, fmt.Errorf("xml: no root element found") + } + if err != nil { + return nil, err + } + if se, ok := tok.(xml.StartElement); ok { + key, val, err := parseElementNS(dec, se, map[string]string{}, cast) + if err != nil { + return nil, err + } + return map[string]any{key: val}, nil + } + } +} + +// collectPrefixDecls records URI→prefix mappings from xmlns:* attributes on an +// element into the provided scope map. +func collectPrefixDecls(se xml.StartElement, scope map[string]string) { + for _, a := range se.Attr { + if a.Name.Space == "xmlns" { + scope[a.Value] = a.Name.Local + } + } +} + +// qnameWithPrefix returns "prefix:local" when a prefix is known for the +// namespace URI; when the namespace was never bound via xmlns, Go's decoder +// leaves Name.Space as the raw prefix string which is used directly. 
+func qnameWithPrefix(n xml.Name, scope map[string]string) string { + if n.Space == "" { + return n.Local + } + if p, ok := scope[n.Space]; ok { + return p + ":" + n.Local + } + return n.Space + ":" + n.Local +} + +func parseElementNS(dec *xml.Decoder, se xml.StartElement, parent map[string]string, cast bool) (string, any, error) { + scope := make(map[string]string, len(parent)+len(se.Attr)) + for k, v := range parent { + scope[k] = v + } + collectPrefixDecls(se, scope) + + out := map[string]any{} + for _, a := range se.Attr { + var key string + isNSDecl := false + switch { + case a.Name.Space == "xmlns": + key = "-xmlns:" + a.Name.Local + isNSDecl = true + case a.Name.Space == "" && a.Name.Local == "xmlns": + key = "-xmlns" + isNSDecl = true + default: + key = "-" + qnameWithPrefix(a.Name, scope) + } + if isNSDecl { + out[key] = a.Value + } else { + out[key] = castString(a.Value, cast) + } + } + + var text strings.Builder + for { + tok, err := dec.Token() + if err != nil { + return "", nil, err + } + switch t := tok.(type) { + case xml.StartElement: + k, v, err := parseElementNS(dec, t, scope, cast) + if err != nil { + return "", nil, err + } + if existing, ok := out[k]; ok { + if arr, isArr := existing.([]any); isArr { + out[k] = append(arr, v) + } else { + out[k] = []any{existing, v} + } + } else { + out[k] = v + } + case xml.CharData: + text.Write(t) + case xml.EndElement: + s := strings.TrimSpace(text.String()) + key := qnameWithPrefix(se.Name, scope) + if len(out) == 0 { + return key, castString(s, cast), nil + } + if s != "" { + out["#text"] = castString(s, cast) + } + return key, out, nil + } + } +} + +// castString mirrors clbanning/mxj's default cast order when casting is +// enabled: int → float → bool, with NaN/Inf left as strings. 
func castString(s string, cast bool) any {
	// Nothing to do when casting is off; an empty string is never cast.
	if !cast || s == "" {
		return s
	}

	// strconv.ParseFloat accepts "NaN" and the optionally signed spellings
	// "Inf" and "Infinity" in any letter case. Those values cannot be encoded
	// as JSON, so every such spelling is kept as the original string.
	switch strings.ToLower(strings.TrimLeft(s, "+-")) {
	case "nan", "inf", "infinity":
		return s
	}

	// Numbers are tried before booleans: int64, then uint64 for values that
	// only fit unsigned, then float64.
	if i, err := strconv.ParseInt(s, 10, 64); err == nil {
		return i
	}
	if u, err := strconv.ParseUint(s, 10, 64); err == nil {
		return u
	}
	if f, err := strconv.ParseFloat(s, 64); err == nil {
		return f
	}

	// "1" and "0" were already consumed as numbers above, so only short
	// strings starting with t/T/f/F can still be booleans.
	if len(s) < 6 {
		switch s[:1] {
		case "t", "T", "f", "F":
			if b, err := strconv.ParseBool(s); err == nil {
				return b
			}
		}
	}
	return s
}
+ +For example, given the following XML: + +`+"```xml"+` + + This is a title + This is a description + foo1 + foo2 + foo3 + +`+"```"+` + +With `+"`preserve_namespaces: true`"+` the resulting JSON structure would look like this: + +`+"```json"+` +{ + "root":{ + "-xmlns:dc":"http://my.namespace/dc", + "-xmlns:ot":"http://my.namespace/ot", + "dc:title":"This is a title", + "dc:description":{ + "#text":"This is a description", + "-tone":"boring" + }, + "ot:elements":[ + {"#text":"foo1","-id":"1"}, + {"#text":"foo2","-id":"2"}, + "foo3" + ] + } +} `+"```"). Fields( service.NewStringEnumField(pFieldOperator, "to_json"). @@ -99,6 +137,9 @@ With cast set to true, the resulting JSON structure would look like this: service.NewBoolField(pFieldCast). Description("Whether to try to cast values that are numbers and booleans to the right type. Default: all values are strings."). Default(false), + service.NewBoolField(pFieldPreserveNamespace). + Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped."). 
+ Default(false), ) } @@ -111,8 +152,9 @@ func init() { } type xmlProc struct { - log *service.Logger - cast bool + log *service.Logger + cast bool + preserveNSPrefix bool } func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xmlProc, error) { @@ -129,9 +171,15 @@ func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xm return nil, err } + preserveNS, err := pConf.FieldBool(pFieldPreserveNamespace) + if err != nil { + return nil, err + } + j := &xmlProc{ - log: mgr.Logger(), - cast: cast, + log: mgr.Logger(), + cast: cast, + preserveNSPrefix: preserveNS, } return j, nil } @@ -142,7 +190,12 @@ func (p *xmlProc) Process(_ context.Context, msg *service.Message) (service.Mess return nil, err } - root, err := ToMap(mBytes, p.cast) + var root map[string]any + if p.preserveNSPrefix { + root, err = ToMapPreserveNS(mBytes, p.cast) + } else { + root, err = ToMap(mBytes, p.cast) + } if err != nil { p.log.Debugf("Failed to parse part as XML: %v", err) return nil, err diff --git a/internal/impl/xml/processor_test.go b/internal/impl/xml/processor_test.go index 3b87e5a96a..c17b747162 100644 --- a/internal/impl/xml/processor_test.go +++ b/internal/impl/xml/processor_test.go @@ -146,3 +146,117 @@ cast: true assert.Equal(t, `{"root":{"bool":true,"number":{"#text":123,"-id":99},"title":"This is a title"}}`, string(mBytes)) } + +func TestXMLPreserveNamespaces(t *testing.T) { + type testCase struct { + name string + input string + output string + } + tests := []testCase{ + { + name: "issue 3928 example", + input: ` + This is a title + This is a description + foo1 + foo2 + foo3 +`, + output: `{"root":{"-xmlns:dc":"http://my.namespace/dc","-xmlns:ot":"http://my.namespace/ot","dc:description":{"#text":"This is a description","-tone":"boring"},"dc:title":"This is a title","ot:elements":[{"#text":"foo1","-id":"1"},{"#text":"foo2","-id":"2"},"foo3"]}}`, + }, + { + name: "no namespaces behaves like default mode", + input: ` + foo1 +`, + 
output: `{"root":{"next":"foo1"}}`, + }, + { + name: "nested element redeclares prefix", + input: ` + outer + + inner + +`, + output: `{"root":{"-xmlns:a":"urn:outer","a:item":"outer","inner":{"-xmlns:a":"urn:inner","a:item":"inner"}}}`, + }, + { + name: "attribute with namespace prefix", + input: `foo`, + output: `{"root":{"-xmlns:xsi":"http://www.w3.org/2001/XMLSchema-instance","item":{"#text":"foo","-xsi:type":"string"}}}`, + }, + { + name: "prefix used without xmlns declaration stays literal", + input: `Hello`, + output: `{"root":{"dc:title":"Hello"}}`, + }, + } + + pConf, err := xmlProcSpec().ParseYAML(` +operator: to_json +preserve_namespaces: true +`, nil) + require.NoError(t, err) + + proc, err := xmlProcFromParsed(pConf, service.MockResources()) + require.NoError(t, err) + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(test.input))) + require.NoError(t, err) + require.Len(t, msgsOut, 1) + + mBytes, err := msgsOut[0].AsBytes() + require.NoError(t, err) + + assert.Equal(t, test.output, string(mBytes)) + }) + } +} + +func TestXMLPreserveNamespacesWithCast(t *testing.T) { + pConf, err := xmlProcSpec().ParseYAML(` +operator: to_json +cast: true +preserve_namespaces: true +`, nil) + require.NoError(t, err) + + proc, err := xmlProcFromParsed(pConf, service.MockResources()) + require.NoError(t, err) + + testString := `123True` + + msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(testString))) + require.NoError(t, err) + require.Len(t, msgsOut, 1) + + mBytes, err := msgsOut[0].AsBytes() + require.NoError(t, err) + + assert.Equal(t, `{"root":{"-xmlns:n":"urn:num","n:flag":true,"n:value":{"#text":123,"-id":99}}}`, string(mBytes)) +} + +func TestXMLDefaultStripsNamespacesUnchanged(t *testing.T) { + // Regression guard: the default (preserve_namespaces omitted) must keep + // the previous lossy-but-backwards-compatible behaviour. 
+ pConf, err := xmlProcSpec().ParseYAML(`operator: to_json`, nil) + require.NoError(t, err) + + proc, err := xmlProcFromParsed(pConf, service.MockResources()) + require.NoError(t, err) + + testString := `Hello` + + msgsOut, err := proc.Process(t.Context(), service.NewMessage([]byte(testString))) + require.NoError(t, err) + require.Len(t, msgsOut, 1) + + mBytes, err := msgsOut[0].AsBytes() + require.NoError(t, err) + + assert.Equal(t, `{"root":{"-dc":"http://my.namespace/dc","title":"Hello"}}`, string(mBytes)) +}