Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion internal/impl/xml/bloblang.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,15 @@ func init() {
`{"doc":"<root><title>This is a title</title><number id=\"99\">123</number><bool>True</bool></root>"}`,
`{"doc":{"root":{"bool":true,"number":{"#text":123,"-id":99},"title":"This is a title"}}}`,
}).
Example("Parse XML preserving namespace prefixes so the original document is reconstructable", `root.doc = this.doc.parse_xml(preserve_namespaces: true)`, [2]string{
`{"doc":"<root xmlns:dc=\"http://my.namespace/dc\"><dc:title>Hello</dc:title></root>"}`,
`{"doc":{"root":{"-xmlns:dc":"http://my.namespace/dc","dc:title":"Hello"}}}`,
}).
Param(bloblang.NewBoolParam("cast").
Description("Whether to automatically cast numeric and boolean string values to their proper types. When false, all values remain as strings.").
Optional().Default(false)).
Param(bloblang.NewBoolParam("preserve_namespaces").
Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped.").
Optional().Default(false)),
func(args *bloblang.ParsedParams) (bloblang.Method, error) {
castOpt, err := args.GetOptionalBool("cast")
Expand All @@ -54,8 +61,24 @@ func init() {
if castOpt != nil {
cast = *castOpt
}
preserveOpt, err := args.GetOptionalBool("preserve_namespaces")
if err != nil {
return nil, err
}
preserveNS := false
if preserveOpt != nil {
preserveNS = *preserveOpt
}
return bloblang.BytesMethod(func(xmlBytes []byte) (any, error) {
xmlObj, err := ToMap(xmlBytes, cast)
var (
xmlObj map[string]any
err error
)
if preserveNS {
xmlObj, err = ToMapPreserveNS(xmlBytes, cast)
} else {
xmlObj, err = ToMap(xmlBytes, cast)
}
if err != nil {
return nil, fmt.Errorf("parsing value as XML: %w", err)
}
Expand Down
17 changes: 17 additions & 0 deletions internal/impl/xml/bloblang_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,23 @@ func TestParseXML(t *testing.T) {
args: `true`,
exp: map[string]any{"root": map[string]any{"bool": true, "number": map[string]any{"#text": float64(123), "-id": float64(99)}, "title": "This is a title"}},
},
{
name: "preserve namespaces for issue 3928",
target: `<root xmlns:dc="http://my.namespace/dc"><dc:title>Hello</dc:title></root>`,
args: `preserve_namespaces: true`,
exp: map[string]any{"root": map[string]any{
"-xmlns:dc": "http://my.namespace/dc",
"dc:title": "Hello",
}},
},
{
name: "preserve namespaces is opt-in",
target: `<root xmlns:dc="http://my.namespace/dc"><dc:title>Hello</dc:title></root>`,
exp: map[string]any{"root": map[string]any{
"-dc": "http://my.namespace/dc",
"title": "Hello",
}},
},
}

for _, test := range testCases {
Expand Down
152 changes: 152 additions & 0 deletions internal/impl/xml/package.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
package xml

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"strconv"
"strings"

"github.com/clbanning/mxj/v2"
"golang.org/x/net/html/charset"
Expand All @@ -41,3 +46,150 @@ func ToMap(xmlBytes []byte, cast bool) (map[string]any, error) {
}
return map[string]any(root), nil
}

// ToMapPreserveNS converts an XML document into a nested map while keeping
// namespace information intact: element and attribute keys retain their
// prefix (e.g. "<dc:title>" is stored under "dc:title") and xmlns declarations
// are kept as attributes, so the source XML can be rebuilt from the result.
// In every other respect the layout follows [ToMap]: attribute keys start with
// "-", text that sits alongside attributes or children is stored under
// "#text", and repeated elements are gathered into arrays.
//
// When cast is true, attribute values and element text are passed through the
// package's string-casting rules; otherwise they are left as strings.
//
// An error is returned when the input cannot be tokenised or when it ends
// before any element has been opened.
func ToMapPreserveNS(xmlBytes []byte, cast bool) (map[string]any, error) {
	decoder := xml.NewDecoder(bytes.NewReader(xmlBytes))
	decoder.Strict = false
	decoder.CharsetReader = charset.NewReaderLabel

	// Tokens that precede the root element (declarations, comments, stray
	// whitespace) are skipped; the first start tag is treated as the root.
	for {
		token, err := decoder.Token()
		switch {
		case err == io.EOF:
			return nil, fmt.Errorf("xml: no root element found")
		case err != nil:
			return nil, err
		}

		start, isStart := token.(xml.StartElement)
		if !isStart {
			continue
		}

		// The root begins with an empty namespace scope.
		name, value, err := parseElementNS(decoder, start, map[string]string{}, cast)
		if err != nil {
			return nil, err
		}
		return map[string]any{name: value}, nil
	}
}

// collectPrefixDecls records URI→prefix mappings from xmlns:* attributes on an
// element into the provided scope map.
func collectPrefixDecls(se xml.StartElement, scope map[string]string) {
for _, a := range se.Attr {
if a.Name.Space == "xmlns" {
scope[a.Value] = a.Name.Local
}
}
}

// qnameWithPrefix returns "prefix:local" when a prefix is known for the
// namespace URI; when the namespace was never bound via xmlns, Go's decoder
// leaves Name.Space as the raw prefix string which is used directly.
func qnameWithPrefix(n xml.Name, scope map[string]string) string {
if n.Space == "" {
return n.Local
}
if p, ok := scope[n.Space]; ok {
return p + ":" + n.Local
}
return n.Space + ":" + n.Local
}

// parseElementNS consumes tokens from dec up to the end tag that closes se and
// returns the element's key together with its converted value.
//
// The key is the element name as rendered by qnameWithPrefix. The value is the
// trimmed (and, when cast is set, cast) text if the element has neither
// attributes nor child elements; otherwise it is a map holding attributes
// under "-"-prefixed keys, child elements under their own keys (a key seen
// more than once is gathered into a []any), and any non-empty trimmed text
// under "#text".
//
// parent holds the namespace bindings inherited from enclosing elements. It is
// copied rather than modified, so declarations made on this element are only
// visible to this element and its descendants.
func parseElementNS(dec *xml.Decoder, se xml.StartElement, parent map[string]string, cast bool) (string, any, error) {
	bindings := make(map[string]string, len(parent)+len(se.Attr))
	for uri, prefix := range parent {
		bindings[uri] = prefix
	}
	collectPrefixDecls(se, bindings)

	node := map[string]any{}
	for _, attr := range se.Attr {
		// Namespace declarations are stored verbatim and never cast: their
		// values are URIs, not data.
		if attr.Name.Space == "xmlns" {
			node["-xmlns:"+attr.Name.Local] = attr.Value
			continue
		}
		if attr.Name.Space == "" && attr.Name.Local == "xmlns" {
			node["-xmlns"] = attr.Value
			continue
		}
		node["-"+qnameWithPrefix(attr.Name, bindings)] = castString(attr.Value, cast)
	}

	// Character data is accumulated across the whole element so that text
	// split by child elements or CDATA sections ends up as a single value.
	var chars strings.Builder
	for {
		tok, err := dec.Token()
		if err != nil {
			return "", nil, err
		}

		switch current := tok.(type) {
		case xml.CharData:
			chars.Write(current)

		case xml.EndElement:
			trimmed := strings.TrimSpace(chars.String())
			name := qnameWithPrefix(se.Name, bindings)
			if len(node) == 0 {
				// No attributes and no children: collapse to the bare text.
				return name, castString(trimmed, cast), nil
			}
			if trimmed != "" {
				node["#text"] = castString(trimmed, cast)
			}
			return name, node, nil

		case xml.StartElement:
			childKey, childVal, err := parseElementNS(dec, current, bindings, cast)
			if err != nil {
				return "", nil, err
			}
			previous, seen := node[childKey]
			if !seen {
				node[childKey] = childVal
				continue
			}
			// A repeated key becomes (or extends) an array.
			if list, isList := previous.([]any); isList {
				node[childKey] = append(list, childVal)
			} else {
				node[childKey] = []any{previous, childVal}
			}
		}
	}
}

// castString converts s to a typed value when cast is true, trying in order
// int64 → uint64 → float64 → bool and falling back to the original string. The
// order follows clbanning/mxj's casting routine.
//
// Any spelling of NaN or infinity that strconv.ParseFloat would accept ("nan",
// "inf", "infinity", in any letter case and with an optional sign where
// ParseFloat allows one) is left as a string: those float values have no JSON
// representation and would make the result impossible to serialise.
//
// When cast is false, or s is empty, s is returned unchanged.
//
// NOTE(review): the existing ToMap test expects integer-looking values to come
// back as float64, whereas this function yields int64/uint64 — confirm the
// difference is intended.
func castString(s string, cast bool) any {
	if !cast || s == "" {
		return s
	}

	// Strip a single leading sign before matching so that "+inf", "-Infinity"
	// and friends are caught as well as the unsigned forms.
	special := s
	if special[0] == '+' || special[0] == '-' {
		special = special[1:]
	}
	switch strings.ToLower(special) {
	case "nan", "inf", "infinity":
		return s
	}

	if i, err := strconv.ParseInt(s, 10, 64); err == nil {
		return i
	}
	// Values above math.MaxInt64 that still fit in 64 bits.
	if u, err := strconv.ParseUint(s, 10, 64); err == nil {
		return u
	}
	if f, err := strconv.ParseFloat(s, 64); err == nil {
		return f
	}

	// ParseBool also accepts "1" and "0", but those were already consumed as
	// integers above. The length and first-letter screen avoids calling
	// ParseBool on strings that cannot be a boolean word.
	if len(s) < 6 {
		switch s[:1] {
		case "t", "T", "f", "F":
			if b, err := strconv.ParseBool(s); err == nil {
				return b
			}
		}
	}
	return s
}
67 changes: 60 additions & 7 deletions internal/impl/xml/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ import (
)

const (
pFieldOperator = "operator"
pFieldCast = "cast"
pFieldOperator = "operator"
pFieldCast = "cast"
pFieldPreserveNamespace = "preserve_namespaces"
)

func xmlProcSpec() *service.ConfigSpec {
Expand Down Expand Up @@ -91,6 +92,43 @@ With cast set to true, the resulting JSON structure would look like this:
]
}
}
`+"```"+`

== Preserving XML namespaces

By default namespace prefixes on elements and attributes are dropped during conversion (e.g. `+"`<dc:title>`"+` becomes the key `+"`title`"+`), which makes the original XML impossible to reconstruct from the resulting JSON. Set `+"`preserve_namespaces`"+` to `+"`true`"+` to retain prefixes on element and attribute keys and to keep `+"`xmlns:*`"+` declarations as attributes.

For example, given the following XML:

`+"```xml"+`
<root xmlns:dc="http://my.namespace/dc" xmlns:ot="http://my.namespace/ot">
<dc:title>This is a title</dc:title>
<dc:description tone="boring">This is a description</dc:description>
<ot:elements id="1">foo1</ot:elements>
<ot:elements id="2">foo2</ot:elements>
<ot:elements>foo3</ot:elements>
</root>
`+"```"+`

With `+"`preserve_namespaces: true`"+` the resulting JSON structure would look like this:

`+"```json"+`
{
"root":{
"-xmlns:dc":"http://my.namespace/dc",
"-xmlns:ot":"http://my.namespace/ot",
"dc:title":"This is a title",
"dc:description":{
"#text":"This is a description",
"-tone":"boring"
},
"ot:elements":[
{"#text":"foo1","-id":"1"},
{"#text":"foo2","-id":"2"},
"foo3"
]
}
}
`+"```").
Fields(
service.NewStringEnumField(pFieldOperator, "to_json").
Expand All @@ -99,6 +137,9 @@ With cast set to true, the resulting JSON structure would look like this:
service.NewBoolField(pFieldCast).
Description("Whether to try to cast values that are numbers and booleans to the right type. Default: all values are strings.").
Default(false),
service.NewBoolField(pFieldPreserveNamespace).
Description("Whether to preserve XML namespace prefixes on element and attribute keys, and retain xmlns declarations as attributes. When disabled, namespace prefixes are stripped.").
Default(false),
)
}

Expand All @@ -111,8 +152,9 @@ func init() {
}

type xmlProc struct {
log *service.Logger
cast bool
log *service.Logger
cast bool
preserveNSPrefix bool
}

func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xmlProc, error) {
Expand All @@ -129,9 +171,15 @@ func xmlProcFromParsed(pConf *service.ParsedConfig, mgr *service.Resources) (*xm
return nil, err
}

preserveNS, err := pConf.FieldBool(pFieldPreserveNamespace)
if err != nil {
return nil, err
}

j := &xmlProc{
log: mgr.Logger(),
cast: cast,
log: mgr.Logger(),
cast: cast,
preserveNSPrefix: preserveNS,
}
return j, nil
}
Expand All @@ -142,7 +190,12 @@ func (p *xmlProc) Process(_ context.Context, msg *service.Message) (service.Mess
return nil, err
}

root, err := ToMap(mBytes, p.cast)
var root map[string]any
if p.preserveNSPrefix {
root, err = ToMapPreserveNS(mBytes, p.cast)
} else {
root, err = ToMap(mBytes, p.cast)
}
if err != nil {
p.log.Debugf("Failed to parse part as XML: %v", err)
return nil, err
Expand Down
Loading