Skip to content

Allow cardinality #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions flatjsonl/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ type Config struct {
Transpose map[string]string `json:"transpose" yaml:"transpose" description:"Map of key prefixes to transposed table names."`
ExtractValuesRegex map[string]extract `json:"extractValuesRegex" yaml:"extractValuesRegex" description:"Map of key regex to extraction format, values can be 'URL', 'JSON'."`
KeepJSON []string `json:"keepJSON" yaml:"keepJSON" description:"List of keys to keep as JSON literals."`
AllowCardinality []string `json:"allowCardinality" yaml:"allowCardinality" description:"List of keys to allow high cardinality of child keys."`
}
11 changes: 7 additions & 4 deletions flatjsonl/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@ type Flags struct {
MatchLinePrefix string
CaseSensitiveKeys bool

ShowKeysFlat bool
ShowKeysHier bool
ShowKeysInfo bool
ShowKeysFlat bool
ShowKeysHier bool
ShowKeysInfo bool
ShowJSONSchema bool

Concurrency int
MemLimit int
Expand Down Expand Up @@ -74,6 +75,8 @@ func (f *Flags) Register() {
flag.BoolVar(&f.ShowKeysFlat, "show-keys-flat", false, "Show all available keys as flat list.")
flag.BoolVar(&f.ShowKeysHier, "show-keys-hier", false, "Show all available keys as hierarchy.")
flag.BoolVar(&f.ShowKeysInfo, "show-keys-info", false, "Show keys, their replaces and types.")
flag.BoolVar(&f.ShowJSONSchema, "show-json-schema", false, "Show hierarchy as JSON schema.")

flag.BoolVar(&f.SkipZeroCols, "skip-zero-cols", false, "Skip columns with zero values.")
flag.BoolVar(&f.AddSequence, "add-sequence", false, "Add auto incremented sequence number.")
flag.BoolVar(&f.CaseSensitiveKeys, "case-sensitive-keys", false, "Use case-sensitive keys (can fail for SQLite).")
Expand All @@ -94,7 +97,7 @@ func (f *Flags) Register() {
func (f *Flags) Parse() {
flag.Parse()

if f.Output == "" && !f.ShowKeysHier && !f.ShowKeysFlat && !f.ShowKeysInfo {
if f.Output == "" && !f.ShowKeysHier && !f.ShowKeysFlat && !f.ShowKeysInfo && !f.ShowJSONSchema {
inputs := f.Inputs()

if len(inputs) > 0 && f.CSV == "" && f.SQLite == "" && f.Raw == "" && f.PGDump == "" {
Expand Down
104 changes: 103 additions & 1 deletion flatjsonl/flattener.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"strconv"
"strings"

"github.com/valyala/fastjson"
"github.com/puzpuzpuz/xsync/v3"
"github.com/vearutop/fastjson"
)

// KeyFromPath joins path elements into a dot-separated scalar key.
Expand Down Expand Up @@ -171,6 +172,7 @@
xs, name, err := x.extract(s)
if err == nil {
p := parserPool.Get()
p.AllowUnexpectedTail = true
defer parserPool.Put(p)

if v, err := p.ParseBytes(xs); err == nil {
Expand All @@ -196,6 +198,7 @@
// Check if string has nested JSON or URL.
if s[0] == '{' || s[0] == '[' {
p := parserPool.Get()
p.AllowUnexpectedTail = true
defer parserPool.Put(p)

v, err := p.ParseBytes(s)
Expand All @@ -218,6 +221,7 @@
us, _, err := (urlExtractor{}).extract(s)
if err == nil {
p := parserPool.Get()
p.AllowUnexpectedTail = true
defer parserPool.Put(p)

v, err := p.ParseBytes(us)
Expand Down Expand Up @@ -255,8 +259,106 @@
}
}

// JSONSchema is a minimal JSON Schema node.
//
// It holds the list of type names recorded for a node together with nested
// schemas for object properties and array items. Empty fields are omitted
// from the JSON encoding.
type JSONSchema struct {
	// Types lists JSON Schema type names, encoded under the "type" keyword.
	Types []string `json:"type,omitempty"`
	// Properties maps a property name to the schema of that property.
	Properties map[string]*JSONSchema `json:"properties,omitempty"`
	// Items is the schema shared by array elements.
	Items *JSONSchema `json:"items,omitempty"`
}

func (j *JSONSchema) AddType(t Type) {

Check failure on line 268 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

exported: exported method JSONSchema.AddType should have comment or be unexported (revive)
tt := ""

switch t {
case TypeString:
tt = "string"
case TypeInt:
tt = "integer"
case TypeFloat:
tt = "number"
case TypeBool:
tt = "boolean"
case TypeArray:
tt = "array"
case TypeObject:
tt = "object"
case TypeJSON:
tt = "string"
}

for _, t := range j.Types {
if t == tt {
return
}
}

j.Types = append(j.Types, tt)
}

func (j *JSONSchema) AddKey(k flKey, keys *xsync.MapOf[uint64, flKey]) {

Check failure on line 297 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

exported: exported method JSONSchema.AddKey should have comment or be unexported (revive)
if k.parent == 0 {
return
}

parents := []flKey{k}

parent := k.parent
for {

Check failure on line 305 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

for statement without condition should never be cuddled (wsl)
if parent == 0 {
break
}

pk, ok := keys.Load(parent)
if !ok {
println("BUG: failed to load parent key:", parent)
return

Check failure on line 313 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

return with no blank line before (nlreturn)
}

parents = append(parents, pk)

parent = pk.parent
}

parentSchema := j
parentType := TypeObject
for i := len(parents) - 1; i >= 0; i-- {

Check failure on line 323 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

only one cuddle assignment allowed before for statement (wsl)
pk := parents[i]
name := pk.path[len(pk.path)-1]

if i != 0 && pk.t == TypeString {
pk.t = TypeObject
}

if parentType == TypeObject {
if parentSchema.Properties == nil {
parentSchema.Properties = make(map[string]*JSONSchema)
}

property := parentSchema.Properties[name]
if property == nil {
property = &JSONSchema{}
}
parentSchema.Properties[name] = property

Check failure on line 340 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

assignments should only be cuddled with other assignments (wsl)
parentSchema = property

parentType = pk.t
} else if parentType == TypeArray {
if parentSchema.Items == nil {
parentSchema.Items = &JSONSchema{}
}

parentSchema = parentSchema.Items
parentType = pk.t
}
}

parentSchema.AddType(k.t)
return

Check failure on line 355 in flatjsonl/flattener.go

View workflow job for this annotation

GitHub Actions / golangci-lint

S1023: redundant `return` statement (gosimple)
}

// KeyHierarchy collects structural relations.
//
// It is a recursive tree: every node carries a Name and a map of child nodes
// of the same type.
type KeyHierarchy struct {
// Schema is a JSON Schema node stored alongside the hierarchy.
// NOTE(review): nothing in the visible hunks populates this field — confirm intended use.
Schema JSONSchema

// Name labels this node (presumably a key path element; verify against Add).
Name string
// Sub holds child nodes by value (presumably keyed by child name — confirm).
Sub map[string]KeyHierarchy
}
Expand Down
76 changes: 56 additions & 20 deletions flatjsonl/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,35 @@
return existing
}

if p.f.ChildrenLimit > 0 && len(path) > 1 {

Check failure on line 100 in flatjsonl/keys.go

View workflow job for this annotation

GitHub Actions / golangci-lint

`if p.f.ChildrenLimit > 0 && len(path) > 1` has complex nested blocks (complexity: 7) (nestif)
parentCardinality := p.parentCardinality[parent]
parentCardinality++

if parentCardinality > p.f.ChildrenLimit {
pp := k.path[0 : len(k.path)-1]
parentKey := KeyFromPath(pp)
grandParentKey := KeyFromPath(pp[:len(pp)-1])
ppk, gpk := newHasher().hashParentBytes([]byte(parentKey), len(grandParentKey))
allowCardinality := false

p.mu.Unlock()
// println("making parent key", parentKey, grandParentKey, ppk, gpk)
p.initKey(ppk, gpk, pp, TypeJSON, false)
p.mu.Lock()
for _, ac := range p.cfg.AllowCardinality {
if parentKey == ac {
allowCardinality = true
}
}

if !allowCardinality {
grandParentKey := KeyFromPath(pp[:len(pp)-1])
ppk, gpk := newHasher().hashParentBytes([]byte(parentKey), len(grandParentKey))

p.mu.Unlock()
// println("making parent key", parentKey, grandParentKey, ppk, gpk)
p.initKey(ppk, gpk, pp, TypeJSON, false)
p.mu.Lock()

p.cfg.KeepJSON = append(p.cfg.KeepJSON, parentKey)
p.parentHighCardinality.Store(parent, true)
p.cfg.KeepJSON = append(p.cfg.KeepJSON, parentKey)
p.parentHighCardinality.Store(parent, true)
} else {
p.parentCardinality[parent] = parentCardinality
}
} else {
p.parentCardinality[parent] = parentCardinality
}
Expand Down Expand Up @@ -227,21 +239,31 @@

p1 := flatPath[:parentLen]

_, err := h.digest.Write(p1)
if err != nil {
panic("hashing failed: " + err.Error())
}
if len(p1) == 0 {
par = 0
} else {
_, err := h.digest.Write(p1)
if err != nil {
panic("hashing failed: " + err.Error())
}

par = h.digest.Sum64()
par = h.digest.Sum64()
}

p2 := flatPath[parentLen:]

_, err = h.digest.Write(p2)
if err != nil {
panic("hashing failed: " + err.Error())
if len(p2) == 0 {
pk = 0
} else {
_, err := h.digest.Write(p2)
if err != nil {
panic("hashing failed: " + err.Error())
}

pk = h.digest.Sum64()
}

return h.digest.Sum64(), par
return pk, par
}

func (p *Processor) scanAvailableKeys() error {
Expand Down Expand Up @@ -282,7 +304,7 @@
w.WantPath = true

w.FnObjectStop = func(_ int64, flatPath []byte, pl int, path []string) (stop bool) {
if pl == 0 {
if len(flatPath) == 0 {
return
}

Expand All @@ -293,7 +315,7 @@
return stop
}
w.FnArrayStop = func(_ int64, flatPath []byte, pl int, path []string) (stop bool) {
if pl == 0 {
if len(flatPath) == 0 {
return
}

Expand Down Expand Up @@ -382,6 +404,14 @@
if k.t == TypeObject || k.t == TypeArray {
deleted[k.original] = true

if p.f.ShowKeysHier {
p.keyHierarchy.Add(k.path)
}

if p.f.ShowJSONSchema {
p.jsonSchema.AddKey(k, p.flKeys)
}

return true
}

Expand All @@ -395,7 +425,13 @@
}
}

p.keyHierarchy.Add(k.path)
if p.f.ShowKeysHier {
p.keyHierarchy.Add(k.path)
}

if p.f.ShowJSONSchema {
p.jsonSchema.AddKey(k, p.flKeys)
}

return true
})
Expand Down
9 changes: 9 additions & 0 deletions flatjsonl/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
mu sync.Mutex
flKeysList []string
keyHierarchy KeyHierarchy
jsonSchema JSONSchema
canonicalKeys map[string]flKey

totalLines int
Expand Down Expand Up @@ -343,6 +344,14 @@
_, _ = fmt.Fprintln(p.Stdout, string(b))
}

if p.f.ShowJSONSchema {
b, err := assertjson.MarshalIndentCompact(p.jsonSchema, "", " ", 120)
if err != nil {
return err
}
_, _ = fmt.Fprintln(p.Stdout, string(b))

Check failure on line 352 in flatjsonl/processor.go

View workflow job for this annotation

GitHub Actions / golangci-lint

assignments should only be cuddled with other assignments (wsl)
}

return nil
}

Expand Down
1 change: 1 addition & 0 deletions flatjsonl/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ func TestNewProcessor(t *testing.T) {
f.ShowKeysFlat = true
f.ShowKeysHier = true
f.ShowKeysInfo = true
f.ShowJSONSchema = true
f.Concurrency = 1
f.PrepareOutput()

Expand Down
6 changes: 3 additions & 3 deletions flatjsonl/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"github.com/bool64/progress"
"github.com/klauspost/compress/zstd"
gzip "github.com/klauspost/pgzip"
"github.com/valyala/fastjson"
"github.com/vearutop/fastjson"
)

const errEmptyFile = ctxd.SentinelError("empty file")
Expand Down Expand Up @@ -194,7 +194,7 @@ func (rd *Reader) Read(sess *readSession) error {

semaphore <- &syncWorker{
i: i,
p: &fastjson.Parser{},
p: &fastjson.Parser{AllowUnexpectedTail: true},
used: 0,
path: make([]string, 0, 20),
flatPath: make([]byte, 0, 5000),
Expand Down Expand Up @@ -260,7 +260,7 @@ func (rd *Reader) Read(sess *readSession) error {

if worker.used >= 100 {
worker.used = 0
worker.p = &fastjson.Parser{}
worker.p = &fastjson.Parser{AllowUnexpectedTail: true}
}

atomic.AddInt64(&rd.Processor.inProgress, 1)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/puzpuzpuz/xsync/v3 v3.5.1
github.com/stretchr/testify v1.9.0
github.com/swaggest/assertjson v1.9.0
github.com/valyala/fastjson v1.6.4
github.com/vearutop/fastjson v1.0.0
gopkg.in/yaml.v3 v3.0.1
modernc.org/sqlite v1.35.0
)
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ github.com/swaggest/assertjson v1.9.0 h1:dKu0BfJkIxv/xe//mkCrK5yZbs79jL7OVf9Ija7
github.com/swaggest/assertjson v1.9.0/go.mod h1:b+ZKX2VRiUjxfUIal0HDN85W0nHPAYUbYH5WkkSsFsU=
github.com/swaggest/usecase v1.2.0 h1:cHVFqxIbHfyTXp02JmWXk+ZADaSa87UZP+b3qL5Nz90=
github.com/swaggest/usecase v1.2.0/go.mod h1:oc5+QoAxG3Et5Gl9lRXgEOm00l4VN9gdVQSMIa5EeLY=
github.com/valyala/fastjson v1.6.4 h1:uAUNq9Z6ymTgGhcm0UynUAB6tlbakBrz6CQFax3BXVQ=
github.com/valyala/fastjson v1.6.4/go.mod h1:CLCAqky6SMuOcxStkYQvblddUtoRxhYMGLrsQns1aXY=
github.com/vearutop/fastjson v1.0.0 h1:4yn7BZj9R52INMqMj1q90gG206Qm9XY54aKfj3ZPC54=
github.com/vearutop/fastjson v1.0.0/go.mod h1:H1NX3WgvfAI1gJf9Pk3IKegysfqZOotwqiihA+txgMQ=
github.com/yosuke-furukawa/json5 v0.1.2-0.20201207051438-cf7bb3f354ff h1:7YqG491bE4vstXRz1lD38rbSgbXnirvROz1lZiOnPO8=
github.com/yosuke-furukawa/json5 v0.1.2-0.20201207051438-cf7bb3f354ff/go.mod h1:sw49aWDqNdRJ6DYUtIQiaA3xyj2IL9tjeNYmX2ixwcU=
github.com/yudai/gojsondiff v1.0.0 h1:27cbfqXLVEJ1o8I6v3y9lg8Ydm53EKqHXAOMxEGlCOA=
Expand Down
Loading