Skip to content

Commit 00e00ee

Browse files
committed
Added support for CLDF sources
1 parent c5e352d commit 00e00ee

15 files changed

Lines changed: 318 additions & 110 deletions

File tree

RELEASING.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,9 @@ go mod tidy
44

55
go test ./...
66

7+
~/venvs/cldf/bin/python test/test_regression.py
8+
9+
FIXME: get cross-compilation set up for goreleaser.
10+
711
goreleaser check
812
goreleaser release --snapshot --clean

cldf/dataset.go

Lines changed: 13 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
package cldf
22

33
import (
4-
"encoding/json"
54
"errors"
65
"fmt"
7-
"os"
6+
"gocldf/internal/jsonutil"
87
"path/filepath"
98
"slices"
109
"strings"
@@ -19,18 +18,11 @@ type Dataset struct {
1918
}
2019

2120
func NewDataset(mdPath string) (*Dataset, error) {
22-
data, err := os.ReadFile(mdPath)
21+
result, err := jsonutil.ReadObject(mdPath)
2322
if err != nil {
2423
return nil, err
2524
}
26-
var result map[string]interface{}
27-
28-
err = json.Unmarshal(data, &result)
29-
if err != nil {
30-
return nil, err
31-
}
32-
33-
metadata := make(map[string]interface{}, len(result)-1)
25+
metadata := make(map[string]any, len(result)-1)
3426
for k, v := range result {
3527
if k == "tables" {
3628
continue
@@ -41,12 +33,11 @@ func NewDataset(mdPath string) (*Dataset, error) {
4133
sourcesBibtex string
4234
sources *Sources
4335
)
44-
val, ok := result["dc:source"]
45-
if ok {
46-
sourcesBibtex, ok = val.(string)
47-
if !ok {
48-
return nil, errors.New("invalid dc:source")
49-
}
36+
sourcesBibtex, err = jsonutil.GetString(result, "dc:source", "")
37+
if err != nil {
38+
return nil, err
39+
}
40+
if sourcesBibtex != "" {
5041
sources, err = NewSources(filepath.Join(filepath.Dir(mdPath), sourcesBibtex))
5142
if err != nil {
5243
return nil, err
@@ -115,10 +106,10 @@ func (dataset *Dataset) UrlToCanonicalName() map[string]string {
115106
return res
116107
}
117108

109+
// orderedTables determines the order in which to create the tables in a db in such a way that foreign key constraints are satisfied.
118110
func (dataset *Dataset) orderedTables() ([]*Table, error) {
119111
var (
120-
urlToName = dataset.UrlToCanonicalName()
121-
// Determine the order in which to create the tables
112+
urlToName = dataset.UrlToCanonicalName()
122113
tables []string
123114
orderedTables []string
124115
)
@@ -207,11 +198,9 @@ type TableData struct {
207198
Rows [][]any
208199
}
209200

210-
// Function ToSqlite returns the data necessary to load the dataset into a SQLite database.
211-
func (dataset *Dataset) ToSqlite(noChecks bool) (string, []TableData, error) {
212-
var tableData []TableData
213-
214-
schema, err := dataset.sqlSchema(noChecks)
201+
// ToSqlite returns the data necessary to load the dataset into a SQLite database.
202+
func (dataset *Dataset) ToSqlite(noChecks bool) (schema string, tableData []TableData, err error) {
203+
schema, err = dataset.sqlSchema(noChecks)
215204
if err != nil {
216205
return "", tableData, err
217206
}

cldf/sources.go

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
package cldf
22

33
import (
4+
"bufio"
45
"fmt"
6+
"gocldf/internal/pathutil"
7+
"io"
58
"os"
9+
"regexp"
610
"slices"
711
"strings"
812

@@ -18,7 +22,8 @@ type Source struct {
1822
func NewSource(entry *bibtex.BibEntry) *Source {
1923
fields := make(map[string]string)
2024
for k, v := range entry.Fields {
21-
fields[k] = v.String()
25+
// Undo the temporary replacement of @ that was applied to appease the BibTeX parser.
26+
fields[k] = strings.ReplaceAll(v.String(), "�", "@")
2227
}
2328
return &Source{
2429
Id: entry.CiteName,
@@ -32,20 +37,54 @@ type Sources struct {
3237
FieldNames []string
3338
}
3439

35-
func NewSources(p string) (*Sources, error) {
36-
f, err := os.Open(p)
40+
// normalizeBibtex rewrites BibTeX input line by line so that the BibTeX parser
// accepts it, and returns a reader over the rewritten text.
//
// Two rewrites are applied to each line:
//   - If the line starts (after optional whitespace) with `comment`, optional
//     whitespace and `=`, the first occurrence of "comment" is renamed to
//     "comments", because the parser seems to reject "comment" as a field name.
//   - If the line does not start (after optional whitespace) with "@", every "@"
//     in it is replaced by the placeholder "�". NewSource turns the placeholder
//     back into "@" when it copies the field values.
//
// Lines may be terminated by "\n" or "\r\n"; the rewritten lines are joined with
// "\n" and no trailing newline is emitted. Any error other than io.EOF returned
// while reading r is passed on to the caller.
func normalizeBibtex(r io.Reader) (io.Reader, error) {
	var (
		commentField = regexp.MustCompile(`^\s*comment\s*=`)
		entryStart   = regexp.MustCompile(`^\s*@`)
		lines        []string
	)
	// bufio.Scanner refuses lines longer than 64 KiB by default, which long field
	// values (e.g. abstracts) can exceed; bufio.Reader has no such limit.
	br := bufio.NewReader(r)
	for {
		line, err := br.ReadString('\n')
		if err != nil && err != io.EOF {
			return nil, err
		}
		atEOF := err == io.EOF
		if atEOF && line == "" {
			// Nothing left after the last line terminator (or the input was empty).
			break
		}
		// Strip the line terminator, accepting both "\n" and "\r\n".
		line = strings.TrimSuffix(strings.TrimSuffix(line, "\n"), "\r")
		if commentField.MatchString(line) {
			line = strings.Replace(line, "comment", "comments", 1)
		}
		if !entryStart.MatchString(line) {
			line = strings.ReplaceAll(line, "@", "�")
		}
		lines = append(lines, line)
		if atEOF {
			break
		}
	}
	return strings.NewReader(strings.Join(lines, "\n")), nil
}
62+
63+
func NewSources(p string) (sources *Sources, err error) {
64+
f, err := pathutil.Reader(p)
3765
if err != nil {
3866
return nil, err
3967
}
40-
entries, err := bibtex.Parse(f)
68+
defer func(file any) {
69+
switch file.(type) {
70+
case *os.File:
71+
err = file.(*os.File).Close()
72+
}
73+
}(f)
74+
75+
r, err := normalizeBibtex(f.(io.Reader))
4176
if err != nil {
4277
return nil, err
4378
}
79+
entries, err := bibtex.Parse(r)
80+
if err != nil {
81+
return nil, fmt.Errorf("error parsing %v: %w", p, err)
82+
}
4483
res := make([]*Source, len(entries.Entries))
45-
fields := []string{}
84+
var fields []string
4685
for i, entry := range entries.Entries {
4786
res[i] = NewSource(entry)
48-
for name, _ := range entry.Fields {
87+
for name := range entry.Fields {
4988
if !slices.Contains(fields, name) {
5089
fields = append(fields, name)
5190
}
@@ -56,11 +95,14 @@ func NewSources(p string) (*Sources, error) {
5695
}
5796

5897
func (s *Sources) SqlCreate() string {
59-
res := []string{}
98+
var res []string
6099
res = append(res, "CREATE TABLE IF NOT EXISTS `SourceTable` (")
61100
res = append(res, "\t`id`\tTEXT,")
62101
res = append(res, "\t`genre`\tTEXT,")
63102
for _, field := range s.FieldNames {
103+
if field == "type" || field == "id" {
104+
field += "_"
105+
}
64106
res = append(res, fmt.Sprintf("\t`%s`\tTEXT,", field))
65107
}
66108
res = append(res, "\tPRIMARY KEY(`id`)")
@@ -80,6 +122,9 @@ func (s *Sources) itemsToSql() (rows [][]any, colNames []string, err error) {
80122
rows[i][1] = item.Type
81123

82124
for j, field := range s.FieldNames {
125+
if field == "type" || field == "id" {
126+
field += "_"
127+
}
83128
if i == 0 {
84129
colNames = append(colNames, field)
85130
}

cldf/table.go

Lines changed: 16 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
package cldf
22

33
import (
4-
"archive/zip"
5-
"bytes"
64
"encoding/csv"
75
"encoding/json"
86
"errors"
@@ -41,8 +39,6 @@ type Table struct {
4139
}
4240

4341
func NewTable(jsonTable map[string]interface{}, withSourceTable bool) (tbl *Table, err error) {
44-
// FIXME: We must know whether the dataset has a sources bibfile, because then we'll turn
45-
// columns with canonical name cldf_source into list-valued foreign keys to SourceTable.
4642
var (
4743
dialect *Dialect
4844
trimmer = func(s string) string { return s }
@@ -57,7 +53,7 @@ func NewTable(jsonTable map[string]interface{}, withSourceTable bool) (tbl *Tabl
5753
if err != nil {
5854
return nil, err
5955
}
60-
if col.CanonicalName == "cldf_source" {
56+
if withSourceTable && col.CanonicalName == "cldf_source" {
6157
// remember and store additional foreign key constraint!
6258
fks = append(
6359
fks,
@@ -156,68 +152,26 @@ type TableRead struct {
156152
Err error
157153
}
158154

159-
func readZipped(fp string) (bytes []byte, err error) {
160-
r, err := zip.OpenReader(fp)
161-
if err != nil {
162-
return nil, err
163-
}
164-
defer func(r *zip.ReadCloser) {
165-
err = r.Close()
166-
}(r)
167-
168-
var contentBytes []byte
169-
for _, f := range r.File {
170-
rc, err := f.Open()
171-
if err != nil {
172-
return nil, err
173-
}
174-
contentBytes, err = io.ReadAll(rc)
175-
if err != nil {
176-
return nil, err
177-
}
178-
err = rc.Close()
179-
if err != nil {
180-
return nil, err
181-
} // Must close each file reader individually
182-
break
183-
}
184-
return contentBytes, nil
185-
}
186-
187155
func (tbl *Table) Read(dir string, dialect *Dialect, noChecks bool, ch chan<- TableRead) {
188156
var reader *csv.Reader
189157
fp := filepath.Join(dir, tbl.Url)
190-
zipped := false
191-
if !pathutil.PathExists(fp) {
192-
fp += ".zip"
193-
zipped = true
194-
}
195158
var (
196159
rows [][]string
197160
err error
198161
)
199-
if zipped {
200-
zippedBytes, err := readZipped(fp)
201-
if err != nil {
202-
ch <- TableRead{tbl.Url, err}
203-
return
204-
}
205-
reader = csv.NewReader(bytes.NewReader(zippedBytes))
206-
} else {
207-
file, err := os.Open(fp)
208-
if err != nil {
209-
ch <- TableRead{tbl.Url, err}
210-
return
211-
}
212-
defer func(file *os.File) {
213-
err := file.Close()
214-
if err != nil {
215-
ch <- TableRead{tbl.Url, err}
216-
return
217-
}
218-
}(file)
219-
reader = csv.NewReader(file)
162+
r, err := pathutil.Reader(fp)
163+
if err != nil {
164+
ch <- TableRead{tbl.Url, err}
165+
return
220166
}
167+
defer func(file any) {
168+
switch file.(type) {
169+
case *os.File:
170+
err = file.(*os.File).Close()
171+
}
172+
}(r)
173+
reader = csv.NewReader(r.(io.Reader))
174+
221175
if tbl.Dialect != nil {
222176
dialect = tbl.Dialect
223177
}
@@ -296,9 +250,8 @@ func (tbl *Table) associationTableRowsToSql(
296250
tpk string
297251
colName string
298252
)
299-
nameToCol := tbl.nameToCol()
300253
stable := tbl.CanonicalName
301-
spk := nameToCol[tbl.PrimaryKey[0]].CanonicalName
254+
spk := tbl.nameToCol()[tbl.PrimaryKey[0]].CanonicalName
302255

303256
if fk.Reference.Resource == "SourceTable" {
304257
ttable = "SourceTable"
@@ -310,8 +263,8 @@ func (tbl *Table) associationTableRowsToSql(
310263
panic("not found " + fk.Reference.Resource)
311264
}
312265
ttable = ttable_.CanonicalName
313-
tpk = nameToCol[ttable_.PrimaryKey[0]].CanonicalName
314-
colName = nameToCol[fk.ColumnReference[0]].CanonicalName
266+
tpk = ttable_.nameToCol()[ttable_.PrimaryKey[0]].CanonicalName
267+
colName = tbl.nameToCol()[fk.ColumnReference[0]].CanonicalName
315268
}
316269

317270
colNames = []string{

cmd/createdb.go

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,28 +2,20 @@ package cmd
22

33
import (
44
"database/sql"
5-
"errors"
65
"fmt"
76
"gocldf/cldf"
87
"gocldf/internal/dbutil"
98
"gocldf/internal/pathutil"
109
"io"
11-
"os"
1210
"slices"
1311

1412
"github.com/spf13/cobra"
1513
)
1614

1715
func createdb(out io.Writer, mdPath string, dbPath string, overwrite bool, noChecks bool) (err error) {
18-
if pathutil.PathExists(dbPath) {
19-
if overwrite {
20-
err = os.Remove(dbPath)
21-
if err != nil {
22-
return err
23-
}
24-
} else {
25-
return errors.New("database already exists")
26-
}
16+
dbPath, err = pathutil.GetFreshPath(dbPath, overwrite)
17+
if err != nil {
18+
return err
2719
}
2820
ds, err := cldf.GetLoadedDataset(mdPath, noChecks)
2921
if err != nil {
@@ -40,7 +32,10 @@ func createdb(out io.Writer, mdPath string, dbPath string, overwrite bool, noChe
4032
return err
4133
}
4234
for _, tData := range tableData { // ... and the data.
43-
dbutil.BatchInsert(tx, tData.TableName, tData.ColNames, tData.Rows)
35+
err = dbutil.BatchInsert(tx, tData.TableName, tData.ColNames, tData.Rows)
36+
if err != nil {
37+
return err
38+
}
4439
}
4540
return nil
4641
})

0 commit comments

Comments
 (0)