Skip to content

Commit 1ea8710

Browse files
initial commit
1 parent e7eed9a commit 1ea8710

20 files changed

+1373
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
*.dll
44
*.so
55
*.dylib
6+
bigboy
7+
debug
68

79
# Test binary, build with `go test -c`
810
*.test

.vscode/launch.json

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{
2+
"version": "0.2.0",
3+
"configurations": [
4+
{
5+
"name": "Launch",
6+
"type": "go",
7+
"request": "launch",
8+
"mode": "debug",
9+
"remotePath": "",
10+
"port": 2345,
11+
"host": "127.0.0.1",
12+
"program": "${fileDirname}",
13+
"env": {
14+
"GOPATH": "c:/Users/Dave/go"
15+
},
16+
"cwd": "../bigdata",
17+
"showLog": true
18+
}
19+
]
20+
}

README.md

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# Bigboy
2+
Extract data from SQL Server, PostgreSQL, or MySQL, transforming SQL-to-JSON or JSON-to-JSON.
3+
4+
Written by Dave Templin
5+
6+
# Overview
7+
Bigboy is a tool that extracts data from SQL Server, PostgreSQL, or MySQL databases and transforms SQL-to-JSON or JSON-to-JSON; basically performing the **E** and the **T** part of **ETL** *(Extract/Transform/Load)*. The tool provides a simple model for configuring SQL extraction queries and optionally JavaScript functions for transformations. A simple but powerful command-line interface (CLI) makes it easy to perform both ad hoc and batch processing scenarios (BASH, CRON, etc.). The tool is also designed to maximize available local compute resources to extract and transform massive data volumes in a time-efficient way.
8+
9+
## Features
10+
* Extract data from SQL Server, PostgreSQL, or MySQL
11+
* Perform SQL-to-JSON or JSON-to-JSON transformations
12+
* Nest rows to form complex hierarchical (or document-oriented) data
13+
* Leverage JavaScript functions to perform arbitrarily complex data transformations
14+
* Define command driven parameters to create dynamic queries and scripts
15+
* Combine data from multiple different database sources
16+
* Apply timezone to dates stored without a timezone
17+
* Configure the tool to maximize local compute resources and minimize processing time
18+
19+
## Quickstart
20+
21+
22+
23+
# Concepts
24+
25+
## Connections
26+
27+
## Targets
28+
29+
## Fetching and Prefetching
30+
fetch, prefetch
31+
32+
## Transforms
33+
nest, script, split, timezone
34+
35+
36+
37+
# Reference
38+
39+
## Command Arguments
40+
41+
* `-e` Maximum overall number of errors before aborting *(default=100)*
42+
* `-n` Include nulls in output *(default=false)*
43+
* `-o` Output directory *(creates \"out\" directory if not specified)*
44+
* `-p` Number of rows per page extracted *(default=1000)*
45+
* `-q` Suppress informational output *(default=false)*
46+
* `-r` Number of consecutive errors before aborting *(default=3)*
47+
* `-v` Print version info about bigboy and exit
48+
* `-w` Number of background workers *(default=4)*
49+
50+
> Above defaults can also be configured in the `config.json` file.
51+
52+
## config.json
53+
This section describes the `config.json` file format.
54+
55+
| Name | Description |
56+
| --- | --- |
57+
| `connections` | ... |
58+
| `errors` | ... |
59+
| `nulls` | ... |
60+
| `page` | ... |
61+
| `quiet` | ... |
62+
| `retries` | ... |
63+
| `workers` | ... |
64+
65+
### connections
66+
| Name | Description |
67+
| --- | --- |
68+
| `driver` | ... |
69+
| `server` | ... |
70+
| `database` | ... |
71+
| `dsn` | ... |
72+
| `port` | ... |
73+
| `user` | ... |
74+
| `password` | ... |
75+
| `max` | ... |
76+
| `timezone` | ... |
77+
78+
79+
## target.json
80+
This section describes the `target.json` file format.
81+
82+
| Name | Description |
83+
| --- | --- |
84+
| `connection` | ... |
85+
| `fetch` | ... |
86+
| `params` | ... |
87+
| `prefetch` | ... |
88+
| `nest` | ... |
89+
| `script` | ... |
90+
| `split` | ... |
91+
| `timezone` | ... |
92+
93+
### nest
94+
| Name | Description |
95+
| --- | --- |
96+
| `connection` | ... |
97+
| `childKey` | ... |
98+
| `parentKey` | ... |
99+
| `fetch` | ... |
100+
| `timezone` | ... |
101+
102+
### param
103+
| Name | Description |
104+
| --- | --- |
105+
| `name` | ... |
106+
| `type` | ... |
107+
| `default` | ... |
108+
109+
### split
110+
| Name | Description |
111+
| --- | --- |
112+
| `by` | ... |
113+
| `value` | ... |
114+
115+
116+
## Date Format
117+
118+
All dates are assumed to be in GMT unless a timezone is specified.
119+
If a time is not specified then midnight GMT is assumed.
120+
Examples below illustrate various scenarios of specifying a date or date-range.
121+
122+
The following examples assume there is a target named `log` with a single parameter of type `date` representing a start date for the extraction.
123+
124+
| Example | Comments
125+
| ------------------------------------- | ------------------------------------------------------- |
126+
| `bigboy log 2017-07-21` | 7/21/2017 at midnight GMT
127+
| `bigboy log "2017-07-21 15:00:00"` | 7/21/2017 at 3pm GMT
128+
| `bigboy log today` | Midnight GMT of the current day
129+
| `bigboy log yesterday` | Midnight GMT of the previous day
130+
131+
The following examples assume there is a target named `sales` with two parameters of type `date` representing a date range for the extraction.
132+
133+
| Example | Comments
134+
| ------------------------------------- | ------------------------------------------------------- |
135+
| `bigboy sales 2017-07-21 2017-07-23` | From 7/21/2017 to 7/23/2017 midnight-to-midnight GMT
136+
| `bigboy sales 2017-07-21 2d` | Midnight GMT of the previous day.
137+
138+
139+
> The time zone database needed by LoadLocation may not be present on all systems, especially non-Unix systems. LoadLocation looks in the directory or uncompressed zip file named by the ZONEINFO environment variable, if any, then looks in known installation locations on Unix systems, and finally looks in $GOROOT/lib/time/zoneinfo.zip.
140+
141+
142+
143+
# Build
144+
145+
Install [golang](https://golang.org/dl/)
146+
147+
```
148+
$ go get github.com/denisenkom/go-mssqldb
149+
$ go get github.com/lib/pq
150+
$ go get github.com/go-sql-driver/mysql
151+
$ git clone https://github.com/davetemplin/bigboy.git
152+
$ go build
153+
```
154+
155+
## Cross compile
156+
```
157+
$ build windows
158+
$ build linux
159+
$ build mac
160+
```
161+
162+
163+
164+
# References
165+
166+
There are lots of ways to approach ETL, and lots of vendors that want to sell you a solution!
167+
Here are some additional references that may be helpful...
168+
169+
* [Wikipedia article on ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load)
170+
* [Performing ETL from a Relational Database into BigQuery](https://cloud.google.com/solutions/performing-etl-from-relational-database-into-bigquery)
171+
* [ETL Software: Top 63](https://www.predictiveanalyticstoday.com/top-free-extract-transform-load-etl-software/)

build.cmd

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
@ECHO OFF
REM Cross-compiles the project in the current directory with "go build".
REM Usage: build TARGET, where TARGET is windows, linux, or mac.
REM Exit code: the exit code of "go build", or 1 when TARGET is missing
REM or not recognized, so calling scripts can detect the failure.

REM SETLOCAL keeps GOOS/GOARCH from leaking into the caller's environment;
REM the localization ends implicitly when the script exits.
SETLOCAL

REM %~1 strips surrounding quotes so both  build linux  and  build "linux"  work.
SET "TARGET=%~1"
SET "GOOS="
IF "%TARGET%"=="windows" SET "GOOS=windows"
IF "%TARGET%"=="linux" SET "GOOS=linux"
REM Go names the macOS platform "darwin".
IF "%TARGET%"=="mac" SET "GOOS=darwin"
IF NOT DEFINED GOOS GOTO error

SET "GOARCH=amd64"
go build
EXIT /B %ERRORLEVEL%

:error
ECHO Specify build target: windows, linux, or mac
EXIT /B 1

config.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"io/ioutil"
6+
)
7+
8+
// Config ...
9+
type Config struct {
10+
Connections map[string]*Connection `json:"connections"`
11+
Errors uint64 `json:"errors"`
12+
Nulls bool `json:"nulls"`
13+
Page int `json:"page"`
14+
Quiet bool `json:"quiet"`
15+
Retries uint64 `json:"retries"`
16+
Workers int `json:"workers"`
17+
}
18+
19+
var config Config
20+
const undefined = ^uint64(0)
21+
22+
func loadConfig() {
23+
config.Errors = undefined
24+
config.Retries = undefined
25+
26+
name := "config.json"
27+
if fileExists(name) {
28+
buffer, err := ioutil.ReadFile(name)
29+
check(err)
30+
json.Unmarshal(buffer, &config)
31+
}
32+
33+
if config.Errors == undefined {
34+
config.Errors = 100
35+
}
36+
if config.Page == 0 {
37+
config.Page = 1000
38+
}
39+
if config.Retries == undefined {
40+
config.Retries = 3
41+
}
42+
if config.Workers == 0 {
43+
config.Workers = 4
44+
}
45+
}

connection.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package main
2+
3+
import (
4+
"database/sql"
5+
"fmt"
6+
"strings"
7+
"time"
8+
_ "github.com/denisenkom/go-mssqldb"
9+
_ "github.com/lib/pq"
10+
_ "github.com/go-sql-driver/mysql"
11+
)
12+
13+
// Connection describes one entry of the "connections" section of the
// configuration, together with the runtime state attached to it by connect.
type Connection struct {
	// Driver is the database/sql driver name passed to sql.Open; formatDsn
	// can generate a DSN for "mssql", "postgres" and "mysql".
	Driver string `json:"driver"`

	// Server is the database server, used when generating a DSN.
	Server string `json:"server"`

	// Database is the database name, used when generating a DSN.
	Database string `json:"database"`

	// Dsn is an explicit data source name. When it starts with "..." the
	// remainder is appended to the generated DSN instead of replacing it.
	Dsn string `json:"dsn"`

	// Port is the server port, used when generating a DSN.
	Port int `json:"port"`

	// User is the login name, used when generating a DSN.
	User string `json:"user"`

	// Password is the login password, used when generating a DSN.
	Password string `json:"password"`

	// Timezone is a location name given to time.LoadLocation by connect.
	Timezone string `json:"timezone"`

	// Max, when non-zero, is applied via SetMaxOpenConns by connect.
	Max int `json:"max"`

	// db is the open handle; nil until connect succeeds and after disconnect.
	db *sql.DB

	// location is the loaded Timezone; nil when Timezone is empty.
	location *time.Location
}
27+
28+
func connect(key string) *Connection {
29+
connection, ok := config.Connections[key]
30+
31+
if !ok {
32+
stop(fmt.Sprintf("Invalid connection key: '%s'", key), 1)
33+
}
34+
35+
dsn := formatDsn(connection)
36+
if dsn == "" {
37+
stop(fmt.Sprintf("Invalid driver specified for connection: '%s'", key), 1)
38+
}
39+
40+
if connection.db == nil {
41+
var err error
42+
connection.db, err = sql.Open(connection.Driver, dsn)
43+
check(err)
44+
45+
err = connection.db.Ping()
46+
if err != nil {
47+
stop(fmt.Sprintf("Unable to establish connection to server \"%s\"", connection.Server), 3)
48+
}
49+
50+
if connection.Max != 0 {
51+
connection.db.SetMaxOpenConns(connection.Max)
52+
}
53+
54+
if connection.Timezone != "" {
55+
connection.location, err = time.LoadLocation(connection.Timezone)
56+
check(err)
57+
}
58+
}
59+
60+
return connection
61+
}
62+
63+
func disconnect() {
64+
for _, connection := range config.Connections {
65+
if connection.db != nil {
66+
err := connection.db.Close()
67+
check(err)
68+
connection.db = nil
69+
}
70+
}
71+
}
72+
73+
func formatDsn(connection *Connection) string {
74+
if connection.Dsn != "" && !strings.HasPrefix(connection.Dsn, "...") {
75+
return connection.Dsn
76+
}
77+
78+
var dsn string
79+
if connection.Driver == "mssql" {
80+
dsn = fmt.Sprintf("server=%s;user id=%s;password=%s;port=%d",
81+
connection.Server,
82+
connection.User,
83+
connection.Password,
84+
connection.Port)
85+
} else if connection.Driver == "postgres" {
86+
dsn = fmt.Sprintf("host=%s user=%s password='%s' port=%d dbname=%s",
87+
strings.Replace(connection.Server, " ", "\\ ", -1),
88+
strings.Replace(connection.User, " ", "\\ ", -1),
89+
strings.Replace(connection.Password, "'", "\\'", -1),
90+
connection.Port,
91+
strings.Replace(connection.Database, " ", "\\ ", -1))
92+
} else if connection.Driver == "mysql" {
93+
dsn = fmt.Sprintf("%s:%s@%s/%s",
94+
connection.User,
95+
connection.Password,
96+
connection.Server,
97+
connection.Database)
98+
}
99+
100+
if strings.HasPrefix(connection.Dsn, "...") {
101+
dsn += connection.Dsn[3:]
102+
}
103+
104+
return dsn
105+
}

constants.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
package main
2+
3+
// version is the bigboy version string.
// NOTE(review): presumably what the -v flag prints — confirm against the flag handling code.
const version = "1.0.0"

0 commit comments

Comments
 (0)