Commit f5f8696

Merge pull request #2 from bxparks/develop
Initial release of version 0.1 to PyPI.
2 parents 409bebd + 10a4aec commit f5f8696

File tree

9 files changed: +95 −41 lines changed

.gitignore

Lines changed: 19 additions & 0 deletions

@@ -2,3 +2,22 @@
 __pycache__/
 *.py[cod]
 *$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST

README.md

Lines changed: 18 additions & 34 deletions

@@ -27,10 +27,10 @@ When the auto-detect feature is used, the BigQuery data importer examines only
 the first 100 records of the input data. In many cases, this is sufficient
 because the data records were dumped from another database and the exact schema
 of the source table was known. However, for data extracted from a service
-(e.g. using a REST API) the record fields were organically at later dates. In
-this case, the first 100 records do not contain fields which are present in
-later records. The **bq load** auto-detection fails and the data fails to
-load.
+(e.g. using a REST API) the record fields could have been organically added
+at later dates. In this case, the first 100 records do not contain fields which
+are present in later records. The **bq load** auto-detection fails and the data
+fails to load.

 The **bq load** tool does not support the ability to process the entire dataset
 to determine a more accurate schema. This script fills in that gap. It

@@ -119,19 +119,33 @@ With the ``keep_nulls``, the resulting schema file will be:
 ]
 ```

+Example:
+
+```
+$ generate_schema.py --keep_nulls < file.data.json > file.schema.json
+```
+
 #### Debugging Interval

 By default, the `generate_schema.py` script prints a short progress message
 every 1000 lines of input data. This interval can be changed using the
 `--debugging_interval` flag.

+```
+$ generate_schema.py --debugging_interval 1000 < file.data.json > file.schema.json
+```
+
 #### Debugging Map

 Instead of printing out the BigQuery schema, the `--debugging_map` prints out
 the bookkeeping metadata map which is used internally to keep track of the
 various fields and theirs types that was inferred using the data file. This
 flag is intended to be used for debugging.

+```
+$ generate_schema.py --debugging_map < file.data.json > file.schema.json
+```
+
 ## Examples

 Here is an example of a single JSON data record on the STDIN:

@@ -195,36 +209,6 @@ $ cat file.schema.json
 ]
 ```

-## Unit Tests
-
-Instead of embeddeding the input data records and the expected schema file into
-the `test_generate_schema.py` file, we placed them into the `testdata.txt`
-file. This has two advantages:
-
-* we can more easily update the input and output data records, and
-* the `testdata.txt` data could be reused for versions written in other languages
-
-The output of `test_generate_schema.py` should look something like this:
-```
-----------------------------------------------------------------------
-Ran 4 tests in 0.002s
-
-OK
-Test chunk 1: First record: { "s": null, "a": [], "m": {} }
-Test chunk 2: First record: { "s": null, "a": [], "m": {} }
-Test chunk 3: First record: { "s": "string", "b": true, "i": 1, "x": 3.1, "t": "2017-05-22T17:10:00-07:00" }
-Test chunk 4: First record: { "a": [1, 2], "r": { "r0": "r0", "r1": "r1" } }
-Test chunk 5: First record: { "s": "string", "x": 3.2, "i": 3, "b": true, "a": [ "a", 1] }
-Test chunk 6: First record: { "a": [1, 2] }
-Test chunk 7: First record: { "r" : { "a": [1, 2] } }
-Test chunk 8: First record: { "i": 1 }
-Test chunk 9: First record: { "i": null }
-Test chunk 10: First record: { "i": 3 }
-Test chunk 11: First record: { "i": [1, 2] }
-Test chunk 12: First record: { "r" : { "i": 3 } }
-Test chunk 13: First record: { "r" : [{ "i": 4 }] }
-```
-
 ## System Requirements

 This project was developed on Ubuntu 17.04 using Python 3.5. It is likely
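The "process the entire dataset" idea described in the README diff above can be illustrated with a minimal sketch. This is a hypothetical simplification, not the actual `SchemaGenerator` implementation: the function names (`infer_type`, `scan_records`) are invented here, and the real script's type-merging rules are considerably more elaborate.

```python
import json

def infer_type(value):
    """Map a JSON scalar to a BigQuery-like type name (simplified)."""
    if isinstance(value, bool):   # check bool before int: True is an int in Python
        return 'BOOLEAN'
    if isinstance(value, int):
        return 'INTEGER'
    if isinstance(value, float):
        return 'FLOAT'
    return 'STRING'

def scan_records(lines):
    """Scan ALL newline-delimited JSON records, not just the first 100,
    so fields added organically in later records are still discovered."""
    schema = {}
    for line in lines:
        record = json.loads(line)
        for name, value in record.items():
            if value is None:
                continue  # a null carries no type information
            inferred = infer_type(value)
            if name not in schema:
                schema[name] = inferred
            elif schema[name] == 'INTEGER' and inferred == 'FLOAT':
                schema[name] = 'FLOAT'  # widen when both types are seen
    return schema

lines = [
    '{"name": "alice", "age": 30}',
    '{"name": "bob", "age": 31.5, "city": "SF"}',  # "city" appears only later
]
print(scan_records(lines))
# → {'name': 'STRING', 'age': 'FLOAT', 'city': 'STRING'}
```

A first-100-records scanner would miss `city` entirely if it only appeared after record 100; scanning every record is the gap this project fills.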

bigquery_schema_generator/__init__.py

Whitespace-only changes.

generator/generate_schema.py renamed to bigquery_schema_generator/generate_schema.py

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 #
 # Copyright 2017 Brian T. Park
 #

@@ -18,7 +18,7 @@
 Unlike the BigQuery importer which uses only the first 100 records, this script
 uses all available records in the data file.

-Usage: generator_schema.py [-h] [flags ...] < file.data.json > file.schema.json
+Usage: generate_schema.py [-h] [flags ...] < file.data.json > file.schema.json

 * file.data.json is a newline-delimited JSON data file, one JSON object per line.
 * file.schema.json is the schema definition of the table.

setup.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+from setuptools import setup
+
+# Convert README.md to README.rst because PyPI does not support Markdown.
+try:
+    import pypandoc
+    long_description = pypandoc.convert('README.md', 'rst')
+except OSError:
+    with open('README.md', encoding="utf-8") as f:
+        long_description = f.read()
+
+setup(name='bigquery-schema-generator',
+      version='0.1',
+      description='BigQuery schema generator',
+      long_description=long_description,
+      url='https://github.com/bxparks/bigquery-schema-generator',
+      author='Brian T. Park',
+      author_email='[email protected]',
+      license='Apache 2.0',
+      packages=['bigquery_schema_generator'],
+      python_requires='~=3.5')
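One caveat on the fallback in the new `setup.py`: `pypandoc` raises an `OSError` when the pandoc binary is missing, but importing `pypandoc` when the package itself is not installed raises an `ImportError`, which the `except OSError` clause would not catch. A sketch of a more defensive variant follows; `load_long_description` is a hypothetical helper name, not part of the commit.

```python
def load_long_description(path='README.md'):
    """Prefer reStructuredText via pypandoc; fall back to raw Markdown.

    The broad except is deliberate: a packaging script should degrade
    gracefully whether pypandoc is missing (ImportError), the pandoc
    binary is missing (OSError), or the pypandoc API has changed.
    """
    try:
        import pypandoc
        return pypandoc.convert(path, 'rst')
    except Exception:
        with open(path, encoding='utf-8') as f:
            return f.read()
```

Either branch returns a usable `long_description`, so `setup()` never fails just because the Markdown-to-reST conversion is unavailable.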

tests/README.md

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+# Tests
+
+Instead of embedding the input data records and the expected schema into
+the `test_generate_schema.py` file, we placed them into the `testdata.txt`
+file which is parsed by the unit test program. This has two advantages:
+
+* we can more easily update the input and output data records, and
+* the `testdata.txt` data can be reused for versions written in other languages
+
+The output of `test_generate_schema.py` should look something like this:
+```
+----------------------------------------------------------------------
+Ran 4 tests in 0.002s
+
+OK
+Test chunk 1: First record: { "s": null, "a": [], "m": {} }
+Test chunk 2: First record: { "s": null, "a": [], "m": {} }
+Test chunk 3: First record: { "s": "string", "b": true, "i": 1, "x": 3.1, "t": "2017-05-22T17:10:00-07:00" }
+Test chunk 4: First record: { "a": [1, 2], "r": { "r0": "r0", "r1": "r1" } }
+Test chunk 5: First record: { "s": "string", "x": 3.2, "i": 3, "b": true, "a": [ "a", 1] }
+Test chunk 6: First record: { "a": [1, 2] }
+Test chunk 7: First record: { "r" : { "a": [1, 2] } }
+Test chunk 8: First record: { "i": 1 }
+Test chunk 9: First record: { "i": null }
+Test chunk 10: First record: { "i": 3 }
+Test chunk 11: First record: { "i": [1, 2] }
+Test chunk 12: First record: { "r" : { "i": 3 } }
+Test chunk 13: First record: { "r" : [{ "i": 4 }] }
+```

generator/data_reader.py renamed to tests/data_reader.py

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 #
 # Copyright 2017 Brian T. Park
 #

@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Parses the 'testdata.txt' date file used by the 'generate_schema_test.py'
+Parses the 'testdata.txt' date file used by the 'test_generate_schema.py'
 program.

 Usage:

generator/test_generate_schema.py renamed to tests/test_generate_schema.py

Lines changed: 3 additions & 3 deletions

@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 #
 # Copyright 2017 Brian T. Park
 #

@@ -18,9 +18,9 @@
 import os
 import json
 from collections import OrderedDict
+from bigquery_schema_generator.generate_schema import SchemaGenerator
+from bigquery_schema_generator.generate_schema import sort_schema
 from data_reader import DataReader
-from generate_schema import SchemaGenerator
-from generate_schema import sort_schema


 class TestSchemaGenerator(unittest.TestCase):
File renamed without changes.
