Skip to content

Commit 0f63dd0

Browse files
authored
Merge pull request #54 from bxparks/develop
merge 1.2 into master
2 parents 6e2c62d + 9120c81 commit 0f63dd0

File tree

7 files changed

+121
-20
lines changed

7 files changed

+121
-20
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# Changelog
22

33
* Unreleased
4+
* 1.2 (2020-10-27)
5+
* Print full path of nested JSON elements in error messages (See #52;
6+
thanks abroglesc@).
47
* 1.1 (2020-07-10)
58
* Add `--ignore_invalid_lines` to ignore parsing errors on invalid lines
69
and continue processing. Fixes

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ tests:
66
python3 -m unittest
77

88
flake8:
9-
flake8 . \
9+
flake8 bigquery_schema_generator \
1010
--count \
1111
--ignore W503 \
1212
--show-source \

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ $ generate-schema < file.data.json > file.schema.json
1212
$ generate-schema --input_format csv < file.data.csv > file.schema.json
1313
```
1414

15-
Version: 1.1 (2020-07-10)
15+
Version: 1.2 (2020-10-27)
1616

1717
Changelog: [CHANGELOG.md](CHANGELOG.md)
1818

@@ -723,6 +723,7 @@ now requires Python 3.6 or higher, I think mostly due to the use of f-strings.
723723

724724
I have tested it on:
725725

726+
* Ubuntu 20.04, Python 3.8.5
726727
* Ubuntu 18.04, Python 3.7.7
727728
* Ubuntu 18.04, Python 3.6.7
728729
* Ubuntu 17.10, Python 3.6.3
@@ -745,6 +746,8 @@ and 3.8.
745746
* Sanitizing of column names to valid BigQuery characters and length by Jon
746747
Warghed (jonwarghed@).
747748
* Bug fix in `--sanitize_names` by Riccardo M. Cefala (riccardomc@).
749+
* Print full path of nested JSON elements in error messages, by Austin Brogle
750+
(abroglesc@).
748751

749752

750753
## License

bigquery_schema_generator/generate_schema.py

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,10 @@ def deduce_schema(self, file):
200200

201201
# Deduce the schema from this given data record.
202202
if isinstance(json_object, dict):
203-
self.deduce_schema_for_line(json_object, schema_map)
203+
self.deduce_schema_for_line(
204+
json_object=json_object,
205+
schema_map=schema_map,
206+
)
204207
elif isinstance(json_object, Exception):
205208
self.log_error(
206209
f'Record could not be parsed: Exception: {json_object}')
@@ -218,20 +221,35 @@ def deduce_schema(self, file):
218221

219222
return schema_map, self.error_logs
220223

221-
def deduce_schema_for_line(self, json_object, schema_map):
224+
def deduce_schema_for_line(self, json_object, schema_map, base_path=None):
222225
"""Figures out the BigQuery schema for the given 'json_object' and
223226
updates 'schema_map' with the latest info. A 'schema_map' entry of type
224227
'soft' is a provisional entry that can be overwritten by a subsequent
225228
'soft' or 'hard' entry. If both the old and new have the same type,
226229
then they must be compatible.
230+
231+
'base_path' is the string representing the current path within the
232+
nested record that leads to this specific entry.
227233
"""
228234
for key, value in json_object.items():
229235
schema_entry = schema_map.get(key)
230-
new_schema_entry = self.get_schema_entry(key, value)
231-
schema_map[key] = self.merge_schema_entry(schema_entry,
232-
new_schema_entry)
233-
234-
def merge_schema_entry(self, old_schema_entry, new_schema_entry):
236+
new_schema_entry = self.get_schema_entry(
237+
key=key,
238+
value=value,
239+
base_path=base_path,
240+
)
241+
schema_map[key] = self.merge_schema_entry(
242+
old_schema_entry=schema_entry,
243+
new_schema_entry=new_schema_entry,
244+
base_path=base_path,
245+
)
246+
247+
def merge_schema_entry(
248+
self,
249+
old_schema_entry,
250+
new_schema_entry,
251+
base_path=None,
252+
):
235253
"""Merges the 'new_schema_entry' into the 'old_schema_entry' and return
236254
a merged schema entry. Recursively merges in sub-fields as well.
237255
@@ -240,6 +258,10 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
240258
returned as the new schema_entry. Returns None if the field should
241259
be removed from the schema due to internal consistency errors.
242260
261+
'base_path' is the string representing the current path within the
262+
nested record that leads to this specific entry. This is used during
263+
error logging.
264+
243265
An Exception is thrown if an unexpected programming error is detected.
244266
The calling routine should stop processing the file.
245267
"""
@@ -310,50 +332,71 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
310332
new_fields = new_info['fields']
311333
for key, new_entry in new_fields.items():
312334
old_entry = old_fields.get(key)
313-
old_fields[key] = self.merge_schema_entry(old_entry, new_entry)
335+
new_base_path = json_full_path(base_path, old_name)
336+
old_fields[key] = self.merge_schema_entry(
337+
old_schema_entry=old_entry,
338+
new_schema_entry=new_entry,
339+
base_path=new_base_path,
340+
)
314341
return old_schema_entry
315342

343+
full_old_name = json_full_path(base_path, old_name)
344+
full_new_name = json_full_path(base_path, new_name)
345+
316346
# For all other types, the old_mode must be the same as the new_mode. It
317347
# might seem reasonable to allow a NULLABLE {primitive_type} to be
318348
# upgraded to a REPEATED {primitive_type}, but currently 'bq load' does
319349
# not support that so we must also follow that rule.
320350
if old_mode != new_mode:
321351
self.log_error(
322352
f'Ignoring non-RECORD field with mismatched mode: '
323-
f'old=({old_status},{old_name},{old_mode},{old_type}); '
324-
f'new=({new_status},{new_name},{new_mode},{new_type})')
353+
f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
354+
f'new=({new_status},{full_new_name},{new_mode},{new_type})')
325355
return None
326356

327357
# Check that the converted types are compatible.
328358
candidate_type = convert_type(old_type, new_type)
329359
if not candidate_type:
330360
self.log_error(
331361
f'Ignoring field with mismatched type: '
332-
f'old=({old_status},{old_name},{old_mode},{old_type}); '
333-
f'new=({new_status},{new_name},{new_mode},{new_type})')
362+
f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
363+
f'new=({new_status},{full_new_name},{new_mode},{new_type})')
334364
return None
335365

336366
new_info['type'] = candidate_type
337367
return new_schema_entry
338368

339-
def get_schema_entry(self, key, value):
369+
def get_schema_entry(self, key, value, base_path=None):
340370
"""Determines the 'schema_entry' of the (key, value) pair. Calls
341371
deduce_schema_for_line() recursively if the value is another object
342372
instead of a primitive (this will happen only for JSON input file).
373+
374+
'base_path' is the string representing the current path within the
375+
nested record that leads to this specific entry.
343376
"""
344377
value_mode, value_type = self.infer_bigquery_type(value)
345378
if not value_mode or not value_type:
346379
return None
347380

348381
if value_type == 'RECORD':
382+
new_base_path = json_full_path(base_path, key)
349383
# recursively figure out the RECORD
350384
fields = OrderedDict()
351385
if value_mode == 'NULLABLE':
352-
self.deduce_schema_for_line(value, fields)
386+
self.deduce_schema_for_line(
387+
json_object=value,
388+
schema_map=fields,
389+
base_path=new_base_path,
390+
)
353391
else:
354392
for val in value:
355-
self.deduce_schema_for_line(val, fields)
356-
# yapf: disable
393+
self.deduce_schema_for_line(
394+
json_object=val,
395+
schema_map=fields,
396+
base_path=new_base_path,
397+
)
398+
399+
# yapf: disable
357400
schema_entry = OrderedDict([
358401
('status', 'hard'),
359402
('filled', True),
@@ -539,7 +582,8 @@ def flatten_schema(self, schema_map):
539582
keep_nulls=self.keep_nulls,
540583
sorted_schema=self.sorted_schema,
541584
infer_mode=self.infer_mode,
542-
sanitize_names=self.sanitize_names)
585+
sanitize_names=self.sanitize_names,
586+
)
543587

544588
def run(self, input_file=sys.stdin, output_file=sys.stdout):
545589
"""Read the data records from the input_file and print out the BigQuery
@@ -745,6 +789,17 @@ def flatten_schema_map(
745789
return schema
746790

747791

792+
def json_full_path(base_path, key):
793+
"""Return the dot-separated JSON full path to a particular key.
794+
e.g. 'server.config.port'. Column names in CSV files are never nested,
795+
so this will always return `key`.
796+
"""
797+
if base_path is None or base_path == "":
798+
return key
799+
else:
800+
return f'{base_path}.{key}'
801+
802+
748803
def main():
749804
# Configure command line flags.
750805
parser = argparse.ArgumentParser(
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '1.1'
1+
__version__ = '1.2'

tests/test_generate_schema.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from bigquery_schema_generator.generate_schema import SchemaGenerator
2323
from bigquery_schema_generator.generate_schema import is_string_type
2424
from bigquery_schema_generator.generate_schema import convert_type
25+
from bigquery_schema_generator.generate_schema import json_full_path
2526
from .data_reader import DataReader
2627

2728

@@ -414,6 +415,15 @@ def test_run_with_invalid_input_throws_exception(self):
414415
with self.assertRaises(Exception):
415416
generator.run(input, output)
416417

418+
def test_json_full_path(self):
419+
self.assertEqual('port', json_full_path(None, 'port'))
420+
self.assertEqual('port', json_full_path("", 'port'))
421+
422+
# 'base_path' should never be '0', but if it is, do something reasonable.
423+
self.assertEqual('0.port', json_full_path(0, 'port'))
424+
425+
self.assertEqual('server.port', json_full_path('server', 'port'))
426+
417427

418428
class TestFromDataFile(unittest.TestCase):
419429
"""Read the test case data from TESTDATA_FILE and verify that the expected

tests/testdata.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,36 @@ SCHEMA
732732
]
733733
END
734734

735+
# Incompatible types error printing full path
736+
# given
737+
DATA
738+
{"source_machine":{"port":80},"dest_machine":{"port":80}}
739+
{"source_machine":{"port":80},"dest_machine":{"port":"http-port"}}
740+
ERRORS
741+
2: Ignoring field with mismatched type: old=(hard,dest_machine.port,NULLABLE,INTEGER); new=(hard,dest_machine.port,NULLABLE,STRING)
742+
SCHEMA
743+
[
744+
{
745+
"fields": [],
746+
"mode": "NULLABLE",
747+
"name": "dest_machine",
748+
"type": "RECORD"
749+
},
750+
{
751+
"fields": [
752+
{
753+
"mode": "NULLABLE",
754+
"name": "port",
755+
"type": "INTEGER"
756+
}
757+
],
758+
"mode": "NULLABLE",
759+
"name": "source_machine",
760+
"type": "RECORD"
761+
}
762+
]
763+
END
764+
735765
# Simple CSV file
736766
DATA csv
737767
name,surname,age

0 commit comments

Comments
 (0)