Skip to content

Commit 0f63dd0

Browse files
authored
Merge pull request #54 from bxparks/develop
merge 1.2 into master
2 parents 6e2c62d + 9120c81 commit 0f63dd0

File tree

7 files changed

+121
-20
lines changed

7 files changed

+121
-20
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# Changelog
22

33
* Unreleased
4+
* 1.2 (2020-10-27)
5+
* Print full path of nested JSON elements in error messages (See #52;
6+
thanks abroglesc@).
47
* 1.1 (2020-07-10)
58
* Add `--ignore_invalid_lines` to ignore parsing errors on invalid lines
69
and continue processing. Fixes

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ tests:
66
python3 -m unittest
77

88
flake8:
9-
flake8 . \
9+
flake8 bigquery_schema_generator \
1010
--count \
1111
--ignore W503 \
1212
--show-source \

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ $ generate-schema < file.data.json > file.schema.json
1212
$ generate-schema --input_format csv < file.data.csv > file.schema.json
1313
```
1414

15-
Version: 1.1 (2020-07-10)
15+
Version: 1.2 (2020-10-27)
1616

1717
Changelog: [CHANGELOG.md](CHANGELOG.md)
1818

@@ -723,6 +723,7 @@ now requires Python 3.6 or higher, I think mostly due to the use of f-strings.
723723

724724
I have tested it on:
725725

726+
* Ubuntu 20.04, Python 3.8.5
726727
* Ubuntu 18.04, Python 3.7.7
727728
* Ubuntu 18.04, Python 3.6.7
728729
* Ubuntu 17.10, Python 3.6.3
@@ -745,6 +746,8 @@ and 3.8.
745746
* Sanitizing of column names to valid BigQuery characters and length by Jon
746747
Warghed (jonwarghed@).
747748
* Bug fix in `--sanitize_names` by Riccardo M. Cefala (riccardomc@).
749+
* Print full path of nested JSON elements in error messages, by Austin Brogle
750+
(abroglesc@).
748751

749752

750753
## License

bigquery_schema_generator/generate_schema.py

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,10 @@ def deduce_schema(self, file):
200200

201201
# Deduce the schema from this given data record.
202202
if isinstance(json_object, dict):
203-
self.deduce_schema_for_line(json_object, schema_map)
203+
self.deduce_schema_for_line(
204+
json_object=json_object,
205+
schema_map=schema_map,
206+
)
204207
elif isinstance(json_object, Exception):
205208
self.log_error(
206209
f'Record could not be parsed: Exception: {json_object}')
@@ -218,20 +221,35 @@ def deduce_schema(self, file):
218221

219222
return schema_map, self.error_logs
220223

221-
def deduce_schema_for_line(self, json_object, schema_map):
224+
def deduce_schema_for_line(self, json_object, schema_map, base_path=None):
222225
"""Figures out the BigQuery schema for the given 'json_object' and
223226
updates 'schema_map' with the latest info. A 'schema_map' entry of type
224227
'soft' is a provisional entry that can be overwritten by a subsequent
225228
'soft' or 'hard' entry. If both the old and new have the same type,
226229
then they must be compatible.
230+
231+
'base_path' is the string representing the current path within the
232+
nested record that leads to this specific entry.
227233
"""
228234
for key, value in json_object.items():
229235
schema_entry = schema_map.get(key)
230-
new_schema_entry = self.get_schema_entry(key, value)
231-
schema_map[key] = self.merge_schema_entry(schema_entry,
232-
new_schema_entry)
233-
234-
def merge_schema_entry(self, old_schema_entry, new_schema_entry):
236+
new_schema_entry = self.get_schema_entry(
237+
key=key,
238+
value=value,
239+
base_path=base_path,
240+
)
241+
schema_map[key] = self.merge_schema_entry(
242+
old_schema_entry=schema_entry,
243+
new_schema_entry=new_schema_entry,
244+
base_path=base_path,
245+
)
246+
247+
def merge_schema_entry(
248+
self,
249+
old_schema_entry,
250+
new_schema_entry,
251+
base_path=None,
252+
):
235253
"""Merges the 'new_schema_entry' into the 'old_schema_entry' and return
236254
a merged schema entry. Recursively merges in sub-fields as well.
237255
@@ -240,6 +258,10 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
240258
returned as the new schema_entry. Returns None if the field should
241259
be removed from the schema due to internal consistency errors.
242260
261+
'base_path' is the string representing the current path within the
262+
nested record that leads to this specific entry. This is used during
263+
error logging.
264+
243265
An Exception is thrown if an unexpected programming error is detected.
244266
The calling routine should stop processing the file.
245267
"""
@@ -310,50 +332,71 @@ def merge_schema_entry(self, old_schema_entry, new_schema_entry):
310332
new_fields = new_info['fields']
311333
for key, new_entry in new_fields.items():
312334
old_entry = old_fields.get(key)
313-
old_fields[key] = self.merge_schema_entry(old_entry, new_entry)
335+
new_base_path = json_full_path(base_path, old_name)
336+
old_fields[key] = self.merge_schema_entry(
337+
old_schema_entry=old_entry,
338+
new_schema_entry=new_entry,
339+
base_path=new_base_path,
340+
)
314341
return old_schema_entry
315342

343+
full_old_name = json_full_path(base_path, old_name)
344+
full_new_name = json_full_path(base_path, new_name)
345+
316346
# For all other types, the old_mode must be the same as the new_mode. It
317347
# might seem reasonable to allow a NULLABLE {primitive_type} to be
318348
# upgraded to a REPEATED {primitive_type}, but currently 'bq load' does
319349
# not support that so we must also follow that rule.
320350
if old_mode != new_mode:
321351
self.log_error(
322352
f'Ignoring non-RECORD field with mismatched mode: '
323-
f'old=({old_status},{old_name},{old_mode},{old_type}); '
324-
f'new=({new_status},{new_name},{new_mode},{new_type})')
353+
f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
354+
f'new=({new_status},{full_new_name},{new_mode},{new_type})')
325355
return None
326356

327357
# Check that the converted types are compatible.
328358
candidate_type = convert_type(old_type, new_type)
329359
if not candidate_type:
330360
self.log_error(
331361
f'Ignoring field with mismatched type: '
332-
f'old=({old_status},{old_name},{old_mode},{old_type}); '
333-
f'new=({new_status},{new_name},{new_mode},{new_type})')
362+
f'old=({old_status},{full_old_name},{old_mode},{old_type}); '
363+
f'new=({new_status},{full_new_name},{new_mode},{new_type})')
334364
return None
335365

336366
new_info['type'] = candidate_type
337367
return new_schema_entry
338368

339-
def get_schema_entry(self, key, value):
369+
def get_schema_entry(self, key, value, base_path=None):
340370
"""Determines the 'schema_entry' of the (key, value) pair. Calls
341371
deduce_schema_for_line() recursively if the value is another object
342372
instead of a primitive (this will happen only for JSON input file).
373+
374+
'base_path' is the string representing the current path within the
375+
nested record that leads to this specific entry.
343376
"""
344377
value_mode, value_type = self.infer_bigquery_type(value)
345378
if not value_mode or not value_type:
346379
return None
347380

348381
if value_type == 'RECORD':
382+
new_base_path = json_full_path(base_path, key)
349383
# recursively figure out the RECORD
350384
fields = OrderedDict()
351385
if value_mode == 'NULLABLE':
352-
self.deduce_schema_for_line(value, fields)
386+
self.deduce_schema_for_line(
387+
json_object=value,
388+
schema_map=fields,
389+
base_path=new_base_path,
390+
)
353391
else:
354392
for val in value:
355-
self.deduce_schema_for_line(val, fields)
356-
# yapf: disable
393+
self.deduce_schema_for_line(
394+
json_object=val,
395+
schema_map=fields,
396+
base_path=new_base_path,
397+
)
398+
399+
# yapf: disable
357400
schema_entry = OrderedDict([
358401
('status', 'hard'),
359402
('filled', True),
@@ -539,7 +582,8 @@ def flatten_schema(self, schema_map):
539582
keep_nulls=self.keep_nulls,
540583
sorted_schema=self.sorted_schema,
541584
infer_mode=self.infer_mode,
542-
sanitize_names=self.sanitize_names)
585+
sanitize_names=self.sanitize_names,
586+
)
543587

544588
def run(self, input_file=sys.stdin, output_file=sys.stdout):
545589
"""Read the data records from the input_file and print out the BigQuery
@@ -745,6 +789,17 @@ def flatten_schema_map(
745789
return schema
746790

747791

792+
def json_full_path(base_path, key):
793+
"""Return the dot-separated JSON full path to a particular key.
794+
e.g. 'server.config.port'. Column names in CSV files are never nested,
795+
so this will always return `key`.
796+
"""
797+
if base_path is None or base_path == "":
798+
return key
799+
else:
800+
return f'{base_path}.{key}'
801+
802+
748803
def main():
749804
# Configure command line flags.
750805
parser = argparse.ArgumentParser(
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '1.1'
1+
__version__ = '1.2'

tests/test_generate_schema.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from bigquery_schema_generator.generate_schema import SchemaGenerator
2323
from bigquery_schema_generator.generate_schema import is_string_type
2424
from bigquery_schema_generator.generate_schema import convert_type
25+
from bigquery_schema_generator.generate_schema import json_full_path
2526
from .data_reader import DataReader
2627

2728

@@ -414,6 +415,15 @@ def test_run_with_invalid_input_throws_exception(self):
414415
with self.assertRaises(Exception):
415416
generator.run(input, output)
416417

418+
def test_json_full_path(self):
419+
self.assertEqual('port', json_full_path(None, 'port'))
420+
self.assertEqual('port', json_full_path("", 'port'))
421+
422+
# 'base_path' should never be '0', but if it is, do something reasonable.
423+
self.assertEqual('0.port', json_full_path(0, 'port'))
424+
425+
self.assertEqual('server.port', json_full_path('server', 'port'))
426+
417427

418428
class TestFromDataFile(unittest.TestCase):
419429
"""Read the test case data from TESTDATA_FILE and verify that the expected

tests/testdata.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,36 @@ SCHEMA
732732
]
733733
END
734734

735+
# Incompatible types error printing full path
736+
# given
737+
DATA
738+
{"source_machine":{"port":80},"dest_machine":{"port":80}}
739+
{"source_machine":{"port":80},"dest_machine":{"port":"http-port"}}
740+
ERRORS
741+
2: Ignoring field with mismatched type: old=(hard,dest_machine.port,NULLABLE,INTEGER); new=(hard,dest_machine.port,NULLABLE,STRING)
742+
SCHEMA
743+
[
744+
{
745+
"fields": [],
746+
"mode": "NULLABLE",
747+
"name": "dest_machine",
748+
"type": "RECORD"
749+
},
750+
{
751+
"fields": [
752+
{
753+
"mode": "NULLABLE",
754+
"name": "port",
755+
"type": "INTEGER"
756+
}
757+
],
758+
"mode": "NULLABLE",
759+
"name": "source_machine",
760+
"type": "RECORD"
761+
}
762+
]
763+
END
764+
735765
# Simple CSV file
736766
DATA csv
737767
name,surname,age

0 commit comments

Comments
 (0)