@@ -136,13 +136,15 @@ def deduce_schema(self, input_data, *, schema_map=None):
136136 key: schema_entry
137137 }
138138
139- The 'key' is the name of the table column.
139+ The 'key' is the canonical column name, which is set to be the
140+ lower-cased version of the sanitized key because BigQuery column
141+ names are case-insensitive.
140142
141143 schema_entry := {
142- 'status': 'hard | soft',
144+ 'status': 'hard | soft | ignore ',
143145 'filled': True | False,
144146 'info': {
145- 'name': key ,
147+ 'name': column_name ,
146148 'type': 'STRING | TIMESTAMP | DATE | TIME
147149 | FLOAT | INTEGER | BOOLEAN | RECORD'
148150 'mode': 'NULLABLE | REQUIRED | REPEATED',
@@ -160,6 +162,13 @@ def deduce_schema(self, input_data, *, schema_map=None):
160162 'hard'. The status can transition from 'soft' to 'hard' but not the
161163 reverse.
162164
165+ The status of 'ignore' identifies a column where the type of one record
166+ conflicts with the type of another record. The column will be omitted
167+ from the final JSON schema. (Early versions of this script *removed* the
168+ offending column entry completely upon the first mismatch, but that
169+ caused subsequent records to recreate the schema entry, which would be
170+ incorrect.)
171+
163172 The 'filled' entry indicates whether all input data records contained
164173 the given field. If the --infer_mode flag is given, the 'filled' entry
165174 is used to convert a NULLABLE schema entry to a REQUIRED schema entry or
@@ -277,8 +286,7 @@ def merge_schema_entry(
277286
278287 Returns the merged schema_entry. This method assumes that both
279288 'old_schema_entry' and 'new_schema_entry' can be modified in place and
280- returned as the new schema_entry. Returns None if the field should
281- be removed from the schema due to internal consistency errors.
289+ returned as the new schema_entry.
282290
283291 'base_path' is the string representing the current path within the
284292 nested record that leads to this specific entry. This is used during
@@ -302,13 +310,19 @@ def merge_schema_entry(
302310 old_status = old_schema_entry ['status' ]
303311 new_status = new_schema_entry ['status' ]
304312
305- # new 'soft' does not clobber old 'hard'
313+ # If the field was previously determined to be inconsistent, hence set
314+ # to 'ignore', do nothing and return immediately.
315+ if old_status == 'ignore' :
316+ return old_schema_entry
317+
318+ # new 'soft' retains the old 'hard'
306319 if old_status == 'hard' and new_status == 'soft' :
307320 mode = self .merge_mode (old_schema_entry ,
308321 new_schema_entry ,
309322 base_path )
310323 if mode is None :
311- return None
324+ old_schema_entry ['status' ] = 'ignore'
325+ return old_schema_entry
312326 old_schema_entry ['info' ]['mode' ] = mode
313327 return old_schema_entry
314328
@@ -318,7 +332,8 @@ def merge_schema_entry(
318332 new_schema_entry ,
319333 base_path )
320334 if mode is None :
321- return None
335+ old_schema_entry ['status' ] = 'ignore'
336+ return old_schema_entry
322337 new_schema_entry ['info' ]['mode' ] = mode
323338 return new_schema_entry
324339
@@ -389,7 +404,8 @@ def merge_schema_entry(
389404 new_schema_entry ,
390405 base_path )
391406 if new_mode is None :
392- return None
407+ old_schema_entry ['status' ] = 'ignore'
408+ return old_schema_entry
393409 new_schema_entry ['info' ]['mode' ] = new_mode
394410
395411 # For all other types...
@@ -402,7 +418,8 @@ def merge_schema_entry(
402418 f'old=({ old_status } ,{ full_old_name } ,{ old_mode } ,{ old_type } );'
403419 f' new=({ new_status } ,{ full_new_name } ,{ new_mode } ,{ new_type } )'
404420 )
405- return None
421+ old_schema_entry ['status' ] = 'ignore'
422+ return old_schema_entry
406423
407424 new_info ['type' ] = candidate_type
408425 return new_schema_entry
@@ -414,6 +431,11 @@ def merge_mode(self, old_schema_entry, new_schema_entry, base_path):
414431 flag), because REQUIRED is created only in the flatten_schema()
415432 method. Therefore, a NULLABLE->REQUIRED transition cannot occur.
416433
434+ Returns the merged mode.
435+
436+ Returning None means that the modes of old_schema_entry and
437+ new_schema_entry are not compatible.
438+
417439 We have the following sub cases for the REQUIRED -> NULLABLE
418440 transition:
419441
@@ -425,8 +447,6 @@ def merge_mode(self, old_schema_entry, new_schema_entry, base_path):
425447 REQUIRED -> NULLABLE transition.
426448 b) If --infer_mode is not given, then we log an error and ignore
427449 this field from the schema.
428-
429- Returning a 'None' causes the field to be dropped from the schema.
430450 """
431451 old_info = old_schema_entry ['info' ]
432452 new_info = new_schema_entry ['info' ]
@@ -778,6 +798,10 @@ def convert_type(atype, btype):
778798 * [Q]FLOAT + [Q]INTEGER => FLOAT (except QFLOAT + QINTEGER)
779799 * (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) +
780800 (DATE, TIME, TIMESTAMP, QBOOLEAN, QINTEGER, QFLOAT, STRING) => STRING
801+
802+ The "Q" refers to the quoted (i.e. string) versions of the various types,
803+ which are needed to emulate the type inference inside quoted strings
804+ performed by BigQuery.
781805 """
782806 # type + type => type
783807 if atype == btype :
@@ -884,6 +908,11 @@ def flatten_schema_map(
884908 filled = meta ['filled' ]
885909 info = meta ['info' ]
886910
911+ # An 'ignore' status means different records had different types for
912+ # this field, so it should be ignored.
913+ if status == 'ignore' :
914+ continue
915+
887916 # Schema entries with a status of 'soft' are caused by 'null' or
888917 # empty fields. Don't print those out if the 'keep_nulls' flag is
889918 # False.
0 commit comments