Skip to content

Commit 169d8f2

Browse files
committed
update logging and transform fields
1 parent d3e5588 commit 169d8f2

1 file changed

Lines changed: 12 additions & 2 deletions

File tree

  • academic-observatory-workflows/academic_observatory_workflows/openalex_telescope

academic-observatory-workflows/academic_observatory_workflows/openalex_telescope/tasks.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,13 @@ def transform_file(download_path: str, transform_path: str) -> Tuple[OrderedDict
520520
)
521521
schema_generator.deduce_schema_for_record(obj, schema_map)
522522
except Exception as e:
523-
logging.warning(f"Schema deduction error in {download_path}: {e}")
524-
523+
# Find which fields in this record are empty arrays (null values are not collected here)
524+
offending = [k for k, v in obj_for_schema.items() if isinstance(v, list) and len(v) == 0]
525+
logging.warning(
526+
f" Schema deduction error: {e}\n"
527+
f" record id : {obj.get('id', '?')}\n"
528+
f" empty arrays : {offending}"
529+
)
525530
json.dump(obj, f_out)
526531
f_out.write("\n")
527532

@@ -649,6 +654,11 @@ def transform_object(obj: dict):
649654
convert_field_to_int(safe_get_dict(obj, "apc_list"), "value")
650655
convert_field_to_int(safe_get_dict(obj, "apc_list"), "value_usd")
651656

657+
for val in obj.get("sources", []):
658+
remove_none_from_array(val, "issn")
659+
for val in obj.get("locations", []):
660+
remove_none_from_array(safe_get_dict(val, "source"), "issn")
661+
652662
# Remove empty/null arrays so schema generator never sees untyped empty lists
653663
field = "abstract_inverted_index"
654664
if field in obj:

0 commit comments

Comments
 (0)