
Commit 0497c64

ravwojdyla authored
zephyr: widen inferred parquet schema via pa.unify_schemas (#5142)
* writers' ``_accumulate_tables`` infers schema from the first ``_MICRO_BATCH_SIZE=8`` records — so if those records have ``None`` for an optional field, the field gets pinned to ``pa.null()`` and later records with real values crash with ``ArrowInvalid: Invalid null value``
* real-world case: ``common-pile/stackv2``'s nested ``metadata.gha_language`` (959 null / 1041 str across ~2000 records) was deterministically failing
* separately, ``pa.Table.from_pylist`` **silently drops** top-level keys missing from the pinned schema — any new column appearing in a later batch was being truncated without a signal [^1]
* on mismatch, unify via ``pa.unify_schemas`` and rebuild the batch against the widened schema; reconcile prior chunks on yield via ``concat_tables(promote_options="permissive")``
* genuine type conflicts (e.g. ``int`` vs ``string`` for the same field) still raise with both schemas + inference origin shown, so operators can diagnose without extra instrumentation
* explicit caller-provided schemas are a contract — mismatches raise without silent widening

## Test plan

- [x] `test_write_parquet_file_widens_null_to_concrete_type` — null→string widening succeeds and lands the widened schema on disk
- [x] `test_write_parquet_file_captures_fields_appearing_in_later_batches` — new field survives to disk instead of being silently dropped
- [x] `test_write_parquet_file_raises_on_incompatible_type_conflict` — int vs string still surfaces as a clear error

[^1]: this silent-drop behavior was a latent data-loss bug; the new extra-keys detection catches it and routes through the same widen path.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
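For readers outside the zephyr codebase, a minimal sketch of the failure mode and the two PyArrow primitives the fix leans on. This assumes pyarrow >= 14.0, where ``concat_tables`` gained the ``promote_options`` keyword (older releases spelled this ``promote=True``); the column name ``x`` is illustrative:

```python
import pyarrow as pa

# Eight all-None records pin the column to null under plain inference,
# mirroring what the first micro-batch does to the stream's schema:
pinned = pa.Table.from_pylist([{"x": None}] * 8).schema
assert pinned.field("x").type == pa.null()

# A later record with a real value no longer fits the pinned schema:
# pa.Table.from_pylist([{"x": "hello"}], schema=pinned)  # raises ArrowInvalid

# unify_schemas widens null -> string (null promotes under the defaults):
widened = pa.unify_schemas([pinned, pa.schema([("x", pa.string())])])
assert widened.field("x").type == pa.string()

# Permissive concatenation reconciles chunks built under either schema:
merged = pa.concat_tables(
    [
        pa.Table.from_pylist([{"x": None}] * 8, schema=pinned),
        pa.Table.from_pylist([{"x": "hello"}], schema=widened),
    ],
    promote_options="permissive",
)
assert merged.column("x").to_pylist() == [None] * 8 + ["hello"]
```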
1 parent 91972ca commit 0497c64

2 files changed: 110 additions & 27 deletions

lib/zephyr/src/zephyr/writers.py

Lines changed: 69 additions & 17 deletions
```diff
@@ -171,42 +171,94 @@ def _accumulate_tables(
     Converts records to PyArrow in micro-batches of ``_MICRO_BATCH_SIZE``,
     tracks byte size incrementally, and yields a single ``concat_tables``
     result each time the threshold is reached.
+
+    When the caller did not pass an explicit schema, the schema is inferred
+    from the first micro-batch. If a later micro-batch doesn't fit that
+    schema — e.g. early rows pinned a column as ``null`` and a later row
+    supplies a concrete value, or a new top-level column appears — the
+    schemas are unified via :func:`pa.unify_schemas` and the batch is
+    rebuilt against the widened schema. On yield, prior chunks whose
+    schemas differ are reconciled via ``concat_tables(promote_options=
+    "permissive")``. Genuinely incompatible schemas (e.g. ``int`` vs
+    ``string`` for the same field) still raise, with both schemas shown.
+
+    An explicit caller-provided schema is treated as a contract: mismatches
+    raise without silent widening.
     """
     chunks: list[pa.Table] = []
     bytesize = 0
     convert: Callable | None = None
     schema_inferred = schema is None

+    def _raise_schema_mismatch(e: Exception, dicts: list[dict[str, Any]]) -> None:
+        actual_schema = pa.Table.from_pylist(dicts).schema
+        origin = (
+            f"inferred from first {_MICRO_BATCH_SIZE} records (no explicit schema passed)"
+            if schema_inferred
+            else "explicitly provided by caller"
+        )
+        raise pa.ArrowInvalid(
+            f"Schema mismatch converting batch to Arrow: {e}\n"
+            f"Expected schema ({origin}):\n{schema}\n"
+            f"Got schema:\n{actual_schema}"
+        ) from e
+
+    def _build_table(dicts: list[dict[str, Any]], schema: pa.Schema) -> tuple[pa.Table, pa.Schema]:
+        """Convert *dicts* to a table under *schema*, widening via ``pa.unify_schemas`` when needed.
+
+        Returns ``(table, schema)`` where ``schema`` may be wider than the
+        input. Handles two kinds of divergence: (1) ``from_pylist`` raises
+        because a field's type doesn't fit, (2) ``from_pylist`` would
+        silently drop extra top-level keys (new fields appearing only in
+        later batches). Raises (via :func:`_raise_schema_mismatch`) when
+        *schema* was explicitly provided by the caller, or when the
+        divergence isn't representable as a widening (e.g. ``int`` vs
+        ``string``).
+        """
+        mismatch_error: Exception | None = None
+        try:
+            table = pa.Table.from_pylist(dicts, schema=schema)
+        except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError) as e:
+            mismatch_error = e
+
+        if mismatch_error is None:
+            extra_keys = {k for d in dicts for k in d.keys()} - set(schema.names)
+            if not extra_keys:
+                return table, schema
+            mismatch_error = pa.ArrowInvalid(f"extra top-level keys not in schema: {sorted(extra_keys)}")
+
+        if not schema_inferred:
+            _raise_schema_mismatch(mismatch_error, dicts)
+        new_schema = pa.Table.from_pylist(dicts).schema
+        try:
+            widened = pa.unify_schemas([schema, new_schema])
+        except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError):
+            _raise_schema_mismatch(mismatch_error, dicts)
+        return pa.Table.from_pylist(dicts, schema=widened), widened
+
     for micro_batch in batchify(records, n=_MICRO_BATCH_SIZE):
         if convert is None:
             convert = asdict if is_dataclass(micro_batch[0]) else (lambda x: x)
         dicts = [convert(r) for r in micro_batch]
         if schema is None:
-            # NOTE: the _MICRO_BATCH_SIZE is fairly small, here we hope it's enough to infer "real" schema
+            # NOTE: _MICRO_BATCH_SIZE is small; if the initial schema turns
+            # out to be narrower than the stream's true schema, we widen
+            # below on the first mismatching batch.
             schema = infer_arrow_schema(dicts)
-        try:
-            table = pa.Table.from_pylist(dicts, schema=schema)
-        except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError) as e:
-            actual_schema = pa.Table.from_pylist(dicts).schema
-            origin = (
-                f"inferred from first {_MICRO_BATCH_SIZE} records (no explicit schema passed)"
-                if schema_inferred
-                else "explicitly provided by caller"
-            )
-            raise pa.ArrowInvalid(
-                f"Schema mismatch converting batch to Arrow: {e}\n"
-                f"Expected schema ({origin}):\n{schema}\n"
-                f"Got schema:\n{actual_schema}"
-            ) from e
+
+        table, schema = _build_table(dicts, schema)
         chunks.append(table)
         bytesize += table.nbytes
         if bytesize >= target_bytes:
-            yield pa.concat_tables(chunks)
+            # ``promote_options="permissive"`` reconciles chunks whose schemas
+            # widened mid-stream (e.g. a later chunk introduced a new column
+            # or widened ``null`` → concrete type).
+            yield pa.concat_tables(chunks, promote_options="permissive")
             chunks = []
             bytesize = 0

     if chunks:
-        yield pa.concat_tables(chunks)
+        yield pa.concat_tables(chunks, promote_options="permissive")


 def write_parquet_file(
```
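One subtlety worth calling out from the hunk above: the extra-keys branch in ``_build_table`` exists because ``pa.Table.from_pylist`` with an explicit ``schema=`` does not reject unknown keys, it discards them. A minimal sketch of that behavior (plain PyArrow; the names ``schema`` and ``dicts`` here are illustrative, not taken from the module):

```python
import pyarrow as pa

# from_pylist with an explicit schema silently drops keys the schema lacks:
schema = pa.schema([("x", pa.string())])
table = pa.Table.from_pylist([{"x": "b", "z": 42}], schema=schema)
assert table.schema.names == ["x"]  # "z" vanished, and nothing was raised

# ...which is why the divergence has to be detected by hand, as the new
# _build_table does before accepting a conversion:
dicts = [{"x": "b", "z": 42}]
extra_keys = {k for d in dicts for k in d} - set(schema.names)
assert extra_keys == {"z"}  # routes into the unify/widen path instead
```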

lib/zephyr/tests/test_writers.py

Lines changed: 41 additions & 10 deletions
```diff
@@ -151,21 +151,52 @@ def test_write_parquet_file_basic():
     assert len(table) == 2


-def test_write_parquet_file_schema_mismatch_surfaces_both_schemas():
-    """On schema divergence, the raised error includes expected + actual schemas and inference origin."""
-    # First micro-batch (_MICRO_BATCH_SIZE=8) has `x` all None → inferred as pa.null().
-    # A later record with a real value for `x` then fails to fit that schema.
+def test_write_parquet_file_widens_null_to_concrete_type():
+    """First batch pins a field as null; a later batch with a concrete type widens cleanly.
+
+    This is the stackv2 failure mode: the first ``_MICRO_BATCH_SIZE`` (=8)
+    records all had ``None`` for a field, pinning it to ``pa.null()`` —
+    later records with real values would fail without schema widening.
+    Behavior must: (a) succeed, (b) land the widened schema on disk, (c)
+    preserve all values from both batches.
+    """
     records = [{"x": None}] * 8 + [{"x": "hello"}]
     with tempfile.TemporaryDirectory() as tmpdir:
         output_path = str(Path(tmpdir) / "test.parquet")
-        with pytest.raises(pa.ArrowInvalid) as excinfo:
+        result = write_parquet_file(records, output_path)
+        assert result["count"] == 9
+
+        table = pq.read_table(output_path)
+        assert len(table) == 9
+        assert pa.types.is_string(table.schema.field("x").type)
+        xs = table.column("x").to_pylist()
+        assert xs[:8] == [None] * 8
+        assert xs[8] == "hello"
+
+
+def test_write_parquet_file_captures_fields_appearing_in_later_batches():
+    """A field absent from the first batch but present later must not be silently dropped."""
+    records = [{"x": "a"}] * 8 + [{"x": "b", "z": 42}]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = str(Path(tmpdir) / "test.parquet")
+        result = write_parquet_file(records, output_path)
+        assert result["count"] == 9
+
+        table = pq.read_table(output_path)
+        assert "z" in table.schema.names, "field `z` must survive to disk, not be dropped"
+        assert table.column("z").to_pylist() == [None] * 8 + [42]
+
+
+def test_write_parquet_file_raises_on_incompatible_type_conflict():
+    """Genuine type conflicts (e.g. int vs string) must still raise a clear error."""
+    records = [{"x": i} for i in range(8)] + [{"x": "stringy"}]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = str(Path(tmpdir) / "test.parquet")
+        with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)) as excinfo:
             write_parquet_file(records, output_path)
         msg = str(excinfo.value)
-        assert "Expected schema" in msg
-        assert "Got schema" in msg
-        assert "inferred from first" in msg
-        assert "x: null" in msg
-        assert "x: string" in msg
+        assert "int" in msg.lower() or "int64" in msg.lower()
+        assert "string" in msg.lower()


 def test_write_parquet_file_empty():
```
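Why the conflict test now asserts only loosely on the message: the decision between widening and raising is made by ``pa.unify_schemas``, and the surfaced text embeds PyArrow's own schema rendering rather than the fixed strings the old test matched. A minimal sketch of the unify gate refusing a genuine conflict (the exact exception class and wording are PyArrow's, hence the broad catch):

```python
import pyarrow as pa

# int64 vs string is not a widening, so unification refuses it — this is
# what sends _build_table into _raise_schema_mismatch for the conflict test:
try:
    pa.unify_schemas(
        [pa.schema([("x", pa.int64())]), pa.schema([("x", pa.string())])]
    )
except (pa.ArrowInvalid, pa.ArrowTypeError) as e:
    print(e)  # mentions the field and both types, e.g. int64 vs string
```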
