Commit 82bb2fa

ravwojdyla and claude committed
zephyr: widen inferred parquet schema via pa.unify_schemas
``_accumulate_tables`` infers its schema from the first micro-batch (``_MICRO_BATCH_SIZE=8``). If those first records happen to have ``None`` for a field — or to lack a field that appears later — downstream batches that would legitimately widen the schema either crashed with ``ArrowInvalid: Invalid null value`` or (in the new-field case) were silently truncated by ``pa.Table.from_pylist``.

Unify-widen the inferred schema on mismatch and reconcile chunks on yield via ``concat_tables(promote_options="permissive")``. Surface genuine incompatibilities (e.g. int vs string) as errors showing both schemas and the inference origin, so operators can diagnose without extra instrumentation. An explicit caller-provided schema is treated as a contract: mismatches raise without silent widening.

Tests cover: null→concrete widening, new-field-appears-later (previously silently dropped), and int-vs-string conflict surfacing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ef51f83 commit 82bb2fa
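
The failure mode and the fix can both be reproduced with public PyArrow APIs alone. Below is a minimal sketch, assuming a recent PyArrow with ``promote_options`` support; the batch of 8 mirrors ``_MICRO_BATCH_SIZE``, and the variable names are illustrative rather than taken from the commit:

```python
import pyarrow as pa

# First micro-batch: every record has None for "x", so schema inference
# pins the column to the null type.
first = [{"x": None}] * 8
inferred = pa.Table.from_pylist(first).schema  # x: null

# A later batch carries a concrete value; forcing it into the narrow
# inferred schema raises ArrowInvalid ("Invalid null value").
later = [{"x": "hello"}]
try:
    pa.Table.from_pylist(later, schema=inferred)
except pa.ArrowInvalid as e:
    print(f"narrow schema rejected the batch: {e}")

# The commit's approach: unify the inferred schema with the batch's own
# schema, rebuild the batch against the widened schema, and reconcile
# the earlier null-typed chunk permissively at concat time.
widened = pa.unify_schemas([inferred, pa.Table.from_pylist(later).schema])
chunks = [
    pa.Table.from_pylist(first, schema=inferred),
    pa.Table.from_pylist(later, schema=widened),
]
combined = pa.concat_tables(chunks, promote_options="permissive")
assert combined.schema.field("x").type == pa.string()
assert combined.column("x").to_pylist() == [None] * 8 + ["hello"]
```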

2 files changed

Lines changed: 119 additions & 6 deletions


lib/zephyr/src/zephyr/writers.py

Lines changed: 70 additions & 4 deletions
```diff
@@ -171,28 +171,94 @@ def _accumulate_tables(
     Converts records to PyArrow in micro-batches of ``_MICRO_BATCH_SIZE``,
     tracks byte size incrementally, and yields a single ``concat_tables``
     result each time the threshold is reached.
+
+    When the caller did not pass an explicit schema, the schema is inferred
+    from the first micro-batch. If a later micro-batch doesn't fit that
+    schema — e.g. early rows pinned a column as ``null`` and a later row
+    supplies a concrete value, or a new top-level column appears — the
+    schemas are unified via :func:`pa.unify_schemas` and the batch is
+    rebuilt against the widened schema. On yield, prior chunks whose
+    schemas differ are reconciled via ``concat_tables(promote_options=
+    "permissive")``. Genuinely incompatible schemas (e.g. ``int`` vs
+    ``string`` for the same field) still raise, with both schemas shown.
+
+    An explicit caller-provided schema is treated as a contract: mismatches
+    raise without silent widening.
     """
     chunks: list[pa.Table] = []
     bytesize = 0
     convert: Callable | None = None
+    schema_inferred = schema is None
+
+    def _raise_schema_mismatch(e: Exception, dicts: list[dict[str, Any]]) -> None:
+        actual_schema = pa.Table.from_pylist(dicts).schema
+        origin = (
+            f"inferred from first {_MICRO_BATCH_SIZE} records (no explicit schema passed)"
+            if schema_inferred
+            else "explicitly provided by caller"
+        )
+        raise pa.ArrowInvalid(
+            f"Schema mismatch converting batch to Arrow: {e}\n"
+            f"Expected schema ({origin}):\n{schema}\n"
+            f"Got schema:\n{actual_schema}"
+        ) from e
+
+    def _build_table(dicts: list[dict[str, Any]], schema: pa.Schema) -> tuple[pa.Table, pa.Schema]:
+        """Convert *dicts* to a table under *schema*, widening via ``pa.unify_schemas`` when needed.
+
+        Returns ``(table, schema)`` where ``schema`` may be wider than the
+        input. Handles two kinds of divergence: (1) ``from_pylist`` raises
+        because a field's type doesn't fit, (2) ``from_pylist`` would
+        silently drop extra top-level keys (new fields appearing only in
+        later batches). Raises (via :func:`_raise_schema_mismatch`) when
+        *schema* was explicitly provided by the caller, or when the
+        divergence isn't representable as a widening (e.g. ``int`` vs
+        ``string``).
+        """
+        mismatch_error: Exception | None = None
+        try:
+            table = pa.Table.from_pylist(dicts, schema=schema)
+        except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError) as e:
+            mismatch_error = e
+
+        if mismatch_error is None:
+            extra_keys = {k for d in dicts for k in d.keys()} - set(schema.names)
+            if not extra_keys:
+                return table, schema
+            mismatch_error = pa.ArrowInvalid(f"extra top-level keys not in schema: {sorted(extra_keys)}")
+
+        if not schema_inferred:
+            _raise_schema_mismatch(mismatch_error, dicts)
+        new_schema = pa.Table.from_pylist(dicts).schema
+        try:
+            widened = pa.unify_schemas([schema, new_schema])
+        except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError):
+            _raise_schema_mismatch(mismatch_error, dicts)
+        return pa.Table.from_pylist(dicts, schema=widened), widened
 
     for micro_batch in batchify(records, n=_MICRO_BATCH_SIZE):
         if convert is None:
             convert = asdict if is_dataclass(micro_batch[0]) else (lambda x: x)
         dicts = [convert(r) for r in micro_batch]
         if schema is None:
-            # NOTE: the _MICRO_BATCH_SIZE is fairly small, here we hope it's enough to infer "real" schema
+            # NOTE: _MICRO_BATCH_SIZE is small; if the initial schema turns
+            # out to be narrower than the stream's true schema, we widen
+            # below on the first mismatching batch.
             schema = infer_arrow_schema(dicts)
-        table = pa.Table.from_pylist(dicts, schema=schema)
+
+        table, schema = _build_table(dicts, schema)
         chunks.append(table)
         bytesize += table.nbytes
         if bytesize >= target_bytes:
-            yield pa.concat_tables(chunks)
+            # ``promote_options="permissive"`` reconciles chunks whose schemas
+            # widened mid-stream (e.g. a later chunk introduced a new column
+            # or widened ``null`` → concrete type).
+            yield pa.concat_tables(chunks, promote_options="permissive")
             chunks = []
             bytesize = 0
 
     if chunks:
-        yield pa.concat_tables(chunks)
+        yield pa.concat_tables(chunks, promote_options="permissive")
 
 
 def write_parquet_file(
```
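
Why ``_build_table`` needs the explicit ``extra_keys`` check above: ``pa.Table.from_pylist`` with an explicit schema does not raise on unknown keys, it drops them, so the new-field case never surfaces as an exception. A small illustrative sketch (the names here are hypothetical, not from the commit):

```python
import pyarrow as pa

schema = pa.schema([("x", pa.string())])
rows = [{"x": "b", "z": 42}]  # "z" appears only in a later batch

# from_pylist silently ignores keys the schema doesn't declare.
table = pa.Table.from_pylist(rows, schema=schema)
assert table.schema.names == ["x"]  # "z" was dropped, no error raised

# The commit detects this by comparing the union of row keys against
# the schema's field names.
extra_keys = {k for row in rows for k in row} - set(schema.names)
assert extra_keys == {"z"}
```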

lib/zephyr/tests/test_writers.py

Lines changed: 49 additions & 2 deletions
```diff
@@ -7,12 +7,11 @@
 import tempfile
 from pathlib import Path
 
+import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
 import vortex
 
-import pyarrow as pa
-
 from zephyr.writers import (
     atomic_rename,
     infer_arrow_schema,
@@ -151,6 +150,54 @@ def test_write_parquet_file_basic():
     assert len(table) == 2
 
 
+def test_write_parquet_file_widens_null_to_concrete_type():
+    """First batch pins a field as null; a later batch with a concrete type widens cleanly.
+
+    This is the stackv2 failure mode: the first ``_MICRO_BATCH_SIZE`` (=8)
+    records all had ``None`` for a field, pinning it to ``pa.null()`` —
+    later records with real values would fail without schema widening.
+    Behavior must: (a) succeed, (b) land the widened schema on disk, (c)
+    preserve all values from both batches.
+    """
+    records = [{"x": None}] * 8 + [{"x": "hello"}]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = str(Path(tmpdir) / "test.parquet")
+        result = write_parquet_file(records, output_path)
+        assert result["count"] == 9
+
+        table = pq.read_table(output_path)
+        assert len(table) == 9
+        assert pa.types.is_string(table.schema.field("x").type)
+        xs = table.column("x").to_pylist()
+        assert xs[:8] == [None] * 8
+        assert xs[8] == "hello"
+
+
+def test_write_parquet_file_captures_fields_appearing_in_later_batches():
+    """A field absent from the first batch but present later must not be silently dropped."""
+    records = [{"x": "a"}] * 8 + [{"x": "b", "z": 42}]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = str(Path(tmpdir) / "test.parquet")
+        result = write_parquet_file(records, output_path)
+        assert result["count"] == 9
+
+        table = pq.read_table(output_path)
+        assert "z" in table.schema.names, "field `z` must survive to disk, not be dropped"
+        assert table.column("z").to_pylist() == [None] * 8 + [42]
+
+
+def test_write_parquet_file_raises_on_incompatible_type_conflict():
+    """Genuine type conflicts (e.g. int vs string) must still raise a clear error."""
+    records = [{"x": i} for i in range(8)] + [{"x": "stringy"}]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = str(Path(tmpdir) / "test.parquet")
+        with pytest.raises((pa.ArrowInvalid, pa.ArrowTypeError)) as excinfo:
+            write_parquet_file(records, output_path)
+        msg = str(excinfo.value)
+        assert "int" in msg.lower() or "int64" in msg.lower()
+        assert "string" in msg.lower()
+
+
 def test_write_parquet_file_empty():
     """Test writing an empty parquet file."""
     with tempfile.TemporaryDirectory() as tmpdir:
```
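
For the conflict test above, the raise path falls out of ``pa.unify_schemas`` itself: there is no widening that covers both ``int64`` and ``string``, so unification fails and ``_raise_schema_mismatch`` re-raises with both schemas attached. A quick illustrative check, catching the same exception tuple the writer catches rather than assuming PyArrow's exact exception type:

```python
import pyarrow as pa

int_schema = pa.schema([("x", pa.int64())])
str_schema = pa.schema([("x", pa.string())])

# No common supertype exists for int64 and string, so unification raises.
try:
    pa.unify_schemas([int_schema, str_schema])
except (pa.ArrowInvalid, pa.ArrowTypeError) as e:
    print(f"cannot unify int64 with string: {e}")
```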
