
Commit c65bf31

implement fix for primitive types in json shared data space
1 parent 70bd2d2 commit c65bf31

File tree

5 files changed: +391 −5 lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -25,6 +25,9 @@ The supported method of passing ClickHouse server settings is to prefix such arg
 
 ### Bug Fixes
 - Fix issue with DROP table in client temp table test.
+- Fixed `StreamFailureError` when reading JSON columns with shared data by correctly decoding the binary variant values. Closes [#599](https://github.com/ClickHouse/clickhouse-connect/issues/599)
+- Fixed JSON column reconstruction to properly handle nested paths in shared data (keys beyond `max_dynamic_paths`).
+- Corrected the internal definition of `SHARED_DATA_TYPE` for JSON columns.
 
 ### Improvements
 - Add support for QBit data type. Closes [#570](https://github.com/ClickHouse/clickhouse-connect/issues/570)
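For context, a minimal repro sketch of the scenario these entries describe (illustrative only, not part of the commit; the host, table name, and max_dynamic_paths value are assumptions, and it requires a server recent enough that the JSON type is enabled):

# Hypothetical repro for issue #599: with a low max_dynamic_paths, extra JSON
# keys overflow into the column's shared data, where values are stored as
# variant-encoded binary. Before this commit such reads failed (or dropped the
# overflow keys); after it, the values decode back into the result dicts.
import clickhouse_connect

client = clickhouse_connect.get_client(host='localhost')  # assumed local server
client.command('DROP TABLE IF EXISTS json_shared_demo')
client.command('CREATE TABLE json_shared_demo (obj JSON(max_dynamic_paths=1)) '
               'ENGINE = MergeTree ORDER BY tuple()')
# With only one dynamic path available, the remaining keys spill into shared data
client.command('INSERT INTO json_shared_demo VALUES '
               '(\'{"a": 1, "b": 2}\'), (\'{"a": 3, "c": true}\')')
print(client.query('SELECT obj FROM json_shared_demo').result_rows)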

clickhouse_connect/datatypes/dynamic.py

Lines changed: 72 additions & 4 deletions
@@ -1,10 +1,13 @@
 from collections import namedtuple
-from typing import List, Sequence, Collection, Any
+import logging
+from typing import List, Sequence, Collection, Any, Union
 from urllib.parse import unquote
 
 from clickhouse_connect.datatypes.base import ClickHouseType, TypeDef
 from clickhouse_connect.datatypes.registry import get_from_name
 from clickhouse_connect.driver.common import unescape_identifier, first_value, write_uint64
+from clickhouse_connect.driver.bytesource import ByteArraySource
+from clickhouse_connect.datatypes.string import String
 from clickhouse_connect.driver.ctypes import data_conv
 from clickhouse_connect.driver.errors import handle_error
 from clickhouse_connect.driver.exceptions import DataError
@@ -18,6 +21,8 @@
 _JSON_NULL = b'null'
 _JSON_NULL_STR = 'null'
 
+logger = logging.getLogger(__name__)
+
 json_serialization_format = 0x1
 
 VariantState = namedtuple('VariantState', 'discriminator_node element_states')
@@ -30,6 +35,11 @@ def _json_path_segments(path: str) -> List[str]:
     return segments
 
 
+class SharedDataString(String):
+    def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any):
+        return source.read_str_col(num_rows, None)
+
+
 class Variant(ClickHouseType):
     _slots = 'element_types'
     python_type = object
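Passing `None` as the encoding to `read_str_col` is the point of this override: shared-data values come back as raw `bytes`, leaving the leading discriminator byte intact for `decode_shared_data_value` below rather than decoding the payload as UTF-8 text.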
@@ -169,7 +179,59 @@ def write_str_values(ch_type: ClickHouseType, column: Sequence, dest: bytearray,
     handle_error(data_conv.write_str_col(col, False, encoding, dest), ctx)
 
 
-JSONState = namedtuple('JSONState', 'serialize_version dynamic_paths typed_states dynamic_states')
+JSONState = namedtuple("JSONState", "serialize_version dynamic_paths typed_states dynamic_states shared_state")
+
+# Standard discriminator to type mapping for shared data
+# From https://github.com/ClickHouse/ClickHouse/src/DataTypes/DataTypesBinaryEncoding.cpp
+STANDARD_DISCRIMINATOR_TYPES = {
+    0x00: "Nothing",
+    0x01: "UInt8",
+    0x02: "UInt16",
+    0x03: "UInt32",
+    0x04: "UInt64",
+    0x05: "UInt128",
+    0x06: "UInt256",
+    0x07: "Int8",
+    0x08: "Int16",
+    0x09: "Int32",
+    0x0A: "Int64",
+    0x0B: "Int128",
+    0x0C: "Int256",
+    0x0D: "Float32",
+    0x0E: "Float64",
+    0x15: "String",
+    0x2D: "Bool",
+}
+
+
+def decode_shared_data_value(binary_data: Union[bytes, str], ctx: QueryContext):
+    """Decode a variant-encoded value from JSON shared data using discriminator byte."""
+    if binary_data is None:
+        return None
+
+    if len(binary_data) < 1:
+        return binary_data
+
+    discriminator = binary_data[0]
+    if discriminator == 255:
+        return None
+
+    type_name = STANDARD_DISCRIMINATOR_TYPES.get(discriminator)
+    if type_name is None:
+        return binary_data
+
+    value_type = get_from_name(type_name)
+
+    try:
+        byte_source = ByteArraySource(binary_data[1:])
+        read_state = value_type.read_column_prefix(byte_source, ctx)
+        result = value_type.read_column_data(byte_source, 1, ctx, read_state)
+        return result[0] if result else None
+
+    # pylint: disable=broad-exception-caught
+    except Exception as e:
+        logger.debug("Shared data decode failed: %s", e)
+        return binary_data
 
 
 class JSON(ClickHouseType):
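To make the variant framing concrete, a small sketch of the inputs the new function handles (illustrative, not from the commit; ctx is assumed to be the QueryContext of an active read):

import struct

# Layout: one discriminator byte, then the value's native little-endian encoding
encoded_i64 = bytes([0x0A]) + struct.pack('<q', -7)  # 0x0A = Int64 -> -7
encoded_str = bytes([0x15, 0x05]) + b'hello'         # 0x15 = String, LEB128 length 5 -> 'hello'
encoded_null = bytes([0xFF])                         # discriminator 255 -> None
encoded_misc = bytes([0x30]) + b'\x01'               # unmapped discriminator -> returned unchanged

# e.g. decode_shared_data_value(encoded_i64, ctx) == -7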
@@ -247,7 +309,8 @@ def read_column_prefix(self, source: ByteSource, ctx: QueryContext) -> JSONState
         dynamic_paths = [source.read_leb128_str() for _ in range(dynamic_path_cnt)]
         typed_states = [typed.read_column_prefix(source, ctx) for typed in self.typed_types]
         dynamic_states = [read_dynamic_prefix(self, source, ctx) for _ in range(dynamic_path_cnt)]
-        return JSONState(serialize_version, dynamic_paths, typed_states, dynamic_states)
+        shared_state = SHARED_DATA_TYPE.read_column_prefix(source, ctx)
+        return JSONState(serialize_version, dynamic_paths, typed_states, dynamic_states, shared_state)
 
     # pylint: disable=too-many-locals
     def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, read_state: JSONState):
@@ -256,7 +319,7 @@ def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, read_state: JSONState):
         dynamic_columns = [
             read_variant_column(source, num_rows, ctx, dynamic_state.variant_types, dynamic_state.variant_states)
             for dynamic_state in read_state.dynamic_states]
-        SHARED_DATA_TYPE.read_column_data(source, num_rows, ctx, None)
+        shared_columns = SHARED_DATA_TYPE.read_column_data(source, num_rows, ctx, read_state.shared_state)
         col = []
         for row_num in range(num_rows):
             top = {}
@@ -284,6 +347,11 @@ def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, read_state: JSONState):
                     item[key] = child
                     item = child
                 item[chain[-1]] = value
+            if shared_columns and row_num < len(shared_columns):
+                shared_data = shared_columns[row_num]
+                if shared_data:
+                    decoded_shared = {key: decode_shared_data_value(value, ctx) for key, value in shared_data.items()}
+                    top.update(decoded_shared)
             col.append(top)
         if self.read_format(ctx) == 'string':
             return [any_to_json(v) for v in col]
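The removed line shows the old behavior: the shared-data column was read only to advance the stream and its return value was discarded, so keys beyond `max_dynamic_paths` never appeared in the reconstructed objects. Now each row's shared map is decoded value-by-value and merged into the top-level dict.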

clickhouse_connect/datatypes/postinit.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,8 @@
 from clickhouse_connect.datatypes import registry, dynamic, geometric
 
-dynamic.SHARED_DATA_TYPE = registry.get_from_name('Array(String, String)')
 dynamic.STRING_DATA_TYPE = registry.get_from_name('String')
+dynamic.SHARED_DATA_TYPE = registry.get_from_name('Map(String, String)')
+dynamic.SHARED_DATA_TYPE.value_type = dynamic.SharedDataString(dynamic.STRING_DATA_TYPE.type_def)
 
 point = 'Tuple(Float64, Float64)'
 ring = f'Array({point})'
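The corrected `SHARED_DATA_TYPE` treats shared data as a map from JSON path to variant-encoded value, replacing the previous `Array(String, String)` definition, and swapping the map's value type for `SharedDataString` keeps those values as raw bytes, discriminator included, on the way to `decode_shared_data_value`.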
clickhouse_connect/driver/bytesource.py

Lines changed: 160 additions & 0 deletions

@@ -0,0 +1,160 @@
+import struct
+
+from clickhouse_connect.driver.types import ByteSource
+
+
+class ByteArraySource(ByteSource):
+    """
+    ByteSource implementation for in-memory byte arrays.
+
+    This class wraps a byte array and provides the ByteSource interface,
+    allowing ClickHouse type decoders to read from in-memory data instead
+    of a network stream.
+
+    Used primarily for decoding variant-encoded values from JSON shared data
+    where each value is a complete serialized type instance.
+    """
+
+    def __init__(self, data: bytes, encoding: str = "utf-8"):
+        """
+        Initialize ByteArraySource with byte array data.
+
+        :param data: The byte array to read from
+        :param encoding: Character encoding for string operations (default: utf-8)
+        """
+        self.data = data
+        self.pos = 0
+        self.encoding = encoding
+
+    def read_byte(self) -> int:
+        """Read a single byte and advance position."""
+        if self.pos >= len(self.data):
+            raise EOFError("Attempted to read past end of byte array")
+        b = self.data[self.pos]
+        self.pos += 1
+        return b
+
+    def read_bytes(self, sz: int) -> bytes:
+        """Read specified number of bytes and advance position."""
+        if self.pos + sz > len(self.data):
+            raise EOFError(f"Attempted to read {sz} bytes, only {len(self.data) - self.pos} available")
+        result = self.data[self.pos : self.pos + sz]
+        self.pos += sz
+        return result
+
+    def read_leb128(self) -> int:
+        """Read a LEB128 (variable-length) encoded integer."""
+        sz = 0
+        shift = 0
+        while self.pos < len(self.data):
+            b = self.read_byte()
+            sz += (b & 0x7F) << shift
+            if (b & 0x80) == 0:
+                return sz
+            shift += 7
+        raise EOFError("Unexpected end while reading LEB128")
+
+    def read_leb128_str(self) -> str:
+        """Read a LEB128 length-prefixed string."""
+        sz = self.read_leb128()
+        return self.read_bytes(sz).decode(self.encoding)
+
+    def read_uint64(self) -> int:
+        """Read an unsigned 64-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(8), "little", signed=False)
+
+    def read_int64(self) -> int:
+        """Read a signed 64-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(8), "little", signed=True)
+
+    def read_uint32(self) -> int:
+        """Read an unsigned 32-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(4), "little", signed=False)
+
+    def read_int32(self) -> int:
+        """Read a signed 32-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(4), "little", signed=True)
+
+    def read_uint16(self) -> int:
+        """Read an unsigned 16-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(2), "little", signed=False)
+
+    def read_int16(self) -> int:
+        """Read a signed 16-bit integer (little-endian)."""
+        return int.from_bytes(self.read_bytes(2), "little", signed=True)
+
+    def read_float32(self) -> float:
+        """Read a 32-bit float (little-endian)."""
+        return struct.unpack("<f", self.read_bytes(4))[0]
+
+    def read_float64(self) -> float:
+        """Read a 64-bit float (double, little-endian)."""
+        return struct.unpack("<d", self.read_bytes(8))[0]
+
+    # pylint: disable=too-many-return-statements
+    def read_array(self, array_type: str, num_rows: int):  # type: ignore
+        """
+        Limited implementation of array reading for basic types.
+
+        Args:
+            array_type: Python struct format character
+                'B' = UInt8, 'H' = UInt16, 'I' = UInt32, 'Q' = UInt64
+                'b' = Int8, 'h' = Int16, 'i' = Int32, 'q' = Int64
+                'f' = Float32, 'd' = Float64
+            num_rows: Number of elements to read
+
+        Returns:
+            List of values
+        """
+        if array_type == "B":
+            return [self.read_byte() for _ in range(num_rows)]
+        elif array_type == "H":
+            return [self.read_uint16() for _ in range(num_rows)]
+        elif array_type == "I":
+            return [self.read_uint32() for _ in range(num_rows)]
+        elif array_type == "Q":
+            return [self.read_uint64() for _ in range(num_rows)]
+        elif array_type == "b":
+            return [int.from_bytes([self.read_byte()], "little", signed=True) for _ in range(num_rows)]
+        elif array_type == "h":
+            return [self.read_int16() for _ in range(num_rows)]
+        elif array_type == "i":
+            return [self.read_int32() for _ in range(num_rows)]
+        elif array_type == "q":
+            return [self.read_int64() for _ in range(num_rows)]
+        elif array_type == "f":
+            return [self.read_float32() for _ in range(num_rows)]
+        elif array_type == "d":
+            return [self.read_float64() for _ in range(num_rows)]
+        else:
+            raise NotImplementedError(f"Array type {array_type} not implemented for ByteArraySource")
+
+    # Minimal implementations for other ByteSource methods that aren't needed
+    # for single-value decoding but are required by the interface
+
+    def read_str_col(self, num_rows, encoding, nullable=False, null_obj=None):  # type: ignore
+        """
+        Read a column of strings.
+        For single-value decoding (num_rows=1), read one LEB128 length-prefixed string.
+        """
+        if num_rows != 1:
+            raise NotImplementedError("read_str_col only supports num_rows=1 for single-value decoding")
+
+        length = self.read_leb128()
+        string_bytes = self.read_bytes(length)
+
+        if encoding is None:
+            return [string_bytes]
+
+        return [string_bytes.decode(encoding)]
+
+    def read_bytes_col(self, sz, num_rows):
+        """Not used for single-value decoding."""
+        raise NotImplementedError("read_bytes_col not needed for single-value decoding")
+
+    def read_fixed_str_col(self, sz, num_rows, encoding):
+        """Not used for single-value decoding."""
+        raise NotImplementedError("read_fixed_str_col not needed for single-value decoding")
+
+    def close(self):
+        """No cleanup needed for byte arrays."""
