Skip to content

Commit fb720c4

Browse files
authored
Use string type for null-columns in CSV (#1147)
1 parent 3dbf02a commit fb720c4

File tree

2 files changed

+6
-3
lines changed

2 files changed

+6
-3
lines changed

src/datachain/lib/arrow.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -241,6 +241,8 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
241241
return dict
242242
if isinstance(col_type, pa.lib.DictionaryType):
243243
return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
244+
if pa.types.is_null(col_type):
245+
return str # use strings for null columns
244246
raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
245247

246248

tests/unit/lib/test_arrow.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -154,18 +154,19 @@ def test_arrow_generator_partitioned(tmp_path, catalog, cache):
154154
(pa.map_(pa.string(), pa.int32()), dict),
155155
(pa.dictionary(pa.int64(), pa.string()), str),
156156
(pa.list_(pa.string()), list[str]),
157+
(pa.null(), str),
157158
),
158159
)
159160
def test_arrow_type_mapper(col_type, expected):
160161
assert arrow_type_mapper(col_type) == expected
161162

162163

163164
def test_arrow_type_mapper_struct():
164-
col_type = pa.struct({"x": pa.int32(), "y": pa.string()})
165+
col_type = pa.struct({"x": pa.int32(), "y": pa.string(), "z": pa.null()})
165166
fields = arrow_type_mapper(col_type).model_fields
166-
assert list(fields.keys()) == ["x", "y"]
167+
assert list(fields.keys()) == ["x", "y", "z"]
167168
dtypes = [field.annotation for field in fields.values()]
168-
assert dtypes == [Optional[int], Optional[str]]
169+
assert dtypes == [Optional[int], Optional[str], Optional[str]]
169170

170171

171172
def test_arrow_type_error():

0 commit comments

Comments
 (0)