Skip to content

Commit 747939c

Browse files
committed
fix: csv reader resolve_type handle empty values
1 parent cdeccb3 commit 747939c

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

lavender_data/shard/readers/csv.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
import numpy as np
22
import csv
33
import sys
44
import ast
@@ -17,14 +17,28 @@ class CsvReader(UntypedReader):
1717

1818
def resolve_type(self, value: Any, typestr: str) -> Any:
    """Convert a raw CSV cell into the Python value named by ``typestr``.

    Args:
        value: The raw cell as read from the CSV file. An empty string
            is treated as a missing value for numeric and container
            types.
        typestr: Name of the target type. Recognised names are
            ``int``/``int32``/``int64``, ``float``/``double``,
            ``string``/``text``/``str``, ``bool``/``boolean``,
            ``list``, ``map`` and ``binary``.

    Returns:
        The converted value. Empty cells become ``np.nan`` for integer
        and float columns (so an "int" column can yield a float NaN),
        ``[]`` for ``list``, ``{}`` for ``map`` and ``b""`` for
        ``binary``. An unrecognised ``typestr`` returns ``value``
        unchanged.

    Raises:
        ValueError: If a non-empty numeric cell cannot be parsed by
            ``int``/``float``, or a container cell is not a valid
            Python literal.
        SyntaxError: If a container cell is malformed for
            ``ast.literal_eval``.
    """
    is_empty = value == ""

    if typestr in ("int", "int32", "int64"):
        # int has no missing-value representation, so NaN is returned.
        return np.nan if is_empty else int(value)
    if typestr in ("float", "double"):
        return np.nan if is_empty else float(value)
    if typestr in ("string", "text", "str"):
        return str(value)
    if typestr in ("bool", "boolean"):
        # An empty cell is not in the truthy set and therefore False.
        return value.lower() in ("true", "t", "yes", "y", "1")
    if typestr == "list":
        return [] if is_empty else ast.literal_eval(value)
    if typestr == "map":
        return {} if is_empty else ast.literal_eval(value)
    if typestr == "binary":
        return b"" if is_empty else ast.literal_eval(value)
    return value
3044

lavender_data/shard/statistics.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ def _is_text_column(values: list[Any]) -> bool:
4545

4646

4747
def _is_categorical_column(values: list[Any]) -> bool:
48+
# unhashable types are not categorical
49+
if not _is_numeric_column(values) and not _is_text_column(values):
50+
return False
51+
4852
unique_values = set(values)
4953

5054
if _is_numeric_column(values):

0 commit comments

Comments
 (0)