Skip to content

Commit b3b0273

Browse files
authored
Fix regression when writing combination of np.NaN and None (#2212)
Early version of ArcticDB allowed for writing a column containing only none and nan values. We assumed both of those are placeholders for an empty string. At some point mixing only np.nan and None in a column stopped working on write side. #### Reference Issues/PRs <!--Example: Fixes #1234. See also #3456.--> #### What does this implement or fix? #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details> <!-- Thanks for contributing a Pull Request to ArcticDB! Please ensure you have taken a look at: - ArcticDB's Code of Conduct: https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md - ArcticDB's Contribution Licensing: https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing -->
1 parent f03d192 commit b3b0273

File tree

4 files changed

+48
-20
lines changed

4 files changed

+48
-20
lines changed

cpp/arcticdb/pipeline/write_frame.cpp

+8-2
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,14 @@ std::tuple<stream::StreamSink::PartialKey, SegmentInMemory, FrameSlice> WriteToS
9595
auto& tensor = frame_->field_tensors[slice_.absolute_field_col(col)];
9696
auto opt_error = aggregator_set_data(
9797
fd.type(),
98-
tensor, agg, abs_col, rows_to_write, offset_in_frame, slice_num_for_column_,
99-
regular_slice_size, sparsify_floats_);
98+
tensor,
99+
agg,
100+
abs_col,
101+
rows_to_write,
102+
offset_in_frame,
103+
slice_num_for_column_,
104+
regular_slice_size,
105+
sparsify_floats_);
100106
if (opt_error.has_value()) {
101107
opt_error->raise(fd.name(), offset_in_frame);
102108
}

cpp/arcticdb/python/python_to_tensor_frame.cpp

+11-18
Original file line numberDiff line numberDiff line change
@@ -145,14 +145,10 @@ NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types) {
145145
// wide type always is 64bits
146146
val_bytes = 8;
147147

148-
// If Numpy has type 'O' then get_value_type above will return type 'BYTES'
149-
// If there is no value, and we can't deduce a type then leave it that way,
150-
// otherwise try to work out whether it was a bytes (string) type or unicode
151148
if (!is_fixed_string_type(val_type) && element_count > 0) {
152149
auto none = py::none{};
153150
auto obj = reinterpret_cast<PyObject **>(arr->data);
154-
bool empty = false;
155-
bool all_nans = false;
151+
bool empty_string_placeholder = false;
156152
PyObject *sample = *obj;
157153
PyObject** current_object = obj;
158154
// Arctic allows both None and NaN to represent a string with no value. We have 3 options:
@@ -163,31 +159,28 @@ NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types) {
163159
// * In case there is at least one actual string we can sample it and decide the type of the column segment
164160
// based on it
165161
// Note: ValueType::ASCII_DYNAMIC was used when Python 2 was supported. It is no longer supported, and
166-
// we're not expected to enter that branch.
162+
// we're not expected to enter that branch.
167163
if (sample == none.ptr() || is_py_nan(sample)) {
168-
empty = true;
169-
all_nans = true;
164+
empty_string_placeholder = true;
170165
util::check(c_style, "Non contiguous columns with first element as None not supported yet.");
171166
const auto* end = obj + size;
172167
while(current_object < end) {
173-
174-
if(*current_object == none.ptr()) {
175-
all_nans = false;
176-
} else if(is_py_nan(*current_object)) {
177-
empty = false;
178-
} else {
179-
all_nans = false;
180-
empty = false;
168+
if(!(is_py_nan(*current_object) || *current_object == none.ptr())) {
169+
empty_string_placeholder = false;
181170
break;
182171
}
183172
++current_object;
184173
}
185174
if(current_object != end)
186175
sample = *current_object;
187176
}
188-
if (empty && kind == 'O') {
177+
// Column full of NaN values is interpreted differently based on the kind. If kind is object "O" the column
178+
// is assigned a string type if kind is float "f" the column is assigned a float type. This is done in
179+
// order to preserve a legacy behavior of ArcticDB allowing to use both NaN and None as a placeholder for
180+
// missing string values.
181+
if (empty_string_placeholder && kind == 'O') {
189182
val_type = empty_types ? ValueType::EMPTY : ValueType::UTF_DYNAMIC;
190-
} else if(all_nans || is_unicode(sample)){
183+
} else if(is_unicode(sample)) {
191184
val_type = ValueType::UTF_DYNAMIC;
192185
} else if (PYBIND11_BYTES_CHECK(sample)) {
193186
val_type = ValueType::ASCII_DYNAMIC;

python/tests/hypothesis/arcticdb/test_sort_merge.py

+4
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def get_append_keys(lib, sym):
127127
@given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS))
128128
def test_sort_merge_static_schema_write(lmdb_library, df_list):
129129
lib = lmdb_library
130+
lib._nvs.version_store.clear()
130131
sym = "test_sort_merge_static_schema_write"
131132
for df in df_list:
132133
lib.write(sym, df, staged=True, validate_index=False)
@@ -150,6 +151,7 @@ def test_sort_merge_static_schema_write(lmdb_library, df_list):
150151
@given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS), initial_df=generate_single_dataframe(COLUMN_DESCRIPTIONS, min_size=1, allow_nat_in_index=False))
151152
def test_sort_merge_static_schema_append(lmdb_library, df_list, initial_df):
152153
lib = lmdb_library
154+
lib._nvs.version_store.clear()
153155
sym = "test_sort_merge_static_schema_append"
154156
initial_df.sort_index(inplace=True)
155157
lib.write(sym, initial_df)
@@ -179,6 +181,7 @@ def test_sort_merge_static_schema_append(lmdb_library, df_list, initial_df):
179181
@given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS))
180182
def test_sort_merge_dynamic_schema_write(lmdb_library_dynamic_schema, df_list):
181183
lib = lmdb_library_dynamic_schema
184+
lib._nvs.version_store.clear()
182185
sym = "test_sort_merge_dynamic_schema_write"
183186
for df in df_list:
184187
lib.write(sym, df, staged=True, validate_index=False)
@@ -203,6 +206,7 @@ def test_sort_merge_dynamic_schema_write(lmdb_library_dynamic_schema, df_list):
203206
@given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS), initial_df=generate_single_dataframe(COLUMN_DESCRIPTIONS, min_size=1, allow_nat_in_index=False))
204207
def test_sort_merge_dynamic_schema_append(lmdb_library_dynamic_schema, df_list, initial_df):
205208
lib = lmdb_library_dynamic_schema
209+
lib._nvs.version_store.clear()
206210
sym = "test_sort_merge_dynamic_schema_append"
207211
initial_df.sort_index(inplace=True)
208212
lib.write(sym, initial_df)

python/tests/unit/arcticdb/version_store/test_write.py

+25
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from arcticdb.util._versions import IS_PANDAS_TWO
1313
from arcticdb.util.test import assert_frame_equal
1414
from pandas import MultiIndex
15+
from arcticdb.util.test import assert_frame_equal
1516

1617

1718
def test_write_numpy_array(lmdb_version_store):
@@ -133,6 +134,30 @@ def test_write_non_timestamp_index(lmdb_version_store, index_type, sorted, valid
133134
info = lib.get_info(symbol)
134135
assert info["sorted"] == "UNKNOWN"
135136

137+
class TestMissingStringPlaceholders:
138+
@pytest.mark.parametrize("dtype", [None, object, np.float32, np.double])
139+
def test_write_with_nan_none(self, lmdb_version_store, dtype):
140+
lib = lmdb_version_store
141+
sym = "nan"
142+
lib.write(sym, pd.DataFrame({"a": [None, np.nan]}, dtype=dtype))
143+
data = lib.read(sym).data
144+
assert_frame_equal(data, pd.DataFrame({"a": [None, np.nan]}, dtype=dtype))
145+
146+
@pytest.mark.parametrize("dtype", [None, object])
147+
def test_write_with_nan_none_and_a_string(self, lmdb_version_store, dtype):
148+
lib = lmdb_version_store
149+
sym = "nan"
150+
lib.write(sym, pd.DataFrame({"a": [None, np.nan, "string"]}, dtype=dtype))
151+
data = lib.read(sym).data
152+
assert_frame_equal(data, pd.DataFrame({"a": [None, np.nan, "string"]}, dtype=dtype))
153+
154+
@pytest.mark.parametrize("dtype", [None, object, np.double, np.float32])
155+
def test_write_only_nan_column(self, lmdb_version_store, dtype):
156+
lib = lmdb_version_store
157+
sym = "nan"
158+
lib.write(sym, pd.DataFrame({"a": [np.nan]}, dtype=dtype))
159+
data = lib.read(sym).data
160+
assert_frame_equal(data, pd.DataFrame({"a": [np.nan]}, dtype=dtype))
136161

137162
def test_write_unicode(lmdb_version_store):
138163
symbol = "test_write_unicode"

0 commit comments

Comments
 (0)