Fix regression when writing combination of np.NaN and None (#2212)

vasil-pashov · web-flow · commit b3b0273556be · 2025-03-11T14:51:49.000+02:00
Early versions of ArcticDB allowed writing a column containing only None and NaN values. We assumed that both of them are placeholders for an empty string. At some point, writing a column that mixes only np.nan and None stopped working on the write side. #### Reference Issues/PRs  #### What does this implement or fix? #### Any other comments? #### Checklist <details> <summary> Checklist for code changes... </summary> - [ ] Have you updated the relevant docstrings, documentation and copyright notice? - [ ] Is this contribution tested against [all ArcticDB's features](../docs/mkdocs/docs/technical/contributing.md)? - [ ] Do all exceptions introduced raise appropriate [error messages](https://docs.arcticdb.io/error_messages/)? - [ ] Are API changes highlighted in the PR description? - [ ] Is the PR labelled as enhancement or bug so it appears in autogenerated release notes? </details>
diff --git a/cpp/arcticdb/pipeline/write_frame.cpp b/cpp/arcticdb/pipeline/write_frame.cpp
@@ -95,8 +95,14 @@ std::tuple<stream::StreamSink::PartialKey, SegmentInMemory, FrameSlice> WriteToS
             auto& tensor = frame_->field_tensors[slice_.absolute_field_col(col)];
             auto opt_error = aggregator_set_data(
                 fd.type(),
-                tensor, agg, abs_col, rows_to_write, offset_in_frame, slice_num_for_column_,
-                regular_slice_size, sparsify_floats_);
+                tensor,
+                agg,
+                abs_col,
+                rows_to_write,
+                offset_in_frame,
+                slice_num_for_column_,
+                regular_slice_size,
+                sparsify_floats_);
             if (opt_error.has_value()) {
                 opt_error->raise(fd.name(), offset_in_frame);
             }
diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp
@@ -145,14 +145,10 @@ NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types) {
         // wide type always is 64bits
         val_bytes = 8;
 
-        // If Numpy has type 'O' then get_value_type above will return type 'BYTES'
-        // If there is no value, and we can't deduce a type then leave it that way,
-        // otherwise try to work out whether it was a bytes (string) type or unicode
         if (!is_fixed_string_type(val_type) && element_count > 0) {
             auto none = py::none{};
             auto obj = reinterpret_cast<PyObject **>(arr->data);
-            bool empty = false;
-            bool all_nans = false;
+            bool empty_string_placeholder = false;
             PyObject *sample = *obj;
             PyObject** current_object = obj;
             // Arctic allows both None and NaN to represent a string with no value. We have 3 options:
@@ -163,31 +159,28 @@ NativeTensor obj_to_tensor(PyObject *ptr, bool empty_types) {
             // * In case there is at least one actual string we can sample it and decide the type of the column segment
             //      based on it
             // Note: ValueType::ASCII_DYNAMIC was used when Python 2 was supported. It is no longer supported, and
-            //  we're not expected to enter that branch.
+            // we're not expected to enter that branch.
             if (sample == none.ptr() || is_py_nan(sample)) {
-                empty = true;
-                all_nans = true;
+                empty_string_placeholder = true;
                 util::check(c_style, "Non contiguous columns with first element as None not supported yet.");
                 const auto* end = obj + size;
                 while(current_object < end) {
-
-                    if(*current_object == none.ptr()) {
-                        all_nans = false;
-                    } else if(is_py_nan(*current_object)) {
-                        empty = false;
-                    } else {
-                        all_nans = false;
-                        empty = false;
+                    if(!(is_py_nan(*current_object) || *current_object == none.ptr())) {
+                        empty_string_placeholder = false;
                         break;
                     }
                     ++current_object;
                 }
                 if(current_object != end)
                     sample = *current_object;
             }
-            if (empty && kind == 'O') {
+            // A column full of NaN values is interpreted differently based on its kind. If the kind is object ("O"),
+            // the column is assigned a string type; if the kind is float ("f"), it is assigned a float type. This
+            // preserves the legacy behavior of ArcticDB, which allows both NaN and None to be used as placeholders
+            // for missing string values.
+            if (empty_string_placeholder && kind == 'O') {
                 val_type = empty_types ? ValueType::EMPTY : ValueType::UTF_DYNAMIC;
-            } else if(all_nans || is_unicode(sample)){
+            } else if(is_unicode(sample)) {
                 val_type = ValueType::UTF_DYNAMIC;
             } else if (PYBIND11_BYTES_CHECK(sample)) {
                 val_type = ValueType::ASCII_DYNAMIC;
diff --git a/python/tests/hypothesis/arcticdb/test_sort_merge.py b/python/tests/hypothesis/arcticdb/test_sort_merge.py
@@ -127,6 +127,7 @@ def get_append_keys(lib, sym):
 @given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS))
 def test_sort_merge_static_schema_write(lmdb_library, df_list):
     lib = lmdb_library
+    lib._nvs.version_store.clear()
     sym = "test_sort_merge_static_schema_write"
     for df in df_list:
         lib.write(sym, df, staged=True, validate_index=False)
@@ -150,6 +151,7 @@ def test_sort_merge_static_schema_write(lmdb_library, df_list):
 @given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS), initial_df=generate_single_dataframe(COLUMN_DESCRIPTIONS, min_size=1, allow_nat_in_index=False))
 def test_sort_merge_static_schema_append(lmdb_library, df_list, initial_df):
     lib = lmdb_library
+    lib._nvs.version_store.clear()
     sym = "test_sort_merge_static_schema_append"
     initial_df.sort_index(inplace=True)
     lib.write(sym, initial_df)
@@ -179,6 +181,7 @@ def test_sort_merge_static_schema_append(lmdb_library, df_list, initial_df):
 @given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS))
 def test_sort_merge_dynamic_schema_write(lmdb_library_dynamic_schema, df_list):
     lib = lmdb_library_dynamic_schema
+    lib._nvs.version_store.clear()
     sym = "test_sort_merge_dynamic_schema_write"
     for df in df_list:
         lib.write(sym, df, staged=True, validate_index=False)
@@ -203,6 +206,7 @@ def test_sort_merge_dynamic_schema_write(lmdb_library_dynamic_schema, df_list):
 @given(df_list=generate_dataframes(COLUMN_DESCRIPTIONS), initial_df=generate_single_dataframe(COLUMN_DESCRIPTIONS, min_size=1, allow_nat_in_index=False))
 def test_sort_merge_dynamic_schema_append(lmdb_library_dynamic_schema, df_list, initial_df):    
     lib = lmdb_library_dynamic_schema
+    lib._nvs.version_store.clear()
     sym = "test_sort_merge_dynamic_schema_append"
     initial_df.sort_index(inplace=True)
     lib.write(sym, initial_df)
diff --git a/python/tests/unit/arcticdb/version_store/test_write.py b/python/tests/unit/arcticdb/version_store/test_write.py
@@ -12,6 +12,7 @@
 from arcticdb.util._versions import IS_PANDAS_TWO
 from arcticdb.util.test import assert_frame_equal
 from pandas import MultiIndex
+from arcticdb.util.test import assert_frame_equal
 
 
 def test_write_numpy_array(lmdb_version_store):
@@ -133,6 +134,30 @@ def test_write_non_timestamp_index(lmdb_version_store, index_type, sorted, valid
     info = lib.get_info(symbol)
     assert info["sorted"] == "UNKNOWN"
 
+class TestMissingStringPlaceholders:  # Write/read round trips for columns made of None and/or NaN.
+    @pytest.mark.parametrize("dtype", [None, object, np.float32, np.double])
+    def test_write_with_nan_none(self, lmdb_version_store, dtype):  # Column mixes None and NaN only.
+        lib = lmdb_version_store
+        sym = "nan"  # All three tests in this class write to the same symbol name.
+        lib.write(sym, pd.DataFrame({"a": [None, np.nan]}, dtype=dtype))
+        data = lib.read(sym).data
+        assert_frame_equal(data, pd.DataFrame({"a": [None, np.nan]}, dtype=dtype))  # Read must equal what was written.
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_write_with_nan_none_and_a_string(self, lmdb_version_store, dtype):  # None and NaN precede a real string.
+        lib = lmdb_version_store
+        sym = "nan"
+        lib.write(sym, pd.DataFrame({"a": [None, np.nan, "string"]}, dtype=dtype))
+        data = lib.read(sym).data
+        assert_frame_equal(data, pd.DataFrame({"a": [None, np.nan, "string"]}, dtype=dtype))  # Read must equal what was written.
+
+    @pytest.mark.parametrize("dtype", [None, object, np.double, np.float32])
+    def test_write_only_nan_column(self, lmdb_version_store, dtype):  # Single-row column holding only NaN.
+        lib = lmdb_version_store
+        sym = "nan"
+        lib.write(sym, pd.DataFrame({"a": [np.nan]}, dtype=dtype))
+        data = lib.read(sym).data
+        assert_frame_equal(data, pd.DataFrame({"a": [np.nan]}, dtype=dtype))  # Read must equal what was written.
 
 def test_write_unicode(lmdb_version_store):
     symbol = "test_write_unicode"