
Commit d966a25

vasil-pashov (Vasil Pashov) authored and Vasil Pashov committed
Xfail problematic test
1 parent 98bc005 commit d966a25

File tree

10 files changed: +106 -34 lines

cpp/CMakePresets.json

Lines changed: 2 additions & 1 deletion

@@ -64,7 +64,8 @@
       "environment": { "cmakepreset_expected_host_system": "Windows" },
       "cacheVariables": {
         "VCPKG_OVERLAY_TRIPLETS": "custom-triplets",
-        "VCPKG_TARGET_TRIPLET": "x64-windows-static-msvc"
+        "VCPKG_TARGET_TRIPLET": "x64-windows-static-msvc",
+        "ARCTICDB_MSVC_OMIT_RUNTIME_CHECKS": "ON"
       }
     },
     {

cpp/arcticdb/processing/clause.cpp

Lines changed: 1 addition & 1 deletion

@@ -113,7 +113,7 @@ void check_column_presence(OutputSchema& output_schema, const std::unordered_set
     schema::check<ErrorCode::E_COLUMN_DOESNT_EXIST>(first_missing == required_columns.end(),
         "{}Clause requires column '{}' to exist in input data",
         clause_name,
-        *first_missing
+        first_missing == required_columns.end() ? "" : *first_missing
     );
 }

cpp/arcticdb/storage/failure_simulation.hpp

Lines changed: 2 additions & 2 deletions

@@ -64,8 +64,8 @@ struct FailureAction {
         description_(std::move(description)), proxy_(std::move(proxy)) {}

     template<typename Func>
-    FailureAction(Description description, Func func):
-        FailureAction(std::move(description), FunctionWrapper{func}.asSharedProxy()) {}
+    FailureAction(Description description, Func&& func):
+        FailureAction(std::move(description), FunctionWrapper{std::forward<Func>(func)}.asSharedProxy()) {}

     inline void operator()(FailureType type) const {
         proxy_(type);

cpp/arcticdb/stream/row_builder.hpp

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ class RowBuilder {
     RowBuilder &operator=(RowBuilder &) = delete;

     template<class...Args>
-    void start_row(const Args...args) {
+    void start_row([[maybe_unused]] const Args...args) {
         reset();
         if constexpr(sizeof...(Args)> 0 && !std::is_same_v<Index, EmptyIndex>) {
             index().set([&](std::size_t pos, auto arg) {

python/arcticdb/util/test.py

Lines changed: 44 additions & 14 deletions

@@ -851,7 +851,7 @@ def drop_inf_and_nan(df: pd.DataFrame) -> pd.DataFrame:
 def drop_inf(df):
     return df[~df.isin([np.inf, -np.inf]).any(axis=1)]

-def assert_dfs_approximate(left: pd.DataFrame, right: pd.DataFrame):
+def assert_dfs_approximate(left: pd.DataFrame, right: pd.DataFrame, check_dtype=False):
     """
     Checks if integer columns are exactly the same. For float columns checks if they are approximately the same.
     We can't guarantee the same order of operations for the floats thus numerical errors might appear.
@@ -866,7 +866,7 @@ def assert_dfs_approximate(left: pd.DataFrame, right: pd.DataFrame):
     left_no_inf = drop_inf(left)
     right_no_inf = drop_inf(right)

-    check_equals_flags = {"check_dtype": False}
+    check_equals_flags = {"check_dtype": check_dtype}
     if PANDAS_VERSION >= Version("1.1"):
         check_equals_flags["check_freq"] = False
     if PANDAS_VERSION >= Version("1.2"):
@@ -881,16 +881,18 @@ def assert_dfs_approximate(left: pd.DataFrame, right: pd.DataFrame):
             pd.testing.assert_series_equal(left_no_inf[col], right_no_inf[col], **check_equals_flags)
     except:
         with pd.option_context(
-                'display.max_columns', None,
-                'display.max_rows', None,
-                'display.max_colwidth', None,
-                'display.width', 0
+            'display.max_columns', None,
+            'display.max_rows', None,
+            'display.max_colwidth', None,
+            'display.width', 0
         ):
             print("\nError in approximate dataframe comparison. DataFrames are different\n")
             print("Left:\n")
             print(left_no_inf)
+            print(left_no_inf.dtypes)
             print("Right:\n")
             print(right_no_inf)
+            print(right_no_inf.dtypes)
         raise
@@ -956,10 +958,11 @@ def generic_resample_test(
     received = received.reindex(columns=sorted(received.columns))

     has_float_column = any(pd.api.types.is_float_dtype(col_type) for col_type in list(expected.dtypes))
+    check_dtype = expected_types is not None
     if has_float_column:
-        assert_dfs_approximate(expected, received)
+        assert_dfs_approximate(expected, received, check_dtype=check_dtype)
     else:
-        assert_frame_equal(expected, received)
+        assert_frame_equal(expected, received, check_dtype=check_dtype)


def equals(x, y):
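
A minimal sketch of what the new opt-in flag does, using names from this file (assumes the updated arcticdb.util.test is importable). generic_resample_test above turns it on whenever expected_types is supplied:

import numpy as np
import pandas as pd
from arcticdb.util.test import assert_dfs_approximate

left = pd.DataFrame({"a": np.array([1.0, 2.0], dtype=np.float32)})
right = pd.DataFrame({"a": np.array([1.0, 2.0], dtype=np.float64)})

# Default keeps the old lenient behaviour: values match, dtypes are ignored.
assert_dfs_approximate(left, right)
# Opting in makes the comparison strict: float32 != float64 now raises.
assert_dfs_approximate(left, right, check_dtype=True)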
@@ -1008,6 +1011,15 @@ def largest_numeric_type(dtype):
         return np.uint64
     return dtype

+def common_float_int_type(float_dtype, int_dtype):
+    # We don't support float16
+    float_dtype = np.dtype(float_dtype)
+    int_dtype = np.dtype(int_dtype)
+    assert float_dtype.itemsize >= 4
+    if int_dtype.itemsize <= 2:
+        return float_dtype
+    return np.float64
+
 def valid_common_type(left, right):
     """
     This is created to mimic the C++ has_valid_common_type function. It takes two numpy dtypes and returns a type able
@@ -1017,17 +1029,19 @@
     """
     if left is None or right is None:
         return None
+    left = np.dtype(left)
+    right = np.dtype(right)
     if left == right:
         return left
     if pd.api.types.is_float_dtype(left):
         if pd.api.types.is_float_dtype(right):
             return left if left.itemsize > right.itemsize else right
         elif pd.api.types.is_integer_dtype(right):
-            return left
+            return common_float_int_type(left, right)
         return None
     elif pd.api.types.is_signed_integer_dtype(left):
         if pd.api.types.is_float_dtype(right):
-            return right
+            return common_float_int_type(right, left)
         elif pd.api.types.is_signed_integer_dtype(right):
             return left if left.itemsize > right.itemsize else right
         elif pd.api.types.is_unsigned_integer_dtype(right):
@@ -1039,10 +1053,10 @@
             return int_dtypes[right.itemsize * 2]
     elif pd.api.types.is_unsigned_integer_dtype(left):
         if pd.api.types.is_float_dtype(right):
-            return right
+            return common_float_int_type(right, left)
         elif pd.api.types.is_unsigned_integer_dtype(right):
             return left if left.itemsize > right.itemsize else right
-        elif pd.api.types.is_signed_integer_dtype(left):
+        elif pd.api.types.is_signed_integer_dtype(right):
             int_dtypes = {1: np.dtype("int8"), 2: np.dtype("int16"), 4: np.dtype("int32"), 8: np.dtype("int64")}
             if left.itemsize >= 8:
                 return None
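
Worked examples of the promotion rule common_float_int_type implements, a sketch assuming the updated arcticdb.util.test is importable (np.dtype comparison with the scalar types shown is valid numpy):

import numpy as np
from arcticdb.util.test import common_float_int_type

# 8- and 16-bit integers fit exactly into float32's 24-bit mantissa, so the float dtype wins.
assert common_float_int_type(np.float32, np.int16) == np.float32
# 32- and 64-bit integers do not, so the result widens to float64.
assert common_float_int_type(np.float32, np.int32) == np.float64
assert common_float_int_type(np.float64, np.uint64) == np.float64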
@@ -1071,7 +1085,23 @@ def compute_common_type_for_columns_in_df_list(df_list):
     for df in df_list:
         for col in df.columns:
             if col not in common_types:
-                common_types[col] = df[col].dtype
+                common_types[col] = np.dtype(df[col].dtype)
+            else:
+                common_types[col] = valid_common_type(common_types[col], np.dtype(df[col].dtype))
+    return common_types
+
+def compute_common_type_for_columns(segment_columns: List[dict]):
+    """
+    Takes a list of column/dtype dictionaries where each element of the list is a dictionary describing a segment. The
+    keys of the dictionary are column names and the values are dtypes. A column is allowed to be missing from some
+    segments. Returns a dictionary whose keys are column names and whose values are the combined dtype. If a value is
+    None, two segments hold that column with incompatible dtypes.
+    """
+    common_types = {}
+    for columns in segment_columns:
+        for name, dtype in columns.items():
+            if name not in common_types:
+                common_types[name] = np.dtype(dtype)
             else:
-                common_types[col] = valid_common_type(common_types[col], df[col].dtype)
+                common_types[name] = valid_common_type(common_types[name], np.dtype(dtype))
     return common_types
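
A usage sketch of the new helper (the dtype picks are illustrative): int16 and float32 combine to float32, while uint64 and int64 have no valid common type, so that column maps to None:

import numpy as np
from arcticdb.util.test import compute_common_type_for_columns

segment_columns = [
    {"a": np.int16, "b": np.uint64},   # dtypes of the first segment
    {"a": np.float32, "b": np.int64},  # dtypes of the second segment
]
print(compute_common_type_for_columns(segment_columns))
# {'a': dtype('float32'), 'b': None}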

python/tests/hypothesis/arcticdb/test_resample.py

Lines changed: 28 additions & 8 deletions

@@ -5,7 +5,12 @@
 import hypothesis.extra.pandas as hs_pd
 import hypothesis.strategies as st
 from arcticdb.util.hypothesis import use_of_function_scoped_fixtures_in_hypothesis_checked
-from arcticdb.util.test import generic_resample_test, compute_common_type_for_columns_in_df_list, expected_aggregation_type
+from arcticdb.util.test import (
+    generic_resample_test,
+    compute_common_type_for_columns_in_df_list,
+    expected_aggregation_type,
+    compute_common_type_for_columns
+)
 from arcticdb.util._versions import IS_PANDAS_TWO

 COLUMN_DTYPE = ["float", "int", "uint"]
@@ -108,10 +113,13 @@ def dynamic_schema_column_list(draw):
     segment_ranges = sorted(draw(st.lists(date(min_date=MIN_DATE, max_date=MAX_DATE, unit="s"), unique=True, min_size=segment_count+1, max_size=segment_count+1)))
     segments = []
     dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]
+    columns_per_segment = [draw(st.lists(st.sampled_from(all_column_names), min_size=1, max_size=3, unique=True)) for _ in range(segment_count)]
+    dtypes_per_segment = [draw(st.lists(st.sampled_from(dtypes), min_size=len(cols), max_size=len(cols))) for cols in columns_per_segment]
+    column_dtype_per_segment = [{name: dtype for name, dtype in zip(columns_per_segment[i], dtypes_per_segment[i])} for i in range(segment_count)]
+    assume(all(col_type is not None for col_type in compute_common_type_for_columns(column_dtype_per_segment).values()))
     for segment_index in range(segment_count):
-        segment_column_names = draw(st.lists(st.sampled_from(all_column_names), min_size=1, max_size=3, unique=True))
-        column_count = len(segment_column_names)
-        column_dtypes = draw(st.lists(st.sampled_from(dtypes), min_size=column_count, max_size=column_count))
+        segment_column_names = columns_per_segment[segment_index]
+        column_dtypes = dtypes_per_segment[segment_index]
         segment_start_date = segment_ranges[segment_index]
         segment_end_date = segment_ranges[segment_index + 1]
         segments.append(draw(dataframe(segment_column_names, column_dtypes, segment_start_date, segment_end_date)))
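
The reordering above follows a standard Hypothesis pattern: draw the cheap per-segment metadata first, reject incompatible draws with assume(), and only then draw the expensive dataframes. A minimal sketch (the strategy name and dtype pool are illustrative):

import numpy as np
import hypothesis.strategies as st
from hypothesis import assume
from arcticdb.util.test import compute_common_type_for_columns

@st.composite
def compatible_segment_plan(draw):
    # Cheap draw: only per-segment column dtypes, no data yet.
    plan = [{"col_0": draw(st.sampled_from([np.int8, np.uint64, np.float64]))} for _ in range(2)]
    # Mirrors the assume() above: discard plans with incompatible dtypes
    # before any dataframe is generated.
    assume(all(t is not None for t in compute_common_type_for_columns(plan).values()))
    return plan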
@@ -152,9 +160,15 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
         # the first value of the data frame to be outside the computed resampling range. In ArcticDB this is not a problem
         # as we allow this by design.
         if str(pandas_error) != "Values falls before first bin":
-            raise pandas_error
+            raise
         else:
             return
+    except RuntimeError as pandas_error:
+        # This is a bug in Pandas 1 that should be fixed in Pandas 2
+        if str(pandas_error) == "empty group with uint64_t" and not IS_PANDAS_TWO:
+            return
+        else:
+            raise

 @use_of_function_scoped_fixtures_in_hypothesis_checked
 @given(
@@ -166,7 +180,6 @@ def test_resample(lmdb_version_store_v1, df, rule, origin, offset):
 @settings(deadline=None, suppress_health_check=[HealthCheck.data_too_large])
 def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list, rule, origin, offset):
     common_column_types = compute_common_type_for_columns_in_df_list(df_list)
-    assume(all(col_type is not None for col_type in common_column_types.values()))
     lib = lmdb_version_store_dynamic_schema_v1
     lib.version_store.clear()
     sym = "sym"
@@ -176,6 +189,7 @@ def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list,
         # This column will be used to keep track of empty buckets.
         df["_empty_bucket_tracker_"] = np.zeros(df.shape[0], dtype=int)
         lib.append(sym, df)
+
     for closed in ["left", "right"]:
         for label in ["left", "right"]:
             try:
@@ -197,6 +211,12 @@ def test_resample_dynamic_schema(lmdb_version_store_dynamic_schema_v1, df_list,
                     # the first value of the data frame to be outside the computed resampling range. In ArcticDB this is not a problem
                     # as we allow this by design.
                     if str(pandas_error) != "Values falls before first bin":
-                        raise pandas_error
+                        raise
+                    else:
+                        return
+                except RuntimeError as pandas_error:
+                    # This is a bug in Pandas 1 that should be fixed in Pandas 2
+                    if str(pandas_error) == "empty group with uint64_t" and not IS_PANDAS_TWO:
+                        return
                     else:
-                        return
+                        raise

python/tests/unit/arcticdb/version_store/test_aggregation.py

Lines changed: 5 additions & 3 deletions

@@ -316,10 +316,12 @@ def test_docstring_example_query_builder_groupby_max_and_mean(lmdb_version_store
     q = q.groupby("grouping_column").agg({"to_max": "max", "to_mean": "mean"})

     lib.write("symbol", df)
-    res = lib.read("symbol", query_builder=q)
-    df = pd.DataFrame({"to_mean": (1.1 + 1.4 + 2.5) / 3, "to_max": [2.5]}, index=["group_1"])
+    res = lib.read("symbol", query_builder=q).data
+    res.sort_index(axis=1, inplace=True)
+    df = pd.DataFrame({"to_max": [2.5], "to_mean": [(1.1 + 1.4 + 2.5) / 3]}, index=["group_1"])
     df.index.rename("grouping_column", inplace=True)
-    assert_frame_equal(res.data, df)
+    df.sort_index(axis=1, inplace=True)
+    assert_frame_equal(res, df)


 ##################################

python/tests/unit/arcticdb/version_store/test_filtering.py

Lines changed: 12 additions & 2 deletions

@@ -1115,6 +1115,16 @@ def test_float32_binary_comparison(lmdb_version_store_v1):
 ################################


+@pytest.mark.xfail(reason="""Fails on Pandas < 2 because of this logic:
+    https://github.com/man-group/ArcticDB/blob/fc9514f25712d8e86fbdbd2f7e37e64f3a10df40/python/arcticdb/version_store/_normalization.py#L230
+    The assumptions there, however, are not correct. The dtype of an empty column is not deterministic and varies
+    between Pandas versions. For example, the test passes on the CI with Python > 3.8 because it uses Pandas 2.3.0 and
+    the dtype of the column is float64 (note the if in the link above sets the dtype to object only for Pandas < 2
+    because it expects that in Pandas >= 2 it'll always be object). It fails for object dtype because in ArcticDB that
+    becomes a string type, and string types cannot be filtered using < or >, thus modify_schema fails.
+    """,
+    strict=False
+)
 @pytest.mark.parametrize("lib_type", ["lmdb_version_store_v1", "lmdb_version_store_dynamic_schema_v1"])
 def test_filter_empty_dataframe(request, lib_type):
     lib = request.getfixturevalue(lib_type)
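
The nondeterminism described in the xfail reason is easy to observe in plain Pandas (illustrative; the exact dtype depends on how the empty column is built and on the installed version):

import pandas as pd

# The dtype an empty column receives varies across Pandas versions
# (object in some cases, float64 in others), which is exactly the
# assumption the normalization code gets wrong.
print(pd.DataFrame({"a": []})["a"].dtype)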
@@ -1198,10 +1208,10 @@ def test_filter_column_not_present_dynamic(lmdb_version_store_dynamic_schema_v1)
 def test_filter_column_present_in_some_segments(lmdb_version_store_dynamic_schema_v1):
     lib = lmdb_version_store_dynamic_schema_v1
     symbol = "test_filter_column_not_present_dynamic"
-    df = pd.DataFrame({"a": np.arange(2)}, index=np.arange(2), dtype="int64")
+    df = pd.DataFrame({"a": np.arange(2)}, dtype="int64")
     lib.write(symbol, df)

-    df = pd.DataFrame({"b": [1, 10]}, index=np.arange(2), dtype="int64")
+    df = pd.DataFrame({"b": [1, 10]}, dtype="int64")
     lib.append(symbol, df)

     q = QueryBuilder()

python/tests/unit/arcticdb/version_store/test_projection.py

Lines changed: 10 additions & 1 deletion

@@ -16,7 +16,16 @@

 pytestmark = pytest.mark.pipeline

-
+@pytest.mark.xfail(reason="""Fails on Pandas < 2 because of this logic:
+    https://github.com/man-group/ArcticDB/blob/fc9514f25712d8e86fbdbd2f7e37e64f3a10df40/python/arcticdb/version_store/_normalization.py#L230
+    The assumptions there, however, are not correct. The dtype of an empty column is not deterministic and varies
+    between Pandas versions. For example, the test passes on the CI with Python > 3.8 because it uses Pandas 2.3.0 and
+    the dtype of the column is float64 (note the if in the link above sets the dtype to object only for Pandas < 2
+    because it expects that in Pandas >= 2 it'll always be object). It fails for object dtype because in ArcticDB that
+    becomes a string type, and string types cannot be summed with a number.
+    """,
+    strict=False
+)
 @pytest.mark.parametrize("lib_type", ["lmdb_version_store_v1", "lmdb_version_store_dynamic_schema_v1"])
 def test_project_empty_dataframe(request, lib_type):
     lib = request.getfixturevalue(lib_type)

python/tests/unit/arcticdb/version_store/test_resample.py

Lines changed: 1 addition & 1 deletion

@@ -983,7 +983,7 @@ def test_bucket_intersects_two_segments_aggregation_column_not_in_second(self, l
         expected_types = {
             "col_0_min": dtype,
             "col_0_max": dtype,
-            "col_0_sum": np.uint64,
+            "col_0_sum": np.int64,
             "col_0_mean": np.float64,
             "col_0_first": dtype,
             "col_0_last": dtype,
