Skip to content

Query Builder regex filter support and upgrade PCRE to PCRE2 #2385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 11, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -43,6 +43,7 @@ find_package(zstd CONFIG REQUIRED) # "CONFIG" bypasses our cpp/CMake/FindZstd.c

find_package(azure-identity-cpp CONFIG REQUIRED)
find_package(azure-storage-blobs-cpp CONFIG REQUIRED)
find_package(PCRE2 REQUIRED COMPONENTS 8BIT 32BIT)

if(${BUILD_WITH_REMOTERY})
add_compile_definitions(USE_REMOTERY)
@@ -53,7 +54,6 @@ endif()
add_compile_definitions(_LIBCPP_DISABLE_AVAILABILITY)

if(NOT ${ARCTICDB_USING_CONDA})
find_package(unofficial-pcre REQUIRED)
find_package(Libevent CONFIG REQUIRED)
set(ARCTICDB_MONGO_LIBS $<IF:$<TARGET_EXISTS:mongo::mongocxx_static>,mongo::mongocxx_static,mongo::mongocxx_shared>)
set(Zstd_LIBRARY $<IF:$<TARGET_EXISTS:zstd::libzstd_shared>,zstd::libzstd_shared,zstd::libzstd_static>)
@@ -67,7 +67,6 @@ else()
SET(HIDE_LINKED_SYMBOLS OFF)

find_package(pybind11 REQUIRED)
find_package(PCRE REQUIRED)
find_package(Libevent REQUIRED)
find_package(semimap REQUIRED)

@@ -106,7 +105,6 @@ set_target_properties(xxHash PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${xxHash_I

# Libraries baked into the internal and external Linux images are found by searching various paths
# On Windows, vcpkg will provide a "Config" which takes precedence
find_library(Pcre_LIBRARY NAMES pcre PATHS /usr/local/lib/libpcre.a REQUIRED)
if (WIN32)
find_package(Iconv)
else ()
@@ -672,7 +670,7 @@ if(${ARCTICDB_USE_PCH})
<ankerl/unordered_dense.h>
<third_party/recycle/src/recycle/shared_pool.hpp>
<xxhash.h>
<pcre.h>
<pcre2.h>
)
target_precompile_headers(arcticdb_core_object PRIVATE ${BASE_PCH})
endif()
@@ -708,6 +706,8 @@ set (arcticdb_core_libraries
Azure::azure-identity
Azure::azure-storage-blobs
sparrow::sparrow
PCRE2::8BIT
PCRE2::32BIT
)

if(${ARCTICDB_COUNT_ALLOCATIONS})
@@ -729,7 +729,6 @@ endif()
if(${ARCTICDB_USING_CONDA})

list (APPEND arcticdb_core_libraries
pcre
msgpack-c
${LMDB_LIBRARIES}
)
@@ -741,7 +740,6 @@ if(${ARCTICDB_USING_CONDA})
else()

list (APPEND arcticdb_core_libraries
unofficial::pcre::pcre
msgpackc-cxx
EnTT::EnTT
lmdb
@@ -821,6 +819,7 @@ set(arcticdb_python_srcs
stream/python_bindings.cpp
toolbox/python_bindings.cpp
version/python_bindings.cpp
util/python_bindings.cpp
python/reader.hpp
python/adapt_read_dataframe.hpp
python/python_handlers.hpp
@@ -1097,7 +1096,9 @@ if(${TEST})
stream/test/stream_test_common.cpp
column_store/test/benchmark_column.cpp
column_store/test/benchmark_memory_segment.cpp
processing/test/benchmark_binary.cpp
processing/test/benchmark_clause.cpp
processing/test/benchmark_common.cpp
processing/test/benchmark_ternary.cpp
version/test/benchmark_write.cpp)

39 changes: 29 additions & 10 deletions cpp/arcticdb/column_store/string_pool.cpp
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
#include <arcticdb/column_store/string_pool.hpp>
#include <arcticdb/util/offset_string.hpp>
#include <arcticdb/column_store/segment_utils.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <ankerl/unordered_dense.h>

#include <pybind11/pybind11.h>
@@ -208,22 +209,18 @@ py::buffer_info StringPool::as_buffer_info() const {
};
}

// Finds the pool offset of `string` among the unique string offsets referenced by `column`
// (after remove_nones_and_nans has been applied to them).
// Returns the offset whose pooled string equals `string`, or std::nullopt if none does.
// NOTE(review): this span is rendered from a diff, so lines of the earlier map-based lookup and
// of the newer linear scan appear together below.
std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) {
std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) const {
auto unique_values = unique_values_for_string_column(column);
remove_nones_and_nans(unique_values);
ankerl::unordered_dense::map<std::string_view, offset_t> col_values;
col_values.reserve(unique_values.size());
for(auto pos : unique_values) {
col_values.emplace(block_.const_at(pos), pos);
// Linear scan: stop at the first unique offset whose pooled string equals the query.
if (block_.const_at(pos) == string) {
return pos;
}
}

std::optional<position_t> output;
if(auto loc = col_values.find(string); loc != col_values.end())
output = loc->second;
return output;
// No unique value of the column compared equal to `string`.
return std::nullopt;
}

ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) {
ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) const {
auto unique_values = unique_values_for_string_column(column);
remove_nones_and_nans(unique_values);
ankerl::unordered_dense::map<std::string_view, offset_t> col_values;
@@ -240,4 +237,26 @@ ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(cons
}
return output;
}

/**
 * Collects the string-pool offsets referenced by `column` whose pooled text matches `regex_generic`.
 *
 * The unique offsets of the column are gathered and passed through remove_nones_and_nans before
 * matching. For fixed-width string columns the pooled bytes are viewed as char32_t code units and
 * matched through the std::u32string_view overload; otherwise the pooled std::string_view is
 * matched directly.
 *
 * @param regex_generic Compiled regex used for matching.
 * @param column        String column whose unique offsets are tested.
 * @return Set of offsets whose text matched.
 */
ankerl::unordered_dense::set<position_t> StringPool::get_regex_match_offsets_for_column(const util::RegexGeneric& regex_generic, const Column& column) const {
    auto candidate_offsets = unique_values_for_string_column(column);
    remove_nones_and_nans(candidate_offsets);

    ankerl::unordered_dense::set<position_t> matching_offsets;

    // Dynamic (non fixed-width) strings: match the pooled bytes as they are.
    if (!is_fixed_string_type(column.type().value_type())) {
        for (auto offset : candidate_offsets) {
            if (regex_generic.match(block_.const_at(offset))) {
                matching_offsets.insert(offset);
            }
        }
        return matching_offsets;
    }

    // Fixed-width strings: reinterpret the pooled bytes as a sequence of char32_t code units.
    for (auto offset : candidate_offsets) {
        auto raw_text = block_.const_at(offset);
        const auto* wide_begin = reinterpret_cast<const char32_t*>(raw_text.data());
        const auto wide_length = raw_text.size() / sizeof(char32_t);
        if (regex_generic.match(std::u32string_view(wide_begin, wide_length))) {
            matching_offsets.insert(offset);
        }
    }
    return matching_offsets;
}
}
7 changes: 5 additions & 2 deletions cpp/arcticdb/column_store/string_pool.hpp
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@
#include <arcticdb/entity/types.hpp>
#include <arcticdb/util/buffer.hpp>
#include <arcticdb/util/cursored_buffer.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <arcticdb/column_store/chunked_buffer.hpp>
#include <arcticdb/column_store/column_data.hpp>

@@ -84,6 +85,7 @@ class StringBlock {

std::string_view at(position_t pos);
[[nodiscard]] std::string_view const_at(position_t pos) const;
[[nodiscard]] std::u32string_view u32_const_at(position_t pos) const;

void reset();

@@ -174,8 +176,9 @@ class StringPool {

py::buffer_info as_buffer_info() const;

std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column);
ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column);
std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column) const;
ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) const;
ankerl::unordered_dense::set<position_t> get_regex_match_offsets_for_column(const util::RegexGeneric& regex_generic, const Column& column) const;
private:
MapType map_;
mutable StringBlock block_;
5 changes: 5 additions & 0 deletions cpp/arcticdb/processing/expression_context.hpp
Original file line number Diff line number Diff line change
@@ -55,9 +55,14 @@ struct ExpressionContext {
value_sets_.set_value(name, std::move(value_set));
}

/// Stores `regex` in regex_matches_ under the key `name`; the shared pointer is moved into the map.
void add_regex(const std::string& name, std::shared_ptr<util::RegexGeneric> regex)
{
    regex_matches_.set_value(name, std::move(regex));
}

ConstantMap<ExpressionNode> expression_nodes_;
ConstantMap<Value> values_;
ConstantMap<ValueSet> value_sets_;
ConstantMap<util::RegexGeneric> regex_matches_;
VariantNode root_node_name_;
bool dynamic_schema_{false};
};
8 changes: 6 additions & 2 deletions cpp/arcticdb/processing/expression_node.hpp
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@

#include <arcticdb/util/bitset.hpp>
#include <arcticdb/util/string_wrapping_value.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <arcticdb/processing/operation_types.hpp>
#include <arcticdb/pipeline/value.hpp>
#include <arcticdb/pipeline/value_set.hpp>
@@ -33,7 +34,10 @@ using ValueSetName = util::StringWrappingValue<ValueSetNameTag>;
struct ExpressionNameTag{};
using ExpressionName = util::StringWrappingValue<ExpressionNameTag>;

using VariantNode = std::variant<std::monostate, ColumnName, ValueName, ValueSetName, ExpressionName>;
struct RegexNameTag{};
using RegexName = util::StringWrappingValue<RegexNameTag>;

using VariantNode = std::variant<std::monostate, ColumnName, ValueName, ValueSetName, ExpressionName, RegexName>;

struct ProcessingUnit;
class Column;
@@ -79,7 +83,7 @@ struct FullResult {};

struct EmptyResult {};

using VariantData = std::variant<FullResult, EmptyResult, std::shared_ptr<Value>, std::shared_ptr<ValueSet>, ColumnWithStrings, util::BitSet>;
using VariantData = std::variant<FullResult, EmptyResult, std::shared_ptr<Value>, std::shared_ptr<ValueSet>, ColumnWithStrings, util::BitSet, std::shared_ptr<util::RegexGeneric>>;

// Used to represent that an ExpressionNode returns a bitset
struct BitSetTag{};
2 changes: 2 additions & 0 deletions cpp/arcticdb/processing/operation_dispatch_binary.cpp
Original file line number Diff line number Diff line change
@@ -155,6 +155,8 @@ VariantData dispatch_binary(const VariantData& left, const VariantData& right, O
return visit_binary_comparator(left, right, GreaterThanOperator{});
case OperationType::GE:
return visit_binary_comparator(left, right, GreaterThanEqualsOperator{});
case OperationType::REGEX_MATCH:
return visit_binary_comparator(left, right, RegexMatchOperator{});
case OperationType::ISIN:
return visit_binary_membership(left, right, IsInOperator{});
case OperationType::ISNOTIN:
66 changes: 51 additions & 15 deletions cpp/arcticdb/processing/operation_dispatch_binary.hpp
Original file line number Diff line number Diff line change
@@ -280,32 +280,68 @@ VariantData binary_comparator(const ColumnWithStrings& column_with_strings, cons
return VariantData{std::move(output_bitset)};
}

/**
 * Applies a regex filter to a string column and returns the matching rows as a bitset.
 *
 * @param column_with_strings Column together with its string pool and name.
 * @param regex_generic       Compiled regex to match against the column's strings.
 * @param func                Comparison functor; only RegexMatchOperator is accepted.
 * @return EmptyResult for an empty-typed column, otherwise a util::BitSet wrapped in VariantData.
 *
 * Raises E_ASSERTION_FAILURE when `func` is not a RegexMatchOperator, and E_INVALID_USER_ARGUMENT
 * when the column is not of a sequence (string) type.
 */
template <typename Func>
VariantData binary_comparator(const ColumnWithStrings& column_with_strings, const util::RegexGeneric& regex_generic, Func&& func) {
    auto& column = *column_with_strings.column_;

    // Empty-typed columns short-circuit to EmptyResult before the operator type is examined.
    if (is_empty_type(column.type().data_type())) {
        return EmptyResult{};
    }

    if constexpr (!std::is_same_v<std::remove_reference_t<Func>, RegexMatchOperator>) {
        internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Invalid operator {} for regex match", func);
        return EmptyResult{};
    } else {
        util::BitSet matched_rows;
        details::visit_type(column.type().data_type(), [&](auto type_tag) {
            using TypeInfo = ScalarTypeInfo<decltype(type_tag)>;
            if constexpr (!is_sequence_type(TypeInfo::data_type)) {
                user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Cannot perform regex_match with pattern {} on column {} as it has non-string type {}",
                        regex_generic.text(), column_with_strings.column_name_, get_user_friendly_type_string(column.type()));
            } else {
                // Run the regex once per unique pooled string, then test each row's offset for
                // membership in the set of matching offsets.
                auto matching_offsets = column_with_strings.string_pool_->get_regex_match_offsets_for_column(regex_generic, column);
                Column::transform<typename TypeInfo::TDT>(
                        column,
                        matched_rows,
                        false,
                        [&func, &matching_offsets](auto raw_offset) {
                            return func(static_cast<entity::position_t>(raw_offset), matching_offsets);
                        });
            }
        });
        ARCTICDB_DEBUG(log::version(), "Filtered column of size {} down to {} bits", column.last_row() + 1, matched_rows.count());
        return VariantData{std::move(matched_rows)};
    }
}

// Dispatches a binary comparison over the (left, right) operand pair held in VariantData.
// Returns EmptyResult when either operand is EmptyResult. Otherwise the pair is visited and the
// column/value, column/column, value/column and column/regex combinations are forwarded to
// binary_comparator, whose result is passed through transform_to_placeholder.
// Two value operands, and every remaining combination, raise E_INVALID_USER_ARGUMENT.
// NOTE(review): this span is rendered from a diff, so the removed and the added versions of the
// visitor lambdas appear together below.
template<typename Func>
VariantData visit_binary_comparator(const VariantData& left, const VariantData& right, Func&& func) {
if(std::holds_alternative<EmptyResult>(left) || std::holds_alternative<EmptyResult>(right))
return EmptyResult{};

return std::visit(util::overload {
[&func] (const ColumnWithStrings& l, const std::shared_ptr<Value>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
[&func] (const ColumnWithStrings& l, const std::shared_ptr<Value>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&] (const ColumnWithStrings& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
[&func] (const ColumnWithStrings& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&](const std::shared_ptr<Value>& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func), true>(r, *l, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
// Value on the left, column on the right: the column is passed first and the second template
// argument of binary_comparator is set to true.
[&func](const std::shared_ptr<Value>& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func), true>(r, *l, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&] ([[maybe_unused]] const std::shared_ptr<Value>& l, [[maybe_unused]] const std::shared_ptr<Value>& r) ->VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Two value inputs not accepted to binary comparators");
return EmptyResult{};
// Column on the left, compiled regex on the right: forwarded to the RegexGeneric overload.
[&func](const ColumnWithStrings& l, const std::shared_ptr<util::RegexGeneric>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[] ([[maybe_unused]] const std::shared_ptr<Value>& l, [[maybe_unused]] const std::shared_ptr<Value>& r) ->VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Two value inputs not accepted to binary comparators");
return EmptyResult{};
},
// Catch-all for every operand combination not handled above.
[](const auto &, const auto&) -> VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Bitset/ValueSet inputs not accepted to binary comparators");
return EmptyResult{};
}
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Bitset/ValueSet inputs not accepted to binary comparators");
return EmptyResult{};
}
}, left, right);
}

24 changes: 24 additions & 0 deletions cpp/arcticdb/processing/operation_types.hpp
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@
#include <arcticdb/processing/signed_unsigned_comparison.hpp>
#include <arcticdb/util/constants.hpp>
#include <arcticdb/util/preconditions.hpp>
#include <arcticdb/entity/types.hpp>
#include <ankerl/unordered_dense.h>

namespace arcticdb {
@@ -43,6 +44,7 @@ enum class OperationType : uint8_t {
GE,
ISIN,
ISNOTIN,
REGEX_MATCH,
// Boolean
AND,
OR,
@@ -72,6 +74,7 @@ inline std::string_view operation_type_to_str(const OperationType ot) {
TO_STR(GE)
TO_STR(ISIN)
TO_STR(ISNOTIN)
TO_STR(REGEX_MATCH)
TO_STR(AND)
TO_STR(OR)
TO_STR(XOR)
@@ -513,6 +516,16 @@ bool operator()(int64_t t, uint64_t u) const {
}
};

/// Functor used for regex filtering: a row matches when its string-pool offset is a member of the
/// set of offsets whose strings matched the regex.
struct RegexMatchOperator {
    /// Returns true when `offset` is present in `offset_set`.
    bool operator()(entity::position_t offset, const ankerl::unordered_dense::set<position_t>& offset_set) const {
        return offset_set.find(offset) != offset_set.end();
    }

    /// Fallback for any other argument types: always raises, naming the two offending types.
    template<typename T, typename U>
    bool operator()(T, U) const {
        util::raise_rte("RegexMatchOperator does not support {} and {}", typeid(T).name(), typeid(U).name());
    }
};

struct MembershipOperator {
protected:
template<typename U>
@@ -802,4 +815,15 @@ struct formatter<arcticdb::IsNotInOperator> {
}
};

/// fmt formatter for RegexMatchOperator: ignores any format spec and always writes "REGEX MATCH".
template<>
struct formatter<arcticdb::RegexMatchOperator> {
    template<typename ParseCtx>
    constexpr auto parse(ParseCtx& parse_ctx) {
        return parse_ctx.begin();
    }

    template<typename FormatCtx>
    constexpr auto format(arcticdb::RegexMatchOperator, FormatCtx& format_ctx) const {
        return fmt::format_to(format_ctx.out(), "REGEX MATCH");
    }
};

} // namespace fmt
3 changes: 3 additions & 0 deletions cpp/arcticdb/processing/processing_unit.cpp
Original file line number Diff line number Diff line change
@@ -75,6 +75,9 @@ VariantData ProcessingUnit::get(const VariantNode &name) {
[&](const ValueSetName &value_set_name) {
return VariantData(expression_context_->value_sets_.get_value(value_set_name.value));
},
[&](const RegexName &regex_name) {
return VariantData(expression_context_->regex_matches_.get_value(regex_name.value));
},
[&](const ExpressionName &expression_name) {
if (auto computed = computed_data_.find(expression_name.value);
computed != std::end(computed_data_)) {
Loading

Unchanged files with check annotations Beta