Skip to content

Query Builder regex filter support #2385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -741,6 +741,7 @@ if(${ARCTICDB_USING_CONDA})
else()

list (APPEND arcticdb_core_libraries
unofficial::pcre::pcre32
unofficial::pcre::pcre
msgpackc-cxx
EnTT::EnTT
@@ -821,6 +822,7 @@ set(arcticdb_python_srcs
stream/python_bindings.cpp
toolbox/python_bindings.cpp
version/python_bindings.cpp
util/python_bindings.cpp
python/reader.hpp
python/adapt_read_dataframe.hpp
python/python_handlers.hpp
@@ -1096,7 +1098,9 @@ if(${TEST})
set(benchmark_srcs
stream/test/stream_test_common.cpp
column_store/test/benchmark_memory_segment.cpp
processing/test/benchmark_binary.cpp
processing/test/benchmark_clause.cpp
processing/test/benchmark_common.cpp
processing/test/benchmark_ternary.cpp
version/test/benchmark_write.cpp)

39 changes: 29 additions & 10 deletions cpp/arcticdb/column_store/string_pool.cpp
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
#include <arcticdb/column_store/string_pool.hpp>
#include <arcticdb/util/offset_string.hpp>
#include <arcticdb/column_store/segment_utils.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <ankerl/unordered_dense.h>

#include <pybind11/pybind11.h>
@@ -208,22 +209,18 @@ py::buffer_info StringPool::as_buffer_info() const {
};
}

std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) {
std::optional<position_t> StringPool::get_offset_for_column(std::string_view string, const Column& column) const {
auto unique_values = unique_values_for_string_column(column);
remove_nones_and_nans(unique_values);
ankerl::unordered_dense::map<std::string_view, offset_t> col_values;
col_values.reserve(unique_values.size());
for(auto pos : unique_values) {
col_values.emplace(block_.const_at(pos), pos);
if (block_.const_at(pos) == string) {
return pos;
}
}

std::optional<position_t> output;
if(auto loc = col_values.find(string); loc != col_values.end())
output = loc->second;
return output;
return std::nullopt;
}

ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) {
ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) const {
auto unique_values = unique_values_for_string_column(column);
remove_nones_and_nans(unique_values);
ankerl::unordered_dense::map<std::string_view, offset_t> col_values;
@@ -240,4 +237,26 @@ ankerl::unordered_dense::set<position_t> StringPool::get_offsets_for_column(cons
}
return output;
}

// Returns the string-pool offsets of the column's distinct values that match `regex_generic`.
//
// Candidate offsets come from unique_values_for_string_column(column) after being passed through
// remove_nones_and_nans(). For fixed-width string columns the pooled bytes are reinterpreted as
// char32_t code units (length = byte count / sizeof(char32_t)) and matched through the
// u32string_view overload; every other column type is matched on the raw string_view.
ankerl::unordered_dense::set<position_t> StringPool::get_regex_match_offsets_for_column(const util::RegexGeneric& regex_generic, const Column& column) const {
    auto candidate_offsets = unique_values_for_string_column(column);
    remove_nones_and_nans(candidate_offsets);

    // The column's type does not change while iterating, so decide the text encoding once.
    const bool fixed_width = is_fixed_string_type(column.type().value_type());

    ankerl::unordered_dense::set<position_t> matched_offsets;
    for (auto offset : candidate_offsets) {
        if (fixed_width) {
            const auto raw_text = block_.const_at(offset);
            const std::u32string_view wide_text(
                reinterpret_cast<const char32_t*>(raw_text.data()),
                raw_text.size() / sizeof(char32_t));
            if (regex_generic.match(wide_text)) {
                matched_offsets.insert(offset);
            }
        } else if (regex_generic.match(block_.const_at(offset))) {
            matched_offsets.insert(offset);
        }
    }
    return matched_offsets;
}
}
7 changes: 5 additions & 2 deletions cpp/arcticdb/column_store/string_pool.hpp
Original file line number Diff line number Diff line change
@@ -15,6 +15,7 @@
#include <arcticdb/entity/types.hpp>
#include <arcticdb/util/buffer.hpp>
#include <arcticdb/util/cursored_buffer.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <arcticdb/column_store/chunked_buffer.hpp>
#include <arcticdb/column_store/column_data.hpp>

@@ -84,6 +85,7 @@ class StringBlock {

std::string_view at(position_t pos);
[[nodiscard]] std::string_view const_at(position_t pos) const;
[[nodiscard]] std::u32string_view u32_const_at(position_t pos) const;

void reset();

@@ -174,8 +176,9 @@ class StringPool {

py::buffer_info as_buffer_info() const;

std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column);
ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column);
std::optional<position_t> get_offset_for_column(std::string_view str, const Column& column) const;
ankerl::unordered_dense::set<position_t> get_offsets_for_column(const std::shared_ptr<std::unordered_set<std::string>>& strings, const Column& column) const;
ankerl::unordered_dense::set<position_t> get_regex_match_offsets_for_column(const util::RegexGeneric& regex_generic, const Column& column) const;
private:
MapType map_;
mutable StringBlock block_;
5 changes: 5 additions & 0 deletions cpp/arcticdb/processing/expression_context.hpp
Original file line number Diff line number Diff line change
@@ -55,9 +55,14 @@ struct ExpressionContext {
value_sets_.set_value(name, std::move(value_set));
}

// Stores `regex` in regex_matches_ under the key `name`.
// The shared_ptr is taken by value and moved into the map, so the caller's copy (if any) is unaffected.
// NOTE(review): behaviour when `name` is already present is decided by ConstantMap::set_value, which is not visible here.
void add_regex(const std::string& name, std::shared_ptr<util::RegexGeneric> regex) {
regex_matches_.set_value(name, std::move(regex));
}

ConstantMap<ExpressionNode> expression_nodes_;
ConstantMap<Value> values_;
ConstantMap<ValueSet> value_sets_;
ConstantMap<util::RegexGeneric> regex_matches_;
VariantNode root_node_name_;
bool dynamic_schema_{false};
};
8 changes: 6 additions & 2 deletions cpp/arcticdb/processing/expression_node.hpp
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@

#include <arcticdb/util/bitset.hpp>
#include <arcticdb/util/string_wrapping_value.hpp>
#include <arcticdb/util/regex_filter.hpp>
#include <arcticdb/processing/operation_types.hpp>
#include <arcticdb/pipeline/value.hpp>
#include <arcticdb/pipeline/value_set.hpp>
@@ -33,7 +34,10 @@ using ValueSetName = util::StringWrappingValue<ValueSetNameTag>;
struct ExpressionNameTag{};
using ExpressionName = util::StringWrappingValue<ExpressionNameTag>;

using VariantNode = std::variant<std::monostate, ColumnName, ValueName, ValueSetName, ExpressionName>;
struct RegexNameTag{};
using RegexName = util::StringWrappingValue<RegexNameTag>;

using VariantNode = std::variant<std::monostate, ColumnName, ValueName, ValueSetName, ExpressionName, RegexName>;

struct ProcessingUnit;
class Column;
@@ -79,7 +83,7 @@ struct FullResult {};

struct EmptyResult {};

using VariantData = std::variant<FullResult, EmptyResult, std::shared_ptr<Value>, std::shared_ptr<ValueSet>, ColumnWithStrings, util::BitSet>;
using VariantData = std::variant<FullResult, EmptyResult, std::shared_ptr<Value>, std::shared_ptr<ValueSet>, ColumnWithStrings, util::BitSet, std::shared_ptr<util::RegexGeneric>>;

// Used to represent that an ExpressionNode returns a bitset
struct BitSetTag{};
2 changes: 2 additions & 0 deletions cpp/arcticdb/processing/operation_dispatch_binary.cpp
Original file line number Diff line number Diff line change
@@ -155,6 +155,8 @@ VariantData dispatch_binary(const VariantData& left, const VariantData& right, O
return visit_binary_comparator(left, right, GreaterThanOperator{});
case OperationType::GE:
return visit_binary_comparator(left, right, GreaterThanEqualsOperator{});
case OperationType::REGEX_MATCH:
return visit_binary_comparator(left, right, RegexMatchOperator{});
case OperationType::ISIN:
return visit_binary_membership(left, right, IsInOperator{});
case OperationType::ISNOTIN:
66 changes: 51 additions & 15 deletions cpp/arcticdb/processing/operation_dispatch_binary.hpp
Original file line number Diff line number Diff line change
@@ -280,32 +280,68 @@ VariantData binary_comparator(const ColumnWithStrings& column_with_strings, cons
return VariantData{std::move(output_bitset)};
}

/**
 * Regex overload of binary_comparator: flags the rows of a string column whose value matches `regex_generic`.
 *
 * @param column_with_strings  Column to filter together with the string pool its offsets refer to.
 * @param regex_generic        Compiled pattern; its text() is used in the user-facing error message.
 * @param func                 Comparison functor. Only RegexMatchOperator (any cv/ref qualification) is valid.
 * @return EmptyResult if the column has the empty type, otherwise a VariantData holding the result bitset.
 *
 * Raises E_INVALID_USER_ARGUMENT when the column is not a sequence (string) type, and
 * E_ASSERTION_FAILURE when instantiated with any functor other than RegexMatchOperator.
 */
template <typename Func>
VariantData binary_comparator(const ColumnWithStrings& column_with_strings, const util::RegexGeneric& regex_generic, Func&& func) {
    if (is_empty_type(column_with_strings.column_->type().data_type())) {
        return EmptyResult{};
    }
    // decay_t (not remove_reference_t) so that a const-qualified RegexMatchOperator lvalue is still
    // recognised; remove_reference_t alone leaves `const RegexMatchOperator`, which would wrongly
    // select the assertion branch below.
    if constexpr(std::is_same_v<std::decay_t<Func>, RegexMatchOperator>) {
        util::BitSet output_bitset;
        details::visit_type(column_with_strings.column_->type().data_type(), [&](auto col_tag) {
            using col_type_info = ScalarTypeInfo<decltype(col_tag)>;
            if constexpr(is_sequence_type(col_type_info::data_type)) {
                // Run the regex once per distinct pooled string, then test each row's offset for membership.
                auto offset_set = column_with_strings.string_pool_->get_regex_match_offsets_for_column(regex_generic, *column_with_strings.column_);
                // NOTE(review): the meaning of the `false` argument is defined by Column::transform, not visible here.
                Column::transform<typename col_type_info::TDT>(
                    *column_with_strings.column_,
                    output_bitset,
                    false,
                    [&offset_set, &func](auto input_value) {
                        auto offset = static_cast<entity::position_t>(input_value);
                        return func(offset, offset_set);
                    });
            } else {
                user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Cannot perform regex_match with pattern {} on column {} as it has non-string type {}",
                    regex_generic.text(), column_with_strings.column_name_, get_user_friendly_type_string(column_with_strings.column_->type()));
            }
        });
        ARCTICDB_DEBUG(log::version(), "Filtered column of size {} down to {} bits", column_with_strings.column_->last_row() + 1, output_bitset.count());
        return VariantData{std::move(output_bitset)};
    } else {
        // Reached only when the generic visitor instantiates this overload for a non-regex operator.
        internal::raise<ErrorCode::E_ASSERTION_FAILURE>("Invalid operator {} for regex match", func);
        return EmptyResult{};
    }
}

template<typename Func>
VariantData visit_binary_comparator(const VariantData& left, const VariantData& right, Func&& func) {
if(std::holds_alternative<EmptyResult>(left) || std::holds_alternative<EmptyResult>(right))
return EmptyResult{};

return std::visit(util::overload {
[&func] (const ColumnWithStrings& l, const std::shared_ptr<Value>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
[&func] (const ColumnWithStrings& l, const std::shared_ptr<Value>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&] (const ColumnWithStrings& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
[&func] (const ColumnWithStrings& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&](const std::shared_ptr<Value>& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func), true>(r, *l, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
[&func](const std::shared_ptr<Value>& l, const ColumnWithStrings& r) ->VariantData {
auto result = binary_comparator<decltype(func), true>(r, *l, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[&] ([[maybe_unused]] const std::shared_ptr<Value>& l, [[maybe_unused]] const std::shared_ptr<Value>& r) ->VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Two value inputs not accepted to binary comparators");
return EmptyResult{};
[&func](const ColumnWithStrings& l, const std::shared_ptr<util::RegexGeneric>& r) ->VariantData {
auto result = binary_comparator<decltype(func)>(l, *r, std::forward<decltype(func)>(func));
return transform_to_placeholder(result);
},
[] ([[maybe_unused]] const std::shared_ptr<Value>& l, [[maybe_unused]] const std::shared_ptr<Value>& r) ->VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Two value inputs not accepted to binary comparators");
return EmptyResult{};
},
[](const auto &, const auto&) -> VariantData {
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Bitset/ValueSet inputs not accepted to binary comparators");
return EmptyResult{};
}
user_input::raise<ErrorCode::E_INVALID_USER_ARGUMENT>("Bitset/ValueSet inputs not accepted to binary comparators");
return EmptyResult{};
}
}, left, right);
}

24 changes: 24 additions & 0 deletions cpp/arcticdb/processing/operation_types.hpp
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@
#include <arcticdb/processing/signed_unsigned_comparison.hpp>
#include <arcticdb/util/constants.hpp>
#include <arcticdb/util/preconditions.hpp>
#include <arcticdb/entity/types.hpp>
#include <ankerl/unordered_dense.h>

namespace arcticdb {
@@ -43,6 +44,7 @@ enum class OperationType : uint8_t {
GE,
ISIN,
ISNOTIN,
REGEX_MATCH,
// Boolean
AND,
OR,
@@ -72,6 +74,7 @@ inline std::string_view operation_type_to_str(const OperationType ot) {
TO_STR(GE)
TO_STR(ISIN)
TO_STR(ISNOTIN)
TO_STR(REGEX_MATCH)
TO_STR(AND)
TO_STR(OR)
TO_STR(XOR)
@@ -513,6 +516,16 @@ bool operator()(int64_t t, uint64_t u) const {
}
};

// Functor used for REGEX_MATCH filtering.
// The only supported call shape is (string-pool offset, set of offsets that matched the regex);
// any other argument combination is routed to the catch-all overload, which raises a runtime error.
struct RegexMatchOperator {
    // True when `offset` is one of the pre-computed matching offsets.
    bool operator()(entity::position_t offset, const ankerl::unordered_dense::set<position_t>& offset_set) const {
        return offset_set.find(offset) != offset_set.end();
    }

    // Catch-all for unsupported operand types: reports both type names and raises.
    template<typename T, typename U>
    bool operator()(T, U) const {
        util::raise_rte("RegexMatchOperator does not support {} and {}", typeid(T).name(), typeid(U).name());
    }
};

struct MembershipOperator {
protected:
template<typename U>
@@ -802,4 +815,15 @@ struct formatter<arcticdb::IsNotInOperator> {
}
};

// fmt formatter for arcticdb::RegexMatchOperator so the operator can be passed directly as a
// format argument. It always renders as the fixed text "REGEX MATCH".
template<>
struct formatter<arcticdb::RegexMatchOperator> {
// No format-spec options are supported: parsing consumes nothing and returns the start of the spec.
template<typename ParseContext>
constexpr auto parse(ParseContext &ctx) { return ctx.begin(); }

// The operator carries no state, so the argument is ignored and a constant label is written.
template<typename FormatContext>
constexpr auto format(arcticdb::RegexMatchOperator, FormatContext &ctx) const {
return fmt::format_to(ctx.out(), "REGEX MATCH");
}
};

} // namespace fmt
3 changes: 3 additions & 0 deletions cpp/arcticdb/processing/processing_unit.cpp
Original file line number Diff line number Diff line change
@@ -75,6 +75,9 @@ VariantData ProcessingUnit::get(const VariantNode &name) {
[&](const ValueSetName &value_set_name) {
return VariantData(expression_context_->value_sets_.get_value(value_set_name.value));
},
[&](const RegexName &regex_name) {
return VariantData(expression_context_->regex_matches_.get_value(regex_name.value));
},
[&](const ExpressionName &expression_name) {
if (auto computed = computed_data_.find(expression_name.value);
computed != std::end(computed_data_)) {
28 changes: 28 additions & 0 deletions cpp/arcticdb/processing/test/benchmark_binary.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/* Copyright 2023 Man Group Operations Limited
*
* Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
*
* As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
*/
#include <arcticdb/processing/test/benchmark_common.hpp>
#include <arcticdb/processing/operation_dispatch_binary.hpp>
#include <arcticdb/util/regex_filter.hpp>

using namespace arcticdb;

// run like: --benchmark_time_unit=ms --benchmark_filter=.* --benchmark_min_time=5x

// Benchmarks the regex overload of binary_comparator with the pattern ".*".
//
// Benchmark arguments:
//   range(0) - number of rows in the generated column
//   range(1) - forwarded as the second argument of generate_string_dense_column
//              (NOTE(review): presumably the number of distinct strings; confirm in benchmark_common)
//   range(2) - non-zero selects DataType::UTF_DYNAMIC64, zero selects DataType::UTF_FIXED64
static void BM_regex_match(benchmark::State& state) {
    const auto num_rows = static_cast<size_t>(state.range(0));
    const auto left = generate_string_dense_column(num_rows, state.range(1), state.range(2) ? DataType::UTF_DYNAMIC64 : DataType::UTF_FIXED64);
    const auto right = util::RegexGeneric(".*");
    for (auto _ : state) {
        // Keep the result observable so the optimiser cannot discard the work being measured.
        auto result = binary_comparator(left, right, RegexMatchOperator{});
        benchmark::DoNotOptimize(result);
    }
}

BENCHMARK(BM_regex_match)
->Args({100'000, 1'000, true})
->Args({100'000, 1'000, false})
->Args({100'000, 10'000, true})
->Args({100'000, 10'000, false});
Loading

Unchanged files with check annotations Beta