Skip to content

Commit cc6503e

Browse files
authored
Merge pull request #62 from c-dickens/bloom-filter-bindings
Bloom filter bindings
2 parents 65e1f1d + d4b3700 commit cc6503e

File tree

4 files changed

+824
-132
lines changed

4 files changed

+824
-132
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# OSX files
1010
.DS_Store
1111

12+
# Compiled binary files
13+
*.so
14+
1215
# Python created files
1316
.eggs/
1417
.tox/

_datasketches.so

-1.29 MB
Binary file not shown.

src/bloom_filter_wrapper.cpp

Lines changed: 140 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,24 +30,153 @@ void bind_bloom_filter(nb::module_ &m, const char* name) {
3030
using namespace datasketches;
3131
using bloom_filter_type = bloom_filter_alloc<A>;
3232

33-
// Start with just one simple function
34-
m.def("create_bloom_filter",
35-
[](uint64_t max_distinct_items, double target_false_positive_prob) {
36-
return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob);
37-
},
38-
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
39-
"Creates a Bloom filter with optimal parameters for the given accuracy requirements");
40-
41-
// Bind the class with minimal methods
33+
// Bind the class; no constructor is exposed, so instances are created only via the static factory methods
4234
nb::class_<bloom_filter_type>(m, name)
4335
.def("is_empty", &bloom_filter_type::is_empty,
4436
"Returns True if the filter has seen no items, otherwise False")
37+
38+
// Update methods - efficient overloads for Python types
39+
// Non-negative integers (uint64_t)
40+
.def("update", static_cast<void (bloom_filter_type::*)(uint64_t)>(&bloom_filter_type::update),
41+
nb::arg("item"),
42+
"Updates the filter with a non-negative integer")
43+
// Negative/positive integers (int64_t)
44+
.def("update", static_cast<void (bloom_filter_type::*)(int64_t)>(&bloom_filter_type::update),
45+
nb::arg("item"),
46+
"Updates the filter with a negative/positive integer")
47+
// Float (double)
48+
.def("update", static_cast<void (bloom_filter_type::*)(double)>(&bloom_filter_type::update),
49+
nb::arg("item"),
50+
"Updates the filter with a float")
51+
// String (std::string)
4552
.def("update", static_cast<void (bloom_filter_type::*)(const std::string&)>(&bloom_filter_type::update),
4653
nb::arg("item"),
47-
"Updates the filter with the given string")
54+
"Updates the filter with a string")
55+
// Bytes object
56+
.def("update",
57+
[](bloom_filter_type& self, nb::bytes b) {
58+
self.update(b.c_str(), b.size());
59+
},
60+
nb::arg("item"),
61+
"Updates the filter with a bytes object")
62+
63+
// Query methods - efficient overloads for Python types
64+
// Non-negative integers (uint64_t)
65+
.def("query", static_cast<bool (bloom_filter_type::*)(uint64_t) const>(&bloom_filter_type::query),
66+
nb::arg("item"),
67+
"Queries the filter for a non-negative integer")
68+
// Negative/positive integers (int64_t)
69+
.def("query", static_cast<bool (bloom_filter_type::*)(int64_t) const>(&bloom_filter_type::query),
70+
nb::arg("item"),
71+
"Queries the filter for a negative/positive integer")
72+
// Float (double)
73+
.def("query", static_cast<bool (bloom_filter_type::*)(double) const>(&bloom_filter_type::query),
74+
nb::arg("item"),
75+
"Queries the filter for a float")
76+
// String (std::string)
4877
.def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query),
4978
nb::arg("item"),
50-
"Queries the filter for the given string");
79+
"Queries the filter for a string")
80+
// Bytes object
81+
.def("query",
82+
[](const bloom_filter_type& self, nb::bytes b) -> bool {
83+
return self.query(b.c_str(), b.size());
84+
},
85+
nb::arg("item"),
86+
"Queries the filter for a bytes object")
87+
88+
.def("reset", &bloom_filter_type::reset,
89+
"Resets the Bloom filter to its original empty state")
90+
.def("union_with", &bloom_filter_type::union_with,
91+
nb::arg("other"),
92+
"Performs a union operation with another Bloom filter. Both filters must have the same capacity, number of hashes, and seed.")
93+
.def("intersect", &bloom_filter_type::intersect,
94+
nb::arg("other"),
95+
"Performs an intersection operation with another Bloom filter. Both filters must have the same capacity, number of hashes, and seed.")
96+
.def("invert", &bloom_filter_type::invert,
97+
"Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.")
98+
.def("to_string", &bloom_filter_type::to_string,
99+
nb::arg("print_filter")=false,
100+
"Returns a string representation of the Bloom filter\n\n"
101+
":param print_filter: If True, includes the actual bit array in the output\n:type print_filter: bool, optional\n"
102+
":return: String representation of the filter\n:rtype: str")
103+
.def("__str__", [](const bloom_filter_type& self) { return self.to_string(false); },
104+
"Returns a string summary of the Bloom filter (without printing the bit array)")
105+
.def("__repr__", [](const bloom_filter_type& self) { return self.to_string(false); },
106+
"Returns a detailed string representation of the Bloom filter for debugging and REPL use")
107+
.def("__copy__", [](const bloom_filter_type& self) { return bloom_filter_type(self); },
108+
"Returns a copy of the Bloom filter")
109+
.def("is_compatible", &bloom_filter_type::is_compatible,
110+
nb::arg("other"),
111+
"Returns True if the other Bloom filter is compatible for union/intersection operations (same capacity, num_hashes, and seed)")
112+
.def("get_serialized_size_bytes",
113+
[](const bloom_filter_type& sk) { return sk.get_serialized_size_bytes(); },
114+
"Returns the size in bytes of the serialized image of the filter")
115+
.def("serialize",
116+
[](const bloom_filter_type& sk) {
117+
auto v = sk.serialize(); // vector_bytes (std::vector<uint8_t, Allocator>)
118+
return nb::bytes(reinterpret_cast<const char*>(v.data()), v.size());
119+
},
120+
"Serialize the filter to a cross-language compatible byte string")
121+
.def_static(
122+
"deserialize",
123+
[](const nb::bytes& bytes) {
124+
return bloom_filter_type::deserialize(bytes.c_str(), bytes.size());
125+
},
126+
nb::arg("bytes"),
127+
"Reads a bytes object and returns the corresponding bloom_filter")
128+
.def_static("suggest_num_hashes",
129+
static_cast<uint16_t (*)(uint64_t, uint64_t)>(&bloom_filter_type::builder::suggest_num_hashes),
130+
nb::arg("max_distinct_items"), nb::arg("num_filter_bits"),
131+
"Suggests the optimal number of hash functions for given target numbers of distinct items and filter size")
132+
.def_static("suggest_num_hashes_by_probability",
133+
static_cast<uint16_t (*)(double)>(&bloom_filter_type::builder::suggest_num_hashes),
134+
nb::arg("target_false_positive_prob"),
135+
"Suggests the optimal number of hash functions to achieve a target false positive probability")
136+
.def_static("suggest_num_filter_bits",
137+
&bloom_filter_type::builder::suggest_num_filter_bits,
138+
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"),
139+
"Suggests the optimal number of bits for given target numbers of distinct items and false positive probability")
140+
.def_static("create_by_accuracy",
141+
[](uint64_t max_distinct_items, double target_false_positive_prob, uint64_t seed) {
142+
return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob, seed);
143+
},
144+
nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"), nb::arg("seed")=bloom_filter_type::builder::generate_random_seed(),
145+
"Creates a Bloom filter with optimal parameters for the given accuracy requirements\n\n"
146+
":param max_distinct_items: Maximum expected number of distinct items to add to the filter\n:type max_distinct_items: int\n"
147+
":param target_false_positive_prob: Desired false positive probability per item\n:type target_false_positive_prob: float\n"
148+
":param seed: Hash seed to use (default: random)\n:type seed: int, optional"
149+
)
150+
.def_static("create_by_size",
151+
[](uint64_t num_bits, uint16_t num_hashes, uint64_t seed) {
152+
return bloom_filter_type::builder::create_by_size(num_bits, num_hashes, seed);
153+
},
154+
nb::arg("num_bits"), nb::arg("num_hashes"), nb::arg("seed")=bloom_filter_type::builder::generate_random_seed(),
155+
"Creates a Bloom filter with specified size parameters\n\n"
156+
":param num_bits: Size of the Bloom filter in bits\n:type num_bits: int\n"
157+
":param num_hashes: Number of hash functions to apply to items\n:type num_hashes: int\n"
158+
":param seed: Hash seed to use (default: random)\n:type seed: int, optional"
159+
)
160+
.def_prop_ro(
161+
"num_bits_used",
162+
&bloom_filter_type::get_bits_used,
163+
"Number of bits set to 1 in the Bloom filter"
164+
)
165+
.def_prop_ro(
166+
"capacity",
167+
&bloom_filter_type::get_capacity,
168+
"Number of bits in the Bloom filter's bit array"
169+
)
170+
.def_prop_ro(
171+
"num_hashes",
172+
&bloom_filter_type::get_num_hashes,
173+
"Number of hash functions used by this Bloom filter"
174+
)
175+
.def_prop_ro(
176+
"seed",
177+
&bloom_filter_type::get_seed,
178+
"Hash seed used by this Bloom filter"
179+
);
51180
}
52181

53182
void init_bloom_filter(nb::module_ &m) {

0 commit comments

Comments
 (0)