Skip to content

Commit ddbe959

Browse files
committed
[python] Use pyroaring.BitMap64 for better performance
1 parent 3e706fe commit ddbe959

File tree

2 files changed: +146 -57 lines changed

.github/workflows/paimon-python-checks.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,7 @@ jobs:
135135
else
136136
python -m pip install --upgrade pip
137137
pip install torch --index-url https://download.pytorch.org/whl/cpu
138-
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 faiss-cpu==1.7.4
138+
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 pyroaring==1.0.3 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 faiss-cpu==1.7.4
139139
fi
140140
df -h
141141
- name: Run lint-python.sh
@@ -170,7 +170,7 @@ jobs:
170170
run: |
171171
python -m pip install --upgrade pip
172172
pip install torch --index-url https://download.pytorch.org/whl/cpu
173-
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
173+
python -m pip install pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 pyroaring==1.0.3 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
174174
- name: Run lint-python.sh
175175
shell: bash
176176
run: |
@@ -266,7 +266,7 @@ jobs:
266266
pip install torch --index-url https://download.pytorch.org/whl/cpu
267267
python -m pip install --no-cache-dir \
268268
pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 \
269-
fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 \
269+
fastavro==1.11.1 pyarrow==16.0.0 pyroaring==1.0.3 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 \
270270
numpy==1.24.3 pandas==2.0.3 cramjam pytest~=7.0 py4j==0.10.9.9 requests \
271271
parameterized==0.9.0 packaging
272272
- name: Test Ray version compatibility

paimon-python/pypaimon/globalindex/roaring_bitmap.py

Lines changed: 143 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -20,61 +20,98 @@
2020
Roaring Bitmap.
2121
"""
2222

23-
from typing import Iterator, Set
24-
import struct
23+
from typing import Iterator
24+
25+
try:
26+
from pyroaring import BitMap64
27+
PYROARING_AVAILABLE = True
28+
except ImportError:
29+
PYROARING_AVAILABLE = False
2530

2631

2732
class RoaringBitmap64:
2833
"""
2934
A 64-bit roaring bitmap implementation.
35+
3036
This class provides efficient storage and operations for sets of 64-bit integers.
31-
It uses a set-based implementation for simplicity, which can be replaced with
32-
a more efficient roaring bitmap library if needed.
37+
If pyroaring is available, it uses BitMap64 for better performance and memory efficiency.
38+
Otherwise, it falls back to a set-based implementation.
3339
"""
3440

3541
def __init__(self):
36-
self._data: Set[int] = set()
42+
if PYROARING_AVAILABLE:
43+
self._data = BitMap64()
44+
else:
45+
self._data = set()
46+
self._fallback = True
3747

3848
def add(self, value: int) -> None:
3949
"""Add a single value to the bitmap."""
40-
self._data.add(value)
50+
if PYROARING_AVAILABLE:
51+
self._data.add(value)
52+
else:
53+
self._data.add(value)
4154

4255
def add_range(self, from_: int, to: int) -> None:
4356
"""Add a range of values [from_, to] to the bitmap."""
44-
for i in range(from_, to + 1):
45-
self._data.add(i)
57+
if PYROARING_AVAILABLE:
58+
self._data.add_range(from_, to + 1)
59+
else:
60+
for i in range(from_, to + 1):
61+
self._data.add(i)
4662

4763
def contains(self, value: int) -> bool:
4864
"""Check if the bitmap contains the given value."""
49-
return value in self._data
65+
if PYROARING_AVAILABLE:
66+
return value in self._data
67+
else:
68+
return value in self._data
5069

5170
def is_empty(self) -> bool:
5271
"""Check if the bitmap is empty."""
53-
return len(self._data) == 0
72+
if PYROARING_AVAILABLE:
73+
return len(self._data) == 0
74+
else:
75+
return len(self._data) == 0
5476

5577
def cardinality(self) -> int:
5678
"""Return the number of elements in the bitmap."""
57-
return len(self._data)
79+
if PYROARING_AVAILABLE:
80+
return len(self._data)
81+
else:
82+
return len(self._data)
5883

5984
def __iter__(self) -> Iterator[int]:
6085
"""Iterate over all values in the bitmap in sorted order."""
61-
return iter(sorted(self._data))
86+
if PYROARING_AVAILABLE:
87+
return iter(self._data)
88+
else:
89+
return iter(sorted(self._data))
6290

6391
def __len__(self) -> int:
6492
"""Return the number of elements in the bitmap."""
65-
return len(self._data)
93+
if PYROARING_AVAILABLE:
94+
return len(self._data)
95+
else:
96+
return len(self._data)
6697

6798
def __contains__(self, value: int) -> bool:
6899
"""Check if the bitmap contains the given value."""
69100
return self.contains(value)
70101

71102
def clear(self) -> None:
72103
"""Clear all values from the bitmap."""
73-
self._data.clear()
104+
if PYROARING_AVAILABLE:
105+
self._data.clear()
106+
else:
107+
self._data.clear()
74108

75109
def to_list(self) -> list:
76110
"""Return a sorted list of all values in the bitmap."""
77-
return sorted(self._data)
111+
if PYROARING_AVAILABLE:
112+
return list(self._data)
113+
else:
114+
return sorted(self._data)
78115

79116
def to_range_list(self) -> list:
80117
"""
@@ -85,76 +122,128 @@ def to_range_list(self) -> list:
85122
if self.is_empty():
86123
return []
87124

88-
sorted_values = sorted(self._data)
89-
ranges = []
90-
start = sorted_values[0]
91-
end = start
92-
93-
for i in range(1, len(sorted_values)):
94-
if sorted_values[i] == end + 1:
95-
# Consecutive, extend the range
96-
end = sorted_values[i]
97-
else:
98-
# Gap, close current range and start new one
99-
ranges.append(Range(start, end))
100-
start = sorted_values[i]
101-
end = start
102-
103-
# Add the last range
104-
ranges.append(Range(start, end))
105-
106-
return ranges
125+
if PYROARING_AVAILABLE:
126+
# BitMap64 iterates in ascending order, so no explicit sort is needed
127+
ranges = []
128+
sorted_values = list(self._data)
129+
start = sorted_values[0]
130+
end = start
131+
132+
for i in range(1, len(sorted_values)):
133+
if sorted_values[i] == end + 1:
134+
# Consecutive, extend the range
135+
end = sorted_values[i]
136+
else:
137+
# Gap, close current range and start new one
138+
ranges.append(Range(start, end))
139+
start = sorted_values[i]
140+
end = start
141+
142+
# Add the last range
143+
ranges.append(Range(start, end))
144+
145+
return ranges
146+
else:
147+
# Fallback implementation
148+
sorted_values = sorted(self._data)
149+
ranges = []
150+
start = sorted_values[0]
151+
end = start
152+
153+
for i in range(1, len(sorted_values)):
154+
if sorted_values[i] == end + 1:
155+
# Consecutive, extend the range
156+
end = sorted_values[i]
157+
else:
158+
# Gap, close current range and start new one
159+
ranges.append(Range(start, end))
160+
start = sorted_values[i]
161+
end = start
162+
163+
# Add the last range
164+
ranges.append(Range(start, end))
165+
166+
return ranges
107167

108168
@staticmethod
109169
def and_(a: 'RoaringBitmap64', b: 'RoaringBitmap64') -> 'RoaringBitmap64':
110170
"""Return the intersection of two bitmaps."""
111171
result = RoaringBitmap64()
112-
result._data = a._data & b._data
172+
if PYROARING_AVAILABLE:
173+
result._data = a._data & b._data
174+
else:
175+
result._data = a._data & b._data
113176
return result
114177

115178
@staticmethod
116179
def or_(a: 'RoaringBitmap64', b: 'RoaringBitmap64') -> 'RoaringBitmap64':
117180
"""Return the union of two bitmaps."""
118181
result = RoaringBitmap64()
119-
result._data = a._data | b._data
182+
if PYROARING_AVAILABLE:
183+
result._data = a._data | b._data
184+
else:
185+
result._data = a._data | b._data
120186
return result
121187

122188
@staticmethod
123189
def remove_all(a: 'RoaringBitmap64', b: 'RoaringBitmap64') -> 'RoaringBitmap64':
124190
result = RoaringBitmap64()
125-
result._data = a._data - b._data
191+
if PYROARING_AVAILABLE:
192+
result._data = a._data - b._data
193+
else:
194+
result._data = a._data - b._data
126195
return result
127196

128197
def serialize(self) -> bytes:
129198
"""Serialize the bitmap to bytes."""
130-
# Simple serialization format: count followed by sorted values
131-
values = sorted(self._data)
132-
data = struct.pack('>Q', len(values)) # 8-byte count
133-
for v in values:
134-
data += struct.pack('>q', v) # 8-byte signed value
135-
return data
199+
if PYROARING_AVAILABLE:
200+
return self._data.serialize()
201+
else:
202+
# Simple serialization format: count followed by sorted values
203+
import struct
204+
values = sorted(self._data)
205+
data = struct.pack('>Q', len(values)) # 8-byte count
206+
for v in values:
207+
data += struct.pack('>q', v) # 8-byte signed value
208+
return data
136209

137210
@staticmethod
138211
def deserialize(data: bytes) -> 'RoaringBitmap64':
139212
"""Deserialize a bitmap from bytes."""
140213
result = RoaringBitmap64()
141-
count = struct.unpack('>Q', data[:8])[0]
142-
offset = 8
143-
for _ in range(count):
144-
value = struct.unpack('>q', data[offset:offset + 8])[0]
145-
result.add(value)
146-
offset += 8
214+
if PYROARING_AVAILABLE:
215+
result._data = BitMap64.deserialize(data)
216+
else:
217+
import struct
218+
count = struct.unpack('>Q', data[:8])[0]
219+
offset = 8
220+
for _ in range(count):
221+
value = struct.unpack('>q', data[offset:offset + 8])[0]
222+
result.add(value)
223+
offset += 8
147224
return result
148225

149226
def __eq__(self, other: object) -> bool:
150227
if not isinstance(other, RoaringBitmap64):
151228
return False
152-
return self._data == other._data
229+
if PYROARING_AVAILABLE:
230+
return self._data == other._data
231+
else:
232+
return self._data == other._data
153233

154234
def __hash__(self) -> int:
155-
return hash(frozenset(self._data))
235+
if PYROARING_AVAILABLE:
236+
return hash(tuple(sorted(self._data)))
237+
else:
238+
return hash(frozenset(self._data))
156239

157240
def __repr__(self) -> str:
158-
if len(self._data) <= 10:
159-
return f"RoaringBitmap64({sorted(self._data)})"
160-
return f"RoaringBitmap64({len(self._data)} elements)"
241+
if PYROARING_AVAILABLE:
242+
values = list(self._data)
243+
if len(values) <= 10:
244+
return f"RoaringBitmap64({values})"
245+
return f"RoaringBitmap64({len(values)} elements)"
246+
else:
247+
if len(self._data) <= 10:
248+
return f"RoaringBitmap64({sorted(self._data)})"
249+
return f"RoaringBitmap64({len(self._data)} elements)"

0 commit comments

Comments
 (0)