Skip to content

Commit 8005e55

Browse files
committed
[python] Use pyroaring.BitMap64 for better performance
1 parent 3e706fe commit 8005e55

File tree

1 file changed

+17
-26
lines changed

1 file changed

+17
-26
lines changed

paimon-python/pypaimon/globalindex/roaring_bitmap.py

Lines changed: 17 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -20,29 +20,28 @@
2020
Roaring Bitmap.
2121
"""
2222

23-
from typing import Iterator, Set
24-
import struct
23+
from typing import Iterator
24+
from pyroaring import BitMap64
2525

2626

2727
class RoaringBitmap64:
2828
"""
2929
A 64-bit roaring bitmap implementation.
30+
3031
This class provides efficient storage and operations for sets of 64-bit integers.
31-
It uses a set-based implementation for simplicity, which can be replaced with
32-
a more efficient roaring bitmap library if needed.
32+
It uses pyroaring.BitMap64 for better performance and memory efficiency.
3333
"""
3434

3535
def __init__(self):
36-
self._data: Set[int] = set()
36+
self._data = BitMap64()
3737

3838
def add(self, value: int) -> None:
3939
"""Add a single value to the bitmap."""
4040
self._data.add(value)
4141

4242
def add_range(self, from_: int, to: int) -> None:
4343
"""Add a range of values [from_, to] to the bitmap."""
44-
for i in range(from_, to + 1):
45-
self._data.add(i)
44+
self._data.add_range(from_, to + 1)
4645

4746
def contains(self, value: int) -> bool:
4847
"""Check if the bitmap contains the given value."""
@@ -58,7 +57,7 @@ def cardinality(self) -> int:
5857

5958
def __iter__(self) -> Iterator[int]:
6059
"""Iterate over all values in the bitmap in sorted order."""
61-
return iter(sorted(self._data))
60+
return iter(self._data)
6261

6362
def __len__(self) -> int:
6463
"""Return the number of elements in the bitmap."""
@@ -74,7 +73,7 @@ def clear(self) -> None:
7473

7574
def to_list(self) -> list:
7675
"""Return a sorted list of all values in the bitmap."""
77-
return sorted(self._data)
76+
return list(self._data)
7877

7978
def to_range_list(self) -> list:
8079
"""
@@ -85,8 +84,9 @@ def to_range_list(self) -> list:
8584
if self.is_empty():
8685
return []
8786

88-
sorted_values = sorted(self._data)
87+
# Use pyroaring's efficient iteration
8988
ranges = []
89+
sorted_values = list(self._data)
9090
start = sorted_values[0]
9191
end = start
9292

@@ -127,23 +127,13 @@ def remove_all(a: 'RoaringBitmap64', b: 'RoaringBitmap64') -> 'RoaringBitmap64':
127127

128128
def serialize(self) -> bytes:
129129
"""Serialize the bitmap to bytes."""
130-
# Simple serialization format: count followed by sorted values
131-
values = sorted(self._data)
132-
data = struct.pack('>Q', len(values)) # 8-byte count
133-
for v in values:
134-
data += struct.pack('>q', v) # 8-byte signed value
135-
return data
130+
return self._data.serialize()
136131

137132
@staticmethod
138133
def deserialize(data: bytes) -> 'RoaringBitmap64':
139134
"""Deserialize a bitmap from bytes."""
140135
result = RoaringBitmap64()
141-
count = struct.unpack('>Q', data[:8])[0]
142-
offset = 8
143-
for _ in range(count):
144-
value = struct.unpack('>q', data[offset:offset + 8])[0]
145-
result.add(value)
146-
offset += 8
136+
result._data = BitMap64.deserialize(data)
147137
return result
148138

149139
def __eq__(self, other: object) -> bool:
@@ -152,9 +142,10 @@ def __eq__(self, other: object) -> bool:
152142
return self._data == other._data
153143

154144
def __hash__(self) -> int:
155-
return hash(frozenset(self._data))
145+
return hash(tuple(sorted(self._data)))
156146

157147
def __repr__(self) -> str:
158-
if len(self._data) <= 10:
159-
return f"RoaringBitmap64({sorted(self._data)})"
160-
return f"RoaringBitmap64({len(self._data)} elements)"
148+
values = list(self._data)
149+
if len(values) <= 10:
150+
return f"RoaringBitmap64({values})"
151+
return f"RoaringBitmap64({len(values)} elements)"

0 commit comments

Comments
 (0)