Skip to content

Commit 64e06ad

Browse files
committed
Support proper numpy integration for ~100x performance boost
1 parent 0a6cb89 commit 64e06ad

6 files changed

Lines changed: 121 additions & 16 deletions

File tree

flatdata-py/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,37 @@ Once you have [created a flatdata schema file](../README.md#creating-a-schema),
1818
flatdata-generator --gen py --schema locations.flatdata --output-file locations.py
1919
```
2020

21+
## Performance tips
22+
23+
`flatdata-py` supports two data access patterns with very different performance characteristics on large archives.
24+
25+
Iterating over a vector yields one Python object per element. Each field access unpacks bits from the underlying memory-mapped data. This is fine for accessing individual elements or small ranges, but has significant per-element overhead for bulk operations:
26+
27+
```python
28+
count = sum(1 for x in archive.links if x.speed_limit > 100)
29+
```
30+
31+
For bulk operations, use the vectorized access methods, which decode whole fields at once with NumPy and return either NumPy arrays or pandas DataFrames:
32+
33+
```python
34+
# single column access, returns a pandas DataFrame
35+
df = archive.links.speed_limit
36+
count = len(df[df['speed_limit'] > 100])
37+
38+
# full NumPy structured array with all fields
39+
arr = archive.links.to_numpy()
40+
count = int(np.sum(arr['speed_limit'] > 100))
41+
42+
# slices work too
43+
arr = archive.links[1000:2000].to_numpy()
44+
df = archive.links[::10].to_data_frame()
45+
```
46+
47+
* Use `vector.field_name` (column access) when you only need one or a few fields.
48+
* Use `vector.to_numpy()` or `vector.to_data_frame()` when you need all fields at once.
49+
* Use `vector[i].field` for random access to individual elements.
50+
* The underlying data is memory-mapped; the OS pages it from disk on demand. Vectorized results are materialized as NumPy arrays in RAM.
51+
2152
## Using the inspector
2253

2354
`flatdata-py` comes with a handy tool called the `flatdata-inspector` to inspect the contents of an archive:

flatdata-py/flatdata/lib/archive.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ def __init__(self, resource_storage):
3939
self.__getattr__(name)
4040

4141
def __getattr__(self, name):
42-
if name not in list(self._RESOURCES.keys()):
42+
if name not in self._RESOURCES:
4343
raise AttributeError("Resource %s not defined in archive." % name)
44-
if name not in list(self._loaded_resources.keys()):
44+
if name not in self._loaded_resources:
4545
self._loaded_resources[name] = self._open_resource(name)
4646
return self._loaded_resources[name]
4747

flatdata-py/flatdata/lib/data_access.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
See the LICENSE file in the root of this project for license details.
44
'''
55

6+
import numpy as np
7+
68
# Sign bits cache for the value reading.
79
_SIGN_BITS = [0] + [(1 << (bits - 1)) for bits in range(1, 65)]
810

@@ -62,3 +64,32 @@ def write_value(data, offset_bits, num_bits, is_signed, value):
6264
surrounding_bits = data[offset_bytes + byte_idx] & ~((1 << offset_bits) - 1)
6365
data[offset_bytes + byte_idx] = value_in_little_endian[byte_idx] & ((1 << (8 - (bits_written % 8))) - 1)
6466
data[offset_bytes + byte_idx] |= surrounding_bits
67+
68+
69+
def read_field_vectorized(raw_bytes_2d, field_offset_bits, field_width_bits, is_signed):
    """Read a bit-packed field from all elements at once, returning a numpy array.

    :param raw_bytes_2d: numpy uint8 array shaped (num_elements, struct_size_bytes)
    :param field_offset_bits: bit offset of the field within each element
    :param field_width_bits: width of the field in bits (0..64)
    :param is_signed: whether to sign-extend the result
    :return: numpy array of field values; dtype is int64 for signed fields
        and uint64 otherwise
    :raises ValueError: if field_width_bits is outside the range 0..64
    """
    if not 0 <= field_width_bits <= 64:
        raise ValueError(
            "field_width_bits must be in range 0..64, got %d" % field_width_bits
        )

    byte_start = field_offset_bits // 8
    bit_shift = field_offset_bits % 8
    bytes_needed = (bit_shift + field_width_bits + 7) // 8

    # Assemble the little-endian bytes into a 64-bit accumulator. Only eight
    # bytes fit; a ninth byte (unaligned field wider than 56 bits) is merged
    # separately below, because shifting a uint64 by 64 bits loses the data.
    result = np.zeros(raw_bytes_2d.shape[0], dtype=np.uint64)
    for b in range(min(bytes_needed, 8)):
        result |= raw_bytes_2d[:, byte_start + b].astype(np.uint64) << np.uint64(b * 8)
    result >>= np.uint64(bit_shift)

    if bytes_needed > 8:
        # Here bit_shift is in 1..7, so the shift amount below is in 57..63.
        # Bits of the ninth byte that fall beyond bit 63 are not part of the
        # field and are discarded by the wrapping left shift.
        high_byte = raw_bytes_2d[:, byte_start + 8].astype(np.uint64)
        result |= high_byte << np.uint64(64 - bit_shift)

    if field_width_bits < 64:
        result &= np.uint64((1 << field_width_bits) - 1)

    if is_signed:
        if field_width_bits > 0:
            # Two's complement sign extension in wrapping uint64 arithmetic:
            # (x ^ s) - s, where s is the sign bit. This stays within 64 bits
            # for every width, including 63 and 64, where 1 << width does not
            # fit into an int64.
            sign_bit = np.uint64(1 << (field_width_bits - 1))
            result = (result ^ sign_bit) - sign_bit
        result = result.view(np.int64)

    return result

flatdata-py/flatdata/lib/resources.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99
import numpy as np
1010

11-
from .data_access import read_value
11+
from .data_access import read_value, read_field_vectorized
1212
from .errors import CorruptResourceError
1313

1414
SIZE_OFFSET_IN_BITS = 64
@@ -24,6 +24,7 @@ def __init__(self, mem, element_type):
2424
self._element_type = element_type
2525
self._element_types = [element_type]
2626
self._type_size_in_bytes = self._element_type._SIZE_IN_BYTES if self._element_type else 1
27+
self._raw_numpy_2d = None
2728

2829
def size_in_bytes(self):
2930
return len(self._mem)
@@ -35,6 +36,20 @@ def _get_item(self, index):
3536
offset = self._item_offset(index)
3637
return self._element_type(self._mem, offset)
3738

39+
def _as_numpy_2d(self):
    """Return the raw data as a 2D numpy uint8 array of shape (n, struct_size).

    The array is a view onto ``self._mem`` created with ``np.frombuffer``
    using ``offset``/``count``, so no element data is copied. Slicing
    ``self._mem`` first would produce a ``bytes`` copy when it is an
    ``mmap.mmap`` object. The result is cached after the first call.

    NOTE(review): the cached view keeps the underlying buffer exported for
    as long as this object lives; confirm that nothing needs to close the
    memory map while resources are still referenced.
    """
    if self._raw_numpy_2d is None:
        n = len(self)
        struct_size = self._type_size_in_bytes
        # Skip the size prefix and expose exactly n elements.
        raw = np.frombuffer(
            self._mem,
            dtype=np.uint8,
            count=n * struct_size,
            offset=SIZE_OFFSET_IN_BYTES,
        )
        self._raw_numpy_2d = raw.reshape(n, struct_size)
    return self._raw_numpy_2d
52+
3853
def _repr_attributes(self):
3954
return {
4055
"container_type": self.__class__.__name__,
@@ -60,14 +75,19 @@ def __init__(self, s, sequence):
6075
self._sequence = sequence
6176

6277
def to_numpy(self, limit=None):
    """Convert the sliced elements to a numpy structured array (vectorized).

    :param limit: optional maximum number of elements to convert; ``None``
        converts every element selected by the slice
    :return: numpy structured array with one entry per selected element and
        one named column per field of the element type
    """
    # Apply the slice to the raw byte rows first so that only the selected
    # elements are decoded.
    sliced = self._sequence._as_numpy_2d()[self._slice]
    if limit is not None:
        sliced = sliced[:limit]

    fields = self._sequence._element_type._FIELDS
    dtype = self._sequence._element_type.dtype()
    result = np.empty(sliced.shape[0], dtype=dtype)
    for name, field in fields.items():
        result[name] = read_field_vectorized(
            sliced, field.offset, field.width, field.is_signed
        )
    return result
7292

7393
def to_data_frame(self, limit=None):
@@ -78,7 +98,10 @@ def __iter__(self):
7898
yield self._sequence[i]
7999

80100
def __getattr__(self, name):
    """Return the field ``name`` of all sliced elements as a one-column DataFrame.

    :param name: name of a field of the sequence's element type
    :return: pandas DataFrame with a single column called ``name``
    :raises AttributeError: if the element type has no field ``name``
    """
    # __getattr__ has to signal unknown names with AttributeError (not
    # KeyError) so that hasattr() and getattr(obj, name, default) keep working.
    try:
        field = self._sequence._element_type._FIELDS[name]
    except KeyError:
        raise AttributeError("Field %s not defined in element type." % name) from None
    raw_2d = self._sequence._as_numpy_2d()[self._slice]
    values = read_field_vectorized(raw_2d, field.offset, field.width, field.is_signed)
    return pd.DataFrame(data=values, columns=[name])
82105

83106
def __repr__(self):
84107
return "Displaying first 100 records:\n" + self.to_data_frame(limit=100).__repr__()
@@ -92,8 +115,20 @@ def __init__(self, mem, element_type):
92115
assert rem == 0, "Malformed vector"
93116
self._size = size
94117

118+
def to_numpy(self):
    """Decode every element of the vector into a numpy structured array.

    Each field is read for all elements at once rather than element by
    element.

    :return: numpy structured array of length ``self._size`` using the
        element type's dtype
    """
    element_type = self._element_type
    rows = self._as_numpy_2d()
    out = np.empty(self._size, dtype=element_type.dtype())
    for field_name, spec in element_type._FIELDS.items():
        column = read_field_vectorized(rows, spec.offset, spec.width, spec.is_signed)
        out[field_name] = column
    return out
129+
95130
def to_data_frame(self):
96-
return self[:].to_data_frame()
131+
return pd.DataFrame(data=self.to_numpy())
97132

98133
def __getitem__(self, index):
99134
if isinstance(index, slice):
@@ -106,11 +141,17 @@ def __getitem__(self, index):
106141
return self._get_item(index)
107142

108143
def __iter__(self):
109-
for i in range(len(self)):
110-
yield self._get_item(i)
144+
mem = self._mem
145+
element_type = self._element_type
146+
size_bytes = self._type_size_in_bytes
147+
for i in range(self._size):
148+
yield element_type(mem, SIZE_OFFSET_IN_BYTES + size_bytes * i)
111149

112150
def __getattr__(self, name):
    """Return the field ``name`` of all elements as a one-column DataFrame.

    :param name: name of a field of the vector's element type
    :return: pandas DataFrame with a single column called ``name``
    :raises AttributeError: if the element type has no field ``name``
    """
    # __getattr__ has to signal unknown names with AttributeError (not
    # KeyError) so that hasattr() and getattr(obj, name, default) keep working.
    try:
        field = self._element_type._FIELDS[name]
    except KeyError:
        raise AttributeError("Field %s not defined in element type." % name) from None
    raw_2d = self._as_numpy_2d()
    values = read_field_vectorized(raw_2d, field.offset, field.width, field.is_signed)
    return pd.DataFrame(data=values, columns=[name])
114155

115156
def __len__(self):
116157
return self._size

flatdata-py/flatdata/lib/structure.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010

1111
class Structure:
12+
__slots__ = ('_mem', '_pos')
13+
1214
def __init__(self, mem, pos):
1315
self._mem = mem
1416
self._pos = pos

flatdata-py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "flatdata-py"
7-
version = "0.4.10"
7+
version = "0.4.11"
88
description = "Python 3 implementation of Flatdata"
99
readme = "README.md"
1010
authors = [

0 commit comments

Comments
 (0)