Skip to content

Commit 70b9050

Browse files
authored
Support proper numpy integration for ~100x performance boost (#259)
# flatdata-py performance: vectorized access and scalar optimization

## What

Adds NumPy-based vectorized field access to flatdata-py and optimizes the scalar (element-by-element) read path. Also fixes a pre-existing bug in `read_value()` for unaligned 64-bit fields.

## Changes

### Vectorized access (`data_access.py`, `resources.py`)

- `read_field_vectorized()`: reads a bit-packed field from all vector elements at once via NumPy, returning an `ndarray`. Zero-copy over the mmap'd buffer.
- `Vector.__getattr__("field")` returns a DataFrame column for the field.
- `Vector.to_numpy()` / `to_data_frame()` return all fields at once.
- `_VectorSlice` gets the same vectorized methods.
- Results are cached per vector instance via `_as_numpy_2d()`.

### Pre-computed field readers (`data_access.py`, `structure.py`)

- `make_field_reader(offset, width, signed)` builds a specialized closure with all constants (byte offset, bit shift, mask, sign handling) pre-computed. Six variants cover the cross-product of field types.
- `Structure.__init_subclass__` builds a `_READERS` dict once per class.
- `__getattr__`, `as_dict`, `as_list`, `as_tuple`, `as_nparray` all use `_READERS`.
- `read_value()` is preserved as a thin wrapper around `make_field_reader` for one-off reads.

### Bug fix (`data_access.py`)

- `read_value()` for 64-bit fields at non-byte-aligned offsets could return values wider than 64 bits (Python arbitrary-precision ints). The bit mask was only applied when `num_bits < 64`, missing the case where `offset_extra_bits > 0`. Fixed by masking when `num_bits < 64 or offset_extra_bits > 0`.

### Other

- `__slots__ = ()` added to generated Structure subclasses (generator template + 10 golden files). Reduces instance size from 72 to 48 bytes.
- `Vector.__iter__` uses local variable caching to avoid repeated attribute lookups.
- Removed unnecessary `list()` on dict keys in `Archive.__getattr__`.
- Performance tips section added to `flatdata-py/README.md`.
- Version bump: flatdata-generator and flatdata-py both 0.4.10 → 0.4.11.
- CI workflow updated to install local generator before flatdata-py (`py.yml`).

## Performance

Measured on a vector from a test archive (5.8M elements, 20 fields, 32 bytes each):

| Access pattern | Before | After |
|---|---|---|
| Scalar iteration (1 field) | 9.7s | 5.8s |
| Vectorized column access (1 field) | n/a | 0.07s |

---------

Signed-off-by: Christian Vetter <christian.vetter@here.com>
1 parent 0a6cb89 commit 70b9050

21 files changed

Lines changed: 436 additions & 24 deletions

File tree

.github/workflows/py.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ jobs:
1919
- name: Run tests
2020
run: |
2121
cd flatdata-py
22-
uv run --with pytest --with ../flatdata-generator pytest -v
23-
pip install .
24-
flatdata-inspector --help
22+
uv venv
23+
uv pip install ../flatdata-generator
24+
uv pip install ".[inspector]" pytest
25+
.venv/bin/pytest -v
26+
.venv/bin/flatdata-inspector --help
2527

flatdata-generator/flatdata/generator/templates/py/python.jinja2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import flatdata.lib as flatdata
1010
{{ struct.doc|to_python_doc}}
1111
class {{ tree.namespace_path(struct, "_") }}_{{ struct.name }}(flatdata.structure.Structure):
1212
"""{{ struct.doc|safe_py_string_line }}"""
13+
__slots__ = ()
1314
_SCHEMA = """{{ tree.schema(struct) }}"""
1415
_NAME = "{{ tree.namespace_path(struct, "_") }}_{{ struct.name }}"
1516
_SIZE_IN_BITS = {{ struct.size_in_bits }}

flatdata-generator/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "flatdata-generator"
7-
version = "0.4.10"
7+
version = "0.4.11"
88
description = "Generate source code for C++, Rust, Go or Python from a Flatdata schema file"
99
readme = "README.md"
1010
authors = [

flatdata-generator/tests/generators/py_expectations/archives/multivector.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class n_S(flatdata.structure.Structure):
22
""""""
3+
__slots__ = ()
34
_SCHEMA = """namespace n {
45
struct S
56
{
@@ -20,6 +21,7 @@ class n_S(flatdata.structure.Structure):
2021

2122
class n_T(flatdata.structure.Structure):
2223
""""""
24+
__slots__ = ()
2325
_SCHEMA = """namespace n {
2426
struct T
2527
{
@@ -40,6 +42,7 @@ class n_T(flatdata.structure.Structure):
4042
# Builtin type to for MultiVector index
4143
class n__builtin_multivector_IndexType8(flatdata.structure.Structure):
4244
"""/** Builtin type to for MultiVector index */"""
45+
__slots__ = ()
4346
_SCHEMA = """"""
4447
_NAME = "n__builtin_multivector_IndexType8"
4548
_SIZE_IN_BITS = 8
@@ -53,6 +56,7 @@ class n__builtin_multivector_IndexType8(flatdata.structure.Structure):
5356
# Builtin type to for MultiVector index
5457
class n__builtin_multivector_IndexType16(flatdata.structure.Structure):
5558
"""/** Builtin type to for MultiVector index */"""
59+
__slots__ = ()
5660
_SCHEMA = """"""
5761
_NAME = "n__builtin_multivector_IndexType16"
5862
_SIZE_IN_BITS = 16
@@ -66,6 +70,7 @@ class n__builtin_multivector_IndexType16(flatdata.structure.Structure):
6670
# Builtin type to for MultiVector index
6771
class n__builtin_multivector_IndexType64(flatdata.structure.Structure):
6872
"""/** Builtin type to for MultiVector index */"""
73+
__slots__ = ()
6974
_SCHEMA = """"""
7075
_NAME = "n__builtin_multivector_IndexType64"
7176
_SIZE_IN_BITS = 64

flatdata-generator/tests/generators/py_expectations/archives/namespaces.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class n_S(flatdata.structure.Structure):
22
""""""
3+
__slots__ = ()
34
_SCHEMA = """namespace n {
45
struct S
56
{
@@ -92,6 +93,7 @@ def __init__(self, resource_storage):
9293

9394
class m_S(flatdata.structure.Structure):
9495
""""""
96+
__slots__ = ()
9597
_SCHEMA = """namespace m {
9698
struct S
9799
{
@@ -184,6 +186,7 @@ def __init__(self, resource_storage):
184186
# Builtin type to for MultiVector index
185187
class a__builtin_multivector_IndexType32(flatdata.structure.Structure):
186188
"""/** Builtin type to for MultiVector index */"""
189+
__slots__ = ()
187190
_SCHEMA = """"""
188191
_NAME = "a__builtin_multivector_IndexType32"
189192
_SIZE_IN_BITS = 32

flatdata-generator/tests/generators/py_expectations/archives/ranges.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class n_S(flatdata.structure.Structure):
22
""""""
3+
__slots__ = ()
34
_SCHEMA = """namespace n {
45
struct S
56
{

flatdata-generator/tests/generators/py_expectations/archives/struct.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class n_S(flatdata.structure.Structure):
22
""""""
3+
__slots__ = ()
34
_SCHEMA = """namespace n {
45
struct S
56
{

flatdata-generator/tests/generators/py_expectations/archives/vector.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
class n_S(flatdata.structure.Structure):
22
""""""
3+
__slots__ = ()
34
_SCHEMA = """namespace n {
45
struct S
56
{
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# This is a comment about Foo
22
class n_Foo(flatdata.structure.Structure):
33
"""// This is a comment about Foo"""
4+
__slots__ = ()
45
_SCHEMA = """namespace n {
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# This is a comment about Foo
22
class n_Foo(flatdata.structure.Structure):
3-
"""// This is a comment about Foo"""
3+
"""// This is a comment about Foo"""
4+
__slots__ = ()

0 commit comments

Comments
 (0)