Skip to content

Commit bbd1ba5

Browse files
committed
Phase 7
Signed-off-by: Christian Vetter <christian.vetter@here.com>
1 parent 8098e3b commit bbd1ba5

4 files changed

Lines changed: 76 additions & 32 deletions

File tree

flatdata-py/flatdata/lib/archive.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,26 @@
33
See the LICENSE file in the root of this project for license details.
44
'''
55

6-
from collections import namedtuple
7-
from typing import Any
6+
from __future__ import annotations
7+
8+
from typing import Any, NamedTuple, TYPE_CHECKING
89

910
import pandas as pd
1011

1112
from .errors import MissingResourceError, SchemaMismatchError
1213

13-
ResourceSignature = namedtuple("ResourceSignature",
14-
["container", "initializer", "schema", "is_optional", "doc"])
14+
if TYPE_CHECKING:
15+
from .resources import ReadStorage, ResourceBase
16+
17+
18+
class ResourceSignature(NamedTuple):
19+
container: type[ResourceBase] | type[Archive]
20+
initializer: Any
21+
schema: str
22+
is_optional: bool
23+
doc: str
1524

16-
def _is_archive_signature(resource_signature: Any) -> bool:
25+
def _is_archive_signature(resource_signature: ResourceSignature) -> bool:
1726
return bool(resource_signature.container == Archive)
1827

1928
_SCHEMA_EXT = ".schema"
@@ -26,16 +35,16 @@ class Archive:
2635
"""
2736
_NAME: str
2837
_SCHEMA: str
29-
_RESOURCES: dict[str, Any]
38+
_RESOURCES: dict[str, ResourceSignature]
3039

31-
def __init__(self, resource_storage: Any) -> None:
40+
def __init__(self, resource_storage: ReadStorage) -> None:
3241
"""
3342
Opens archive from a given resource storage.
3443
:raises flatdata.errors.CorruptArchiveError
3544
:raises flatdata.errors.SchemaMismatchError
3645
:param resource_storage: Resource storage to use.
3746
"""
38-
self._resource_storage: Any = resource_storage
47+
self._resource_storage: ReadStorage = resource_storage
3948
self._loaded_resources: dict[str, Any] = {}
4049

4150
# Preload resources and check their schemas
@@ -78,7 +87,7 @@ def resource_schema(cls, resource: str) -> str:
7887
return str(cls._RESOURCES[resource].schema)
7988

8089
@classmethod
81-
def open(cls, storage: Any, name: str, initializer: Any, is_optional: bool = False) -> Any:
90+
def open(cls, storage: ReadStorage, name: str, initializer: type[Archive], is_optional: bool = False) -> Archive | None:
8291
nested_storage = storage.get(name, is_optional)
8392
assert nested_storage is not None or is_optional
8493
if nested_storage is None:
@@ -93,7 +102,7 @@ def size_in_bytes(self) -> int:
93102
def __len__(self) -> int:
94103
return len(self._RESOURCES)
95104

96-
def _schema_validated_resource_signature(self, name: str) -> Any:
105+
def _schema_validated_resource_signature(self, name: str) -> ResourceSignature | None:
97106
resource_signature = self._RESOURCES[name]
98107
# We check only schema for non-subarchives, since the subarchives schema is checked,
99108
# when it is initialized.
@@ -120,7 +129,7 @@ def _open_resource(self, name: str) -> Any:
120129
return None
121130

122131
@staticmethod
123-
def _check_non_subarchive_schema(name: str, resource_signature: Any, storage: Any) -> None:
132+
def _check_non_subarchive_schema(name: str, resource_signature: ResourceSignature, storage: Any) -> None:
124133
actual_schema = bytes(storage).decode()
125134
if actual_schema != resource_signature.schema:
126135
raise SchemaMismatchError(

flatdata-py/flatdata/lib/archive_builder.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,43 @@
33
See the LICENSE file in the root of this project for license details.
44
'''
55

6-
from collections import namedtuple
6+
from __future__ import annotations
7+
78
import os
8-
from typing import Any
9+
from typing import Any, NamedTuple, Protocol, TYPE_CHECKING
910

1011
from .errors import IndexWriterError, MissingFieldError, UnknownFieldError, \
1112
UnknownStructureError, UnknownResourceError, ResourceAlreadySetError
1213

1314
from .resources import Instance, Vector, Multivector, RawData
1415
from .data_access import write_value
1516

17+
if TYPE_CHECKING:
18+
from .resource_storage import _Resource
19+
from .structure import Structure
20+
1621
_SCHEMA_EXT = ".schema"
1722

18-
ResourceSignature = namedtuple("ResourceSignature",
19-
["container", "initializer", "schema", "is_optional", "doc"])
23+
24+
class ResourceSignature(NamedTuple):
25+
container: type
26+
initializer: Any
27+
schema: str
28+
is_optional: bool
29+
doc: str
30+
31+
32+
class WriteStorage(Protocol):
33+
def get(self, resource_name: str, is_subarchive: bool = False) -> Any: ...
34+
def close(self) -> None: ...
2035

2136

2237
class IndexWriter:
2338
"""
2439
IndexWriter class. Only applicable when multivector is present in archive schema.
2540
"""
2641

27-
def __init__(self, name: str, size: int, resource_storage: Any) -> None:
42+
def __init__(self, name: str, size: int, resource_storage: WriteStorage) -> None:
2843
"""
2944
Create IndexWriter class.
3045
@@ -37,7 +52,7 @@ def __init__(self, name: str, size: int, resource_storage: Any) -> None:
3752

3853
self._name = name
3954
self._index_size = size
40-
self._fout = resource_storage.get(f'{self._name}_index', False)
55+
self._fout: _Resource = resource_storage.get(f'{self._name}_index', False)
4156

4257
def add(self, index: int) -> None:
4358
"""
@@ -63,16 +78,16 @@ class ArchiveBuilder:
6378
"""
6479
_NAME: str
6580
_SCHEMA: str
66-
_RESOURCES: dict[str, Any]
81+
_RESOURCES: dict[str, ResourceSignature]
6782

68-
def __init__(self, resource_storage: Any, path: str = "") -> None:
83+
def __init__(self, resource_storage: WriteStorage, path: str = "") -> None:
6984
"""
7085
Opens archive from a given resource writer.
7186
:param resource_storage: storage manager to store and write to disc
7287
:param path: file path where archive is created
7388
"""
7489
self._path = os.path.join(path, self._NAME)
75-
self._resource_storage = resource_storage
90+
self._resource_storage: WriteStorage = resource_storage
7691
self._write_archive_signature()
7792
self._write_archive_schema()
7893
self._resources_written = [f"{self._NAME}.archive"]
@@ -129,7 +144,7 @@ def subarchive(self, name: str) -> 'ArchiveBuilder':
129144
raise NotImplementedError(f"subarchive '{name}' is not implemented")
130145

131146
@classmethod
132-
def __validate_structure_fields(cls, name: str, struct: dict[str, Any], initializer: Any) -> None:
147+
def __validate_structure_fields(cls, name: str, struct: dict[str, Any], initializer: type[Structure]) -> None:
133148
'''
134149
Validates whether passed object has all required fields
135150
@@ -146,7 +161,7 @@ def __validate_structure_fields(cls, name: str, struct: dict[str, Any], initiali
146161
if key not in initializer._FIELD_KEYS:
147162
raise UnknownFieldError(key, name)
148163

149-
def __set_instance(self, storage: Any, name: str, value: dict[str, Any]) -> None:
164+
def __set_instance(self, storage: _Resource, name: str, value: dict[str, Any]) -> None:
150165
'''
151166
Creates and writes instance type resource
152167
@@ -164,7 +179,7 @@ def __set_instance(self, storage: Any, name: str, value: dict[str, Any]) -> None
164179

165180
storage.write(bout)
166181

167-
def __set_vector(self, storage: Any, name: str, vector: list[dict[str, Any]]) -> None:
182+
def __set_vector(self, storage: _Resource, name: str, vector: list[dict[str, Any]]) -> None:
168183
'''
169184
Creates and writes vector resource
170185
@@ -183,7 +198,7 @@ def __set_vector(self, storage: Any, name: str, vector: list[dict[str, Any]]) ->
183198
field.is_signed, value[key])
184199
storage.write(bout)
185200

186-
def __set_multivector(self, storage: Any, name: str, value: list[list[dict[str, Any]]]) -> None:
201+
def __set_multivector(self, storage: _Resource, name: str, value: list[list[dict[str, Any]]]) -> None:
187202
'''
188203
Creates and writes multivector resource
189204

flatdata-py/flatdata/lib/resources.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,32 @@
33
See the LICENSE file in the root of this project for license details.
44
'''
55

6+
from __future__ import annotations
7+
68
from collections.abc import Iterator
79
import json
8-
from typing import Any
10+
from typing import Any, Protocol, TYPE_CHECKING
911

1012
import pandas as pd
1113
import numpy as np
1214

13-
from .data_access import read_value, read_field_vectorized
15+
from .data_access import ReadableBuffer, read_value, read_field_vectorized
1416
from .errors import CorruptResourceError
1517

18+
if TYPE_CHECKING:
19+
from .structure import Structure
20+
1621
SIZE_OFFSET_IN_BITS = 64
1722
SIZE_OFFSET_IN_BYTES = SIZE_OFFSET_IN_BITS // 8
1823
SIZE_PADDING_IN_BYTES = 8
1924

2025

26+
class ReadStorage(Protocol):
27+
def get(self, key: str, is_optional: bool = False) -> Any: ...
28+
29+
2130
class ResourceBase:
22-
def __init__(self, mem: Any, element_type: type[Any]) -> None:
31+
def __init__(self, mem: ReadableBuffer, element_type: type[Structure] | None) -> None:
2332
if len(mem) < (SIZE_OFFSET_IN_BYTES + SIZE_PADDING_IN_BYTES):
2433
raise CorruptResourceError()
2534
self._mem = memoryview(mem)
@@ -32,9 +41,11 @@ def size_in_bytes(self) -> int:
3241
return len(self._mem)
3342

3443
def _item_offset(self, index: int) -> int:
44+
assert self._element_type is not None
3545
return int(SIZE_OFFSET_IN_BYTES + self._element_type._SIZE_IN_BYTES * index)
3646

3747
def _get_item(self, index: int) -> Any:
48+
assert self._element_type is not None
3849
offset = self._item_offset(index)
3950
return self._element_type(self._mem, offset)
4051

@@ -67,7 +78,7 @@ def __repr__(self) -> str:
6778
return json.dumps(self._repr_attributes(), indent=4)
6879

6980
@classmethod
70-
def open(cls, storage: Any, name: str, initializer: Any, is_optional: bool = False) -> Any:
81+
def open(cls, storage: ReadStorage, name: str, initializer: Any, is_optional: bool = False) -> Any:
7182
return cls(storage.get(name, is_optional), initializer)
7283

7384

@@ -82,6 +93,7 @@ def to_numpy(self, limit: int | None = None) -> Any:
8293
if limit is not None:
8394
sliced = sliced[:limit]
8495

96+
assert self._sequence._element_type is not None
8597
fields = self._sequence._element_type._FIELDS
8698
dtype = self._sequence._element_type.dtype()
8799
result = np.empty(sliced.shape[0], dtype=dtype)
@@ -99,6 +111,7 @@ def __iter__(self) -> Iterator[Any]:
99111
yield self._sequence[i]
100112

101113
def __getattr__(self, name: str) -> pd.DataFrame:
114+
assert self._sequence._element_type is not None
102115
try:
103116
field = self._sequence._element_type._FIELDS[name]
104117
except KeyError:
@@ -112,7 +125,7 @@ def __repr__(self) -> str:
112125

113126

114127
class Vector(ResourceBase):
115-
def __init__(self, mem: Any, element_type: type[Any]) -> None:
128+
def __init__(self, mem: ReadableBuffer, element_type: type[Structure]) -> None:
116129
ResourceBase.__init__(self, mem, element_type)
117130
size_in_bytes = read_value(self._mem, 0, SIZE_OFFSET_IN_BITS, False)
118131
size, rem = divmod(size_in_bytes, self._type_size_in_bytes)
@@ -122,6 +135,7 @@ def __init__(self, mem: Any, element_type: type[Any]) -> None:
122135
def to_numpy(self) -> Any:
123136
"""Convert entire vector to a numpy structured array (vectorized)."""
124137
raw_2d = self._as_numpy_2d()
138+
assert self._element_type is not None
125139
fields = self._element_type._FIELDS
126140
dtype = self._element_type.dtype()
127141
result = np.empty(self._size, dtype=dtype)
@@ -147,11 +161,13 @@ def __getitem__(self, index: int | slice) -> Any:
147161
def __iter__(self) -> Iterator[Any]:
148162
mem = self._mem
149163
element_type = self._element_type
164+
assert element_type is not None
150165
size_bytes = self._type_size_in_bytes
151166
for i in range(self._size):
152167
yield element_type(mem, SIZE_OFFSET_IN_BYTES + size_bytes * i)
153168

154169
def __getattr__(self, name: str) -> pd.DataFrame:
170+
assert self._element_type is not None
155171
try:
156172
field = self._element_type._FIELDS[name]
157173
except KeyError:
@@ -178,14 +194,14 @@ def __repr__(self) -> str:
178194

179195

180196
class Multivector(ResourceBase):
181-
def __init__(self, index_mem: Any, mem: Any, index_type: type[Any], *element_types: type[Any]) -> None:
197+
def __init__(self, index_mem: ReadableBuffer, mem: ReadableBuffer, index_type: type[Structure], *element_types: type[Structure]) -> None:
182198
self._index = Vector(index_mem, index_type)
183-
self._mem = mem
199+
self._mem = memoryview(mem)
184200
self._element_types = list(element_types)
185201
self._index_type = index_type
186202

187203
@classmethod
188-
def open(cls, storage: Any, name: str, initializer: Any, is_optional: bool = False) -> 'Multivector':
204+
def open(cls, storage: ReadStorage, name: str, initializer: list[type[Structure]], is_optional: bool = False) -> Multivector:
189205
return cls(storage.get(name + "_index", is_optional),
190206
storage.get(name, is_optional),
191207
*initializer)
@@ -208,6 +224,7 @@ def __getitem__(self, index: int | slice) -> Any:
208224
type_index = read_value(self._mem, offset * 8, 8, False)
209225
offset += 1
210226
element_type = self._element_types[type_index]
227+
assert element_type is not None
211228
element = element_type(self._mem, offset)
212229
elements.append(element)
213230
offset += element_type._SIZE_IN_BYTES
@@ -280,6 +297,7 @@ def __iter__(self) -> Iterator[Any]:
280297
yield self._get_item(i)
281298

282299
def __getattr__(self, name: str) -> Any:
300+
assert self._element_type is not None
283301
offset = self._item_offset(0)
284302
return getattr(self._element_type(self._mem, offset), name)
285303

flatdata-py/flatdata/lib/structure.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ class Structure:
1717
_FIELDS: dict[str, FieldSignature]
1818
_FIELD_KEYS: list[str]
1919
_SCHEMA: str
20+
_SIZE_IN_BYTES: int
21+
_NAME: str
2022

2123
def __init_subclass__(cls, **kwargs: Any) -> None:
2224
super().__init_subclass__(**kwargs)

0 commit comments

Comments
 (0)