Skip to content

Commit 0a27c83

Browse files
CodyCBakerPhD, bendichter, pre-commit-ci[bot], and stephprince
authored
Integrate and test Zarr backend (#513)
* setup for zarr * fix imports * fix test class names; add more zarr * changelog * control when io is returned * debugs * debugs * debugs * debugs * debugs * down to only one * remove f strings automatically * remove some * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * override mystery * fix streaming path * Update tests/test_inspector.py Co-authored-by: Steph Prince <[email protected]> * PR suggestions * adjust docstring * add Zarr to caching * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rollback caching * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Ben Dichter <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Steph Prince <[email protected]>
1 parent ea18bf6 commit 0a27c83

19 files changed

+373
-145
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
### Deprecation (API)
77
* The `inspect_nwb` method has been removed. Please use `inspect_nwbfile` or `inspect_nwbfile_object` instead. [#505](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/505)
88

9+
### New Features
10+
* Added Zarr support. [#513](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/513)
11+
912
### Improvements
1013
* Removed the `robust_ros3_read` utility helper. [#506](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/506)
1114
* Simplified the `nwbinspector.testing` configuration framework. [#509](https://github.com/NeurodataWithoutBorders/nwbinspector/pull/509)

MANIFEST.in

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
graft tests
22
global-exclude *.py[cod]
3-
include src/nwbinspector/internal_configs/dandi.inspector_config.yaml
4-
include src/nwbinspector/config.schema.json
5-
include requirements.txt
3+
include src/nwbinspector/_internal_configs/dandi.inspector_config.yaml
4+
include src/nwbinspector/_internal_configs/config.schema.json

src/nwbinspector/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
# Still keeping the legacy magic version attribute requested by some users
2727
__version__ = importlib.metadata.version(distribution_name="nwbinspector")
2828

29+
# Note: this is not exposed at this outer level, but is used here to trigger the automatic submodule import
30+
# (otherwise someone would have to import nwbinspector.testing explicitly)
31+
from .testing import check_streaming_tests_enabled # noqa: F401
32+
2933
__all__ = [
3034
"available_checks",
3135
"default_check_registry",
@@ -51,4 +55,8 @@
5155
"FormatterOptions",
5256
"organize_messages",
5357
"__version__",
58+
# Public submodules
59+
"checks",
60+
"testing",
61+
"utils",
5462
]

src/nwbinspector/_configuration.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@
1313
from . import available_checks
1414
from ._registration import Importance
1515

16-
INTERNAL_CONFIGS = dict(dandi=Path(__file__).parent / "internal_configs" / "dandi.inspector_config.yaml")
16+
INTERNAL_CONFIGS = dict(
17+
dandi=Path(__file__).parent / "_internal_configs" / "dandi.inspector_config.yaml",
18+
)
1719

1820

1921
def validate_config(config: dict):
2022
"""Validate an instance of configuration against the official schema."""
21-
with open(file=Path(__file__).parent / "config.schema.json", mode="r") as fp:
23+
config_schema_file_path = Path(__file__).parent / "_internal_configs" / "config.schema.json"
24+
with open(file=config_schema_file_path, mode="r") as fp:
2225
schema = json.load(fp=fp)
2326
jsonschema.validate(instance=config, schema=schema)
2427

src/nwbinspector/_nwb_inspection.py

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
from natsort import natsorted
1313
from tqdm import tqdm
1414

15-
from . import available_checks, configure_checks
15+
from . import available_checks
16+
from ._configuration import configure_checks
1617
from ._registration import Importance, InspectorMessage
18+
from .tools._read_nwbfile import read_nwbfile
1719
from .utils import (
1820
FilePathType,
1921
OptionalListOfStrings,
@@ -127,7 +129,7 @@ def inspect_all(
127129
progress_bar_options = dict(position=0, leave=False)
128130

129131
if in_path.is_dir():
130-
nwbfiles = list(in_path.rglob("*.nwb"))
132+
nwbfiles = list(in_path.rglob("*.nwb*"))
131133

132134
# Remove any macOS sidecar files
133135
nwbfiles = [nwbfile for nwbfile in nwbfiles if not nwbfile.name.startswith("._")]
@@ -141,17 +143,16 @@ def inspect_all(
141143
# Manual identifier check over all files in the folder path
142144
identifiers = defaultdict(list)
143145
for nwbfile_path in nwbfiles:
144-
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
145-
try:
146-
nwbfile = io.read()
147-
identifiers[nwbfile.identifier].append(nwbfile_path)
148-
except Exception as exception:
149-
yield InspectorMessage(
150-
message=traceback.format_exc(),
151-
importance=Importance.ERROR,
152-
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
153-
file_path=nwbfile_path,
154-
)
146+
try:
147+
nwbfile = read_nwbfile(nwbfile_path=nwbfile_path)
148+
identifiers[nwbfile.identifier].append(nwbfile_path)
149+
except Exception as exception:
150+
yield InspectorMessage(
151+
message=traceback.format_exc(),
152+
importance=Importance.ERROR,
153+
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
154+
file_path=nwbfile_path,
155+
)
155156

156157
if len(identifiers) != len(nwbfiles):
157158
for identifier, nwbfiles_with_identifier in identifiers.items():
@@ -198,7 +199,7 @@ def inspect_all(
198199
yield message
199200
else:
200201
for nwbfile_path in nwbfiles_iterable:
201-
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks):
202+
for message in inspect_nwbfile(nwbfile_path=nwbfile_path, checks=checks, skip_validate=skip_validate):
202203
yield message
203204

204205

@@ -237,7 +238,7 @@ def inspect_nwbfile(
237238
config : dict
238239
Dictionary valid against our JSON configuration schema.
239240
Can specify a mapping of importance levels and list of check functions whose importance you wish to change.
240-
Typically loaded via json.load from a valid .json file
241+
Typically loaded via `json.load` from a valid .json file.
241242
ignore: list, optional
242243
Names of functions to skip.
243244
select: list, optional
@@ -267,10 +268,12 @@ def inspect_nwbfile(
267268
filterwarnings(action="ignore", message="No cached namespaces found in .*")
268269
filterwarnings(action="ignore", message="Ignoring cached namespace .*")
269270

270-
if not skip_validate:
271-
validation_error_list, _ = pynwb.validate(paths=[nwbfile_path])
272-
for validation_namespace_errors in validation_error_list:
273-
for validation_error in validation_namespace_errors:
271+
try:
272+
in_memory_nwbfile, io = read_nwbfile(nwbfile_path=nwbfile_path, return_io=True)
273+
274+
if not skip_validate:
275+
validation_errors = pynwb.validate(io=io)
276+
for validation_error in validation_errors:
274277
yield InspectorMessage(
275278
message=validation_error.reason,
276279
importance=Importance.PYNWB_VALIDATION,
@@ -279,27 +282,23 @@ def inspect_nwbfile(
279282
file_path=nwbfile_path,
280283
)
281284

282-
with pynwb.NWBHDF5IO(path=nwbfile_path, mode="r", load_namespaces=True) as io:
283-
try:
284-
in_memory_nwbfile = io.read()
285-
286-
for inspector_message in inspect_nwbfile_object(
287-
nwbfile_object=in_memory_nwbfile,
288-
checks=checks,
289-
config=config,
290-
ignore=ignore,
291-
select=select,
292-
importance_threshold=importance_threshold,
293-
):
294-
inspector_message.file_path = nwbfile_path
295-
yield inspector_message
296-
except Exception as exception:
297-
yield InspectorMessage(
298-
message=traceback.format_exc(),
299-
importance=Importance.ERROR,
300-
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
301-
file_path=nwbfile_path,
302-
)
285+
for inspector_message in inspect_nwbfile_object(
286+
nwbfile_object=in_memory_nwbfile,
287+
checks=checks,
288+
config=config,
289+
ignore=ignore,
290+
select=select,
291+
importance_threshold=importance_threshold,
292+
):
293+
inspector_message.file_path = nwbfile_path
294+
yield inspector_message
295+
except Exception as exception:
296+
yield InspectorMessage(
297+
message=traceback.format_exc(),
298+
importance=Importance.ERROR,
299+
check_function_name=f"During io.read() - {type(exception)}: {str(exception)}",
300+
file_path=nwbfile_path,
301+
)
303302

304303

305304
# TODO: deprecate once subject types and dandi schemas have been extended

src/nwbinspector/_registration.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from typing import Optional
55

66
import h5py
7+
import zarr
78
from pynwb import NWBFile
89
from pynwb.ecephys import Device, ElectrodeGroup
910
from pynwb.file import Subject
@@ -93,7 +94,7 @@ def _auto_parse(check_function, obj, result: Optional[InspectorMessage] = None):
9394

9495

9596
def _parse_location(neurodata_object) -> Optional[str]:
96-
"""Grab the object location from a h5py.Dataset or a container content that is an h5py.Dataset object."""
97+
"""Grab the object location from a dataset or a container content that is an dataset object."""
9798
known_locations = {
9899
NWBFile: "/",
99100
Subject: "/general/subject",
@@ -105,13 +106,16 @@ def _parse_location(neurodata_object) -> Optional[str]:
105106
for key, val in known_locations.items():
106107
if isinstance(neurodata_object, key):
107108
return val
108-
"""Infer the human-readable path of the object within an NWBFile by tracing its parents."""
109+
110+
# Infer the human-readable path of the object within an NWBFile by tracing its parents
109111
if neurodata_object.parent is None:
110112
return "/"
111113
# Best solution: object is or has a HDF5 Dataset
112-
if isinstance(neurodata_object, h5py.Dataset):
114+
if isinstance(neurodata_object, (h5py.Dataset, zarr.Array)):
113115
return neurodata_object.name
114116
else:
115-
for field in neurodata_object.fields.values():
117+
for field_name, field in neurodata_object.fields.items():
116118
if isinstance(field, h5py.Dataset):
117119
return field.parent.name
120+
elif isinstance(field, zarr.Array):
121+
return field.name.removesuffix(f"/{field_name}")

src/nwbinspector/checks/_nwb_containers.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44

55
import h5py
6+
import zarr
67
from pynwb import NWBContainer
78

89
from .._registration import Importance, InspectorMessage, Severity, register_check
@@ -19,9 +20,16 @@ def check_large_dataset_compression(nwb_container: NWBContainer, gb_lower_bound:
1920
Best Practice: :ref:`best_practice_compression`
2021
"""
2122
for field in getattr(nwb_container, "fields", dict()).values():
22-
if not isinstance(field, h5py.Dataset):
23+
if not isinstance(field, (h5py.Dataset, zarr.Array)):
2324
continue
24-
if field.compression is None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:
25+
26+
compression_indicator = None
27+
if isinstance(field, h5py.Dataset):
28+
compression_indicator = field.compression
29+
elif isinstance(field, zarr.Array):
30+
compression_indicator = field.compressor
31+
32+
if compression_indicator is not None and field.size * field.dtype.itemsize > gb_lower_bound * 1e9:
2533
return InspectorMessage(
2634
severity=Severity.HIGH,
2735
message=f"{os.path.split(field.name)[1]} is a large uncompressed dataset! Please enable compression.",
@@ -44,10 +52,17 @@ def check_small_dataset_compression(
4452
Best Practice: :ref:`best_practice_compression`
4553
"""
4654
for field in getattr(nwb_container, "fields", dict()).values():
47-
if not isinstance(field, h5py.Dataset):
55+
if not isinstance(field, (h5py.Dataset, zarr.Array)):
4856
continue
57+
58+
compression_indicator = None
59+
if isinstance(field, h5py.Dataset):
60+
compression_indicator = field.compression
61+
elif isinstance(field, zarr.Array):
62+
compression_indicator = field.compressor
63+
4964
if (
50-
field.compression is None
65+
compression_indicator is None
5166
and mb_lower_bound * 1e6 < field.size * field.dtype.itemsize < gb_upper_bound * 1e9
5267
):
5368
if field.size * field.dtype.itemsize > gb_severity_threshold * 1e9:

src/nwbinspector/tools/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from ._dandi import get_s3_urls_and_dandi_paths
22
from ._nwb import all_of_type, get_nwbfile_path_from_internal_object
3-
from ._read_nwbfile import read_nwbfile
3+
from ._read_nwbfile import BACKEND_IO_CLASSES, read_nwbfile
44

55
__all__ = [
6+
"BACKEND_IO_CLASSES",
67
"get_s3_urls_and_dandi_paths",
78
"all_of_type",
89
"get_nwbfile_path_from_internal_object",

0 commit comments

Comments (0)