Skip to content

Commit 80cb46e

Browse files
committed
add tests for validation of vlen string datasets from zarr arrays
1 parent f167dbb commit 80cb46e

File tree

1 file changed

+52
-1
lines changed

1 file changed

+52
-1
lines changed

tests/unit/validator_tests/test_validate.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from abc import ABCMeta, abstractmethod
22
from datetime import datetime, date
3-
from unittest import mock, skip
3+
from unittest import mock, skip, skipIf
44

55
import numpy as np
66
from dateutil.tz import tzlocal
@@ -13,6 +13,7 @@
1313
from hdmf.validate.errors import (DtypeError, MissingError, ExpectedArrayError, MissingDataType,
1414
IncorrectQuantityError, IllegalLinkError, ShapeError)
1515
from hdmf.backends.hdf5 import HDF5IO
16+
from hdmf.utils import ZARR_INSTALLED
1617

1718
CORE_NAMESPACE = 'test_core'
1819

@@ -1605,3 +1606,53 @@ def test_scalar_instead_of_array(self):
16051606
# Should be ExpectedArrayError, not ShapeError
16061607
self.assertIsInstance(result[0], ExpectedArrayError)
16071608
self.assertNotIsInstance(result[0], ShapeError)
1609+
1610+
1611+
class TestObjectDtypeArrays(TestCase):
    """Validator tests for object-dtype arrays (e.g., zarr variable-length strings)."""

    def set_up_spec(self):
        """Build the 'Bar' group spec, its namespace, and the ValidatorMap under test.

        The spec declares one 1-D 'text' dataset named 'data' and one 'text'
        attribute named 'attr1'. Results are stored on ``self.namespace`` and
        ``self.vmap``.
        """
        data_spec = DatasetSpec('an example dataset', 'text', name='data', shape=(None,))
        attr_spec = AttributeSpec('attr1', 'an example string attribute', 'text')
        bar_spec = GroupSpec(
            'A test group specification with a data type',
            data_type_def='Bar',
            datasets=[data_spec],
            attributes=[attr_spec],
        )

        catalog = SpecCatalog()
        catalog.register_spec(bar_spec, 'test.yaml')

        self.namespace = SpecNamespace(
            'a test namespace',
            CORE_NAMESPACE,
            [{'source': 'test.yaml'}],
            version='0.1.0',
            catalog=catalog,
        )
        self.vmap = ValidatorMap(self.namespace)

    def _validate_bar_with_data(self, data):
        """Wrap ``data`` as the 'data' dataset of a 'Bar' builder and return the validation results."""
        builder = GroupBuilder(
            'my_bar',
            attributes={'data_type': 'Bar', 'attr1': 'a string attribute'},
            datasets=[DatasetBuilder('data', data)],
        )
        return self.vmap.validate(builder)

    @skipIf(not ZARR_INSTALLED, "Zarr is not installed")
    def test_non_empty_object_dtype_array(self):
        """A non-empty object-dtype zarr.Array of strings validates against a 'text' spec."""
        # Imported lazily so the module still loads when zarr is not installed.
        import zarr
        import numcodecs

        self.set_up_spec()

        # Zarr represents variable-length strings with object dtype, whereas HDF5
        # relies on vlen metadata, so the validator has to infer the dtype here.
        strings = zarr.array(
            ['string1', 'string2', 'string3'],
            dtype=object,
            object_codec=numcodecs.VLenUTF8(),
        )
        errors = self._validate_bar_with_data(strings)

        # No errors expected: the string-filled object array should be treated as 'utf'.
        self.assertEqual(len(errors), 0)

    @skipIf(not ZARR_INSTALLED, "Zarr is not installed")
    def test_empty_object_dtype_array(self):
        """An empty object-dtype zarr.Array validates against a 'text' spec."""
        # Imported lazily so the module still loads when zarr is not installed.
        import zarr
        import numcodecs

        self.set_up_spec()

        # With no elements to inspect, only the object dtype itself is available.
        no_strings = zarr.array([], dtype=object, object_codec=numcodecs.VLenUTF8())
        errors = self._validate_bar_with_data(no_strings)

        # No errors expected: an empty object array is expected to default to 'utf'.
        self.assertEqual(len(errors), 0)

0 commit comments

Comments
 (0)