diff --git a/pyproject.toml b/pyproject.toml index bcba820..5ddb024 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,17 +19,18 @@ authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] requires-python = ">=3.8" dependencies = [ - "h5py >=3.10", + "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", "numpy >=1.20,<2.0.0; python_version=='3.8'", ] + dynamic = ["version"] [project.urls] -Homepage = "https://hdf5-json.readthedocs.io" -Documentation = "https://hdf5-json.readthedocs.io" +Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" +Documentation = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" Source = "https://github.com/HDFGroup/hdf5-json" "Bug Reports" = "https://github.com/HDFGroup/hdf5-json/issues" Social = "https://twitter.com/hdf5" diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index 704d241..d4a7f78 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -21,6 +21,14 @@ from .hdf5dtype import getTypeResponse from .hdf5dtype import getItemSize from .hdf5dtype import createDataType +from .objid import createObjId +from .objid import getCollectionForId +from .objid import isObjId +from .objid import isS3ObjKey +from .objid import getS3Key +from .objid import getObjId +from .objid import isSchema2Id +from .objid import isRootObjId from .hdf5db import Hdf5db from . import _version diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 27f2094..db48eda 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -13,12 +13,12 @@ import time import h5py import numpy as np -import uuid import os.path as op import os import json import logging -from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .hdf5dtype import getTypeItem, createDataType, getItemSize, Reference, RegionReference +from .objid import createObjId from .apiversion import _apiver @@ -73,6 +73,43 @@ _H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") +def convert_dtype(srcdt): + """Return a dtype based on input dtype, converting any Reference types from + h5json style to h5py. 
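+
+    A minimal sketch of the intended conversion (the field name "obj" is
+    hypothetical; special_dtype and Reference here are the h5json variants
+    defined in hdf5dtype):
+
+        src = np.dtype([("obj", special_dtype(ref=Reference))])
+        tgt = convert_dtype(src)
+        # tgt["obj"] is equivalent to h5py.special_dtype(ref=h5py.Reference)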
+ """ + + if len(srcdt) > 0: + fields = [] + for name in srcdt.fields: + item = srcdt.fields[name] + # item is a tuple of dtype and integer offset + field_dt = convert_dtype(item[0]) + fields.append((name, field_dt)) + tgt_dt = np.dtype(fields) + else: + # check if this a "special dtype" + if srcdt.metadata and "ref" in srcdt.metadata: + if srcdt.metadata['ref'] is Reference: + tgt_dt = h5py.special_dtype(ref=h5py.Reference) + elif srcdt.metadata['ref'] is RegionReference: + tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) + else: + raise TypeError(f"Unexpected ref type: {srcdt}") + elif srcdt.metadata and "vlen" in srcdt.metadata: + src_vlen = srcdt.metadata["vlen"] + if isinstance(src_vlen, np.dtype): + tgt_base = convert_dtype(src_vlen) + else: + tgt_base = src_vlen + tgt_dt = h5py.special_dtype(vlen=tgt_base) + elif srcdt.kind == "U": + # use vlen for unicode strings + tgt_dt = h5py.special_dtype(vlen=str) + else: + tgt_dt = srcdt # no conversion needed + return tgt_dt + + def visitObj(path, obj): hdf5db = _db[obj.file.filename] hdf5db.visit(path, obj) @@ -561,7 +598,7 @@ def initFile(self): self.log.info("initializing file") if not self.root_uuid: - self.root_uuid = str(uuid.uuid1()) + self.root_uuid = createObjId() self.dbGrp.attrs["rootUUID"] = self.root_uuid self.dbGrp.create_group("{groups}") self.dbGrp.create_group("{datasets}") @@ -593,21 +630,21 @@ def visit(self, path, obj): msg = "Unknown object type: " + __name__ + " found during scan of HDF5 file" self.log.error(msg) raise IOError(errno.EIO, msg) - uuid1 = uuid.uuid1() # create uuid - id = str(uuid1) + obj_id = createObjId() # create uuid + addrGrp = self.dbGrp["{addr}"] if not self.readonly: # storing db in the file itself, so we can link to the object directly - col[id] = obj.ref # save attribute ref to object + col[obj_id] = obj.ref # save attribute ref to object else: # store path to object - col[id] = obj.name + col[obj_id] = obj.name addr = h5py.h5o.get_info(obj.id).addr # store reverse map as an attribute - addrGrp.attrs[str(addr)] = id + addrGrp.attrs[str(addr)] = obj_id # - # Get Datset creation properties + # Get Dataset creation properties # def getDatasetCreationProps(self, dset_uuid): prop_list = {} @@ -760,7 +797,7 @@ def getObjByPath(self, path): def getObjectByUuid(self, col_type, obj_uuid): # col_type should be either "datasets", "groups", or "datatypes" if col_type not in ("datasets", "groups", "datatypes"): - msg = "Unexpectd error, invalid col_type: [" + col_type + "]" + msg = "Unexpected error, invalid col_type: [" + col_type + "]" self.log.error(msg) raise IOError(errno.EIO, msg) if col_type == "groups" and obj_uuid == self.dbGrp.attrs["rootUUID"]: @@ -1087,7 +1124,7 @@ def createCommittedType(self, datatype, obj_uuid=None): raise IOError(errno.EPERM, msg) datatypes = self.dbGrp["{datatypes}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = self.createTypeFromItem(datatype) datatypes[obj_uuid] = dt @@ -1476,6 +1513,7 @@ def makeAttribute(self, obj, attr_name, shape, attr_type, value): self.makeNullTermStringAttribute(obj, attr_name, strLength, value) else: typeItem = getTypeItem(dt) + dt = convert_dtype(dt) value = self.toRef(rank, typeItem, value) # create numpy array @@ -1725,6 +1763,7 @@ def toNumPyValue(self, typeItem, src, des): baseType = typeItem["base"] dt = self.createTypeFromItem(baseType) + dt = convert_dtype(dt) des = np.array(src, dtype=dt) elif typeClass == "H5T_REFERENCE": @@ -1901,7 +1940,7 @@ def listToRef(self, data): # object reference should be 
in the form: / for prefix in ("datasets", "groups", "datatypes"): if data.startswith(prefix): - uuid_ref = data[len(prefix) :] + uuid_ref = data[len(prefix):] if len(uuid_ref) == (UUID_LEN + 1) and uuid_ref.startswith("/"): obj = self.getObjectByUuid(prefix, uuid_ref[1:]) if obj: @@ -2193,7 +2232,8 @@ def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): raise IOError(errno.EIO, msg) if isinstance(slices, (list, tuple)) and len(slices) != rank: - msg = "Unexpected error: getDatasetValuesByUuid: number of dims in selection not same as rank" + msg = "Unexpected error: getDatasetValuesByUuid: " + msg += "number of dims in selection not same as rank" self.log.error(msg) raise IOError(errno.EIO, msg) @@ -2715,7 +2755,7 @@ def createDataset( raise IOError(errno.EPERM, msg) datasets = self.dbGrp["{datasets}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() dt = None item = {} fillvalue = None @@ -3490,7 +3530,7 @@ def createGroup(self, obj_uuid=None): raise IOError(errno.EPERM, msg) groups = self.dbGrp["{groups}"] if not obj_uuid: - obj_uuid = str(uuid.uuid1()) + obj_uuid = createObjId() newGroup = groups.create_group(obj_uuid) # store reverse map as an attribute addr = h5py.h5o.get_info(newGroup.id).addr diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py old mode 100755 new mode 100644 index 9f867f2..9c565ce --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -2,37 +2,199 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -""" -This class is used to map between HDF5 type representations and numpy types - -""" +import weakref import numpy as np -from h5py.h5t import special_dtype -from h5py.h5t import check_dtype -from h5py.h5r import Reference -from h5py.h5r import RegionReference + + +class Reference: + """ + Represents an HDF5 object reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + # TBD: this is not consistent with hsds or h5py... 
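+        # Rendered as "<collection_type>/<id>"; e.g. a dataset reference
+        # might repr as "datasets/d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e"
+        # (the id shown here is illustrative).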
+ if not isinstance(self._id.id, str): + raise TypeError("Expected string id") + item = None + + collection_type = self._id.collection_type + item = f"{collection_type}/{self._id.id}" + return item + + def tolist(self): + if type(self._id.id) is not str: + raise TypeError("Expected string id") + if self._id.objtype_code == "d": + return [ + ("datasets/" + self._id.id), + ] + elif self._id.objtype_code == "g": + return [ + ("groups/" + self._id.id), + ] + elif self._id.objtype_code == "t": + return [ + ("datatypes/" + self._id.id), + ] + else: + raise TypeError("Unexpected id type") + + +class RegionReference: + """ + Represents an HDF5 region reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + return "" + + +def special_dtype(**kwds): + """Create a new h5py "special" type. Only one keyword may be given. + + Legal keywords are: + + vlen = basetype + Base type for HDF5 variable-length datatype. This can be Python + str type or instance of np.dtype. + Example: special_dtype( vlen=str ) + + enum = (basetype, values_dict) + Create a NumPy representation of an HDF5 enumerated type. Provide + a 2-tuple containing an (integer) base dtype and a dict mapping + string names to integer values. + + ref = Reference | RegionReference + Create a NumPy representation of an HDF5 object or region reference + type.""" + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, val = kwds.popitem() + + if name == "vlen": + + return np.dtype("O", metadata={"vlen": val}) + + if name == "enum": + + try: + dt, enum_vals = val + except TypeError: + msg = "Enums must be created from a 2-tuple " + msg += "(basetype, values_dict)" + raise TypeError(msg) + + dt = np.dtype(dt) + if dt.kind not in "iu": + raise TypeError("Only integer types can be used as enums") + + return np.dtype(dt, metadata={"enum": enum_vals}) + + if name == "ref": + dt = None + if val is Reference: + dt = np.dtype("S48", metadata={"ref": Reference}) + elif val is RegionReference: + dt = np.dtype("S48", metadata={"ref": RegionReference}) + else: + raise ValueError("Ref class must be Reference or RegionReference") + + return dt + + raise TypeError(f'Unknown special type "{name}"') + + +def check_dtype(**kwds): + """Check a dtype for h5py special type "hint" information. Only one + keyword may be given. + + vlen = dtype + If the dtype represents an HDF5 vlen, returns the Python base class. + Currently only builting string vlens (str) are supported. Returns + None if the dtype does not represent an HDF5 vlen. + + enum = dtype + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. Returns None if the dtype does + not represent an HDF5 enumerated type. + + ref = dtype + If the dtype represents an HDF5 reference type, returns the reference + class (either Reference or RegionReference). Returns None if the dtype + does not represent an HDF5 reference type. 
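+
+    A short sketch of the expected behavior (illustrative only):
+
+        dt = special_dtype(vlen=str)
+        check_dtype(vlen=dt)   # returns str
+        check_dtype(enum=dt)   # returns None (no enum hint present)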
+ """ + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, dt = kwds.popitem() + + if name not in ("vlen", "enum", "ref"): + raise TypeError('Unknown special type "%s"' % name) + + try: + return dt.metadata[name] + except TypeError: + return None + except KeyError: + return None def getTypeResponse(typeItem): """ Convert the given type item to a predefined type string for - predefined integer and floating point types ("H5T_STD_I64LE", et. al). - For compound types, recursively iterate through the typeItem and do same - conversion for fields of the compound type. - """ + predefined integer and floating point types ("H5T_STD_I64LE", et. al). + For compound types, recursively iterate through the typeItem and do + same conversion for fields of the compound type.""" response = None if "uuid" in typeItem: # committed type, just return uuid response = "datatypes/" + typeItem["uuid"] - elif typeItem["class"] == "H5T_INTEGER" or typeItem["class"] == "H5T_FLOAT": + elif typeItem["class"] in ("H5T_INTEGER", "H5T_FLOAT"): # just return the class and base for pre-defined types response = {} response["class"] = typeItem["class"] @@ -52,7 +214,7 @@ def getTypeResponse(typeItem): for field in typeItem["fields"]: fieldItem = {} fieldItem["name"] = field["name"] - fieldItem["type"] = getTypeResponse(field["type"]) # recursive call + fieldItem["type"] = getTypeResponse(field["type"]) # recurse call fieldList.append(fieldItem) response["fields"] = fieldList else: @@ -60,7 +222,7 @@ def getTypeResponse(typeItem): for k in typeItem.keys(): if k == "base": if isinstance(typeItem[k], dict): - response[k] = getTypeResponse(typeItem[k]) # recursive call + response[k] = getTypeResponse(typeItem[k]) # recurse call else: response[k] = typeItem[k] # predefined type elif k not in ("size", "base_size"): @@ -68,112 +230,12 @@ def getTypeResponse(typeItem): return response -def getItemSize(typeItem): - """ - Get size of an item in bytes. - For variable length types (e.g. 
variable length strings), - return the string "H5T_VARIABLE" +def getTypeItem(dt, metadata=None): """ - # handle the case where we are passed a primitive type first - if isinstance(typeItem, bytes): - typeItem = typeItem.decode("ascii") - if isinstance(typeItem, str): - for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): - if typeItem.startswith(type_prefix): - num_bits = typeItem[len(type_prefix) :] - if num_bits[-2:] in ("LE", "BE"): - num_bits = num_bits[:-2] - try: - return int(num_bits) // 8 - except ValueError: - raise TypeError("Invalid Type") - # none of the expect primative types mathched - raise TypeError("Invalid Type") - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - item_size = 0 - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - item_size = typeItem["length"] - - elif typeClass == "H5T_VLEN": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_OPAQUE": - if "size" not in typeItem: - raise KeyError("'size' not provided") - item_size = int(typeItem["size"]) - - elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("'base' must be provided for enum types") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_REFERENCE": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if type(fields) is not list: - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - # add up the size of each sub-field - for field in fields: - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "type" not in field: - raise KeyError("'type' missing from field") - subtype_size = getItemSize(field["type"]) # recursive call - if subtype_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" - break # don't need to look at the rest - - item_size += subtype_size - else: - raise TypeError("Invalid type class") - - # calculate array type - if "dims" in typeItem and type(item_size) is int: - dims = typeItem["dims"] - for dim in dims: - item_size *= dim - - return item_size - - -""" Return type info. 
For primitive types, return string with typename For compound types return array of dictionary items -""" - - -def getTypeItem(dt): - + """ predefined_int_types = { "int8": "H5T_STD_I8", "uint8": "H5T_STD_U8", @@ -184,10 +246,16 @@ def getTypeItem(dt): "int64": "H5T_STD_I64", "uint64": "H5T_STD_U64", } - predefined_float_types = {"float32": "H5T_IEEE_F32", "float64": "H5T_IEEE_F64"} + predefined_float_types = { + "float16": "H5T_IEEE_F16", + "float32": "H5T_IEEE_F32", + "float64": "H5T_IEEE_F64", + } + if not metadata and dt.metadata: + metadata = dt.metadata type_info = {} - if len(dt) > 1 or dt.names: + if len(dt) > 1: # compound type names = dt.names type_info["class"] = "H5T_COMPOUND" @@ -204,15 +272,22 @@ def getTypeItem(dt): # array type type_info["dims"] = dt.shape type_info["class"] = "H5T_ARRAY" - type_info["base"] = getTypeItem(dt.base) + type_info["base"] = getTypeItem(dt.base, metadata=metadata) elif dt.kind == "O": # vlen string or data # # check for h5py variable length extension - vlen_check = check_dtype(vlen=dt.base) - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) - ref_check = check_dtype(ref=dt.base) + vlen_check = None + if metadata and "vlen" in metadata: + vlen_check = metadata["vlen"] + if vlen_check is not None and not isinstance(vlen_check, np.dtype): + vlen_check = np.dtype(vlen_check) + + if metadata and "ref" in metadata: + ref_check = metadata["ref"] + else: + ref_check = check_dtype(ref=dt.base) + if vlen_check == bytes: type_info["class"] = "H5T_STRING" type_info["length"] = "H5T_VARIABLE" @@ -229,15 +304,15 @@ def getTypeItem(dt): type_info["size"] = "H5T_VARIABLE" type_info["base"] = getTypeItem(vlen_check) elif vlen_check is not None: - # unknown vlen type + # unknown vlen type raise TypeError("Unknown h5py vlen type: " + str(vlen_check)) elif ref_check is not None: # a reference type type_info["class"] = "H5T_REFERENCE" - if ref_check is Reference: + if ref_check.__name__ == "Reference": type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: + elif ref_check.__name__ == "RegionReference": type_info["base"] = "H5T_STD_REF_DSETREG" # region ref else: raise TypeError("unexpected reference type") @@ -249,14 +324,40 @@ def getTypeItem(dt): type_info["size"] = dt.itemsize type_info["tag"] = "" # todo - determine tag elif dt.base.kind == "S": - # Fixed length string type - type_info["class"] = "H5T_STRING" - type_info["charSet"] = "H5T_CSET_ASCII" + # check for object reference + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + # a reference type + type_info["class"] = "H5T_REFERENCE" + + if ref_check is Reference: + type_info["base"] = "H5T_STD_REF_OBJ" # objref + elif ref_check is RegionReference: + type_info["base"] = "H5T_STD_REF_DSETREG" # region ref + else: + raise TypeError("unexpected reference type") + else: + # Fixed length string type + type_info["class"] = "H5T_STRING" type_info["length"] = dt.itemsize + type_info["charSet"] = "H5T_CSET_ASCII" type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.base.kind == "U": # Fixed length unicode type - raise TypeError("Fixed length unicode type is not supported") + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + raise TypeError("unexpected reference type") + + # Fixed length string type with unicode support + type_info["class"] = "H5T_STRING" + + # this can be problematic if the encoding of the string is not valid, + # or reqires too many bytes. 
Use variable length strings to handle all + # UTF8 strings correctly + type_info["charSet"] = "H5T_CSET_UTF8" + # convert from UTF32 length to a fixed length + type_info["length"] = dt.itemsize + type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.kind == "b": # boolean type - h5py stores as enum @@ -265,13 +366,12 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" # this mapping is an h5py convention for boolean support - members = [{"name": "FALSE", "value": 0}, {"name": "TRUE", "value": 1}] + mapping = {"FALSE": 0, "TRUE": 1} type_info["class"] = "H5T_ENUM" - type_info["members"] = members + type_info["mapping"] = mapping base_info = {"class": "H5T_INTEGER"} base_info["base"] = "H5T_STD_I8" + byteorder type_info["base"] = base_info - elif dt.kind == "f": # floating point type type_info["class"] = "H5T_FLOAT" @@ -280,7 +380,8 @@ def getTypeItem(dt): byteorder = "BE" if dt.name in predefined_float_types: # maps to one of the HDF5 predefined types - type_info["base"] = predefined_float_types[dt.base.name] + byteorder + float_type = predefined_float_types[dt.base.name] + type_info["base"] = float_type + byteorder else: raise TypeError("Unexpected floating point type: " + dt.name) elif dt.kind == "i" or dt.kind == "u": @@ -291,14 +392,13 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" - # numpy integer type - but check to see if this is the h5py + # numpy integer type - but check to see if this is the hypy # enum extension - mapping = check_dtype(enum=dt) - - if mapping: + if metadata and "enum" in metadata: # yes, this is an enum! + mapping = metadata["enum"] type_info["class"] = "H5T_ENUM" - type_info["members"] = [{"name": n, "value": v} for n, v in mapping.items()] + type_info["mapping"] = mapping if dt.name not in predefined_int_types: raise TypeError("Unexpected integer type: " + dt.name) # maps to one of the HDF5 predefined types @@ -316,11 +416,146 @@ def getTypeItem(dt): else: # unexpected kind - raise TypeError("unexpected dtype kind: " + dt.kind) + raise TypeError(f"unexpected dtype kind: {dt.kind}") return type_info +def getItemSize(typeItem): + """ + Get size of an item in bytes. + For variable length types (e.g. 
variable length strings), + return the string "H5T_VARIABLE" + """ + # handle the case where we are passed a primitive type first + if isinstance(typeItem, str) or isinstance(typeItem, bytes): + for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): + if typeItem.startswith(type_prefix): + nlen = len(type_prefix) + num_bits = typeItem[nlen:] + if num_bits[-2:] in ("LE", "BE"): + num_bits = num_bits[:-2] + try: + return int(num_bits) // 8 + except ValueError: + raise TypeError("Invalid Type") + # none of the expect primative types mathched + raise TypeError("Invalid Type") + if not isinstance(typeItem, dict): + raise TypeError("invalid type") + + item_size = 0 + if "class" not in typeItem: + raise KeyError("'class' not provided") + typeClass = typeItem["class"] + + if typeClass == "H5T_INTEGER": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_FLOAT": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_STRING": + if "length" not in typeItem: + raise KeyError("'length' not provided") + item_size = typeItem["length"] + + elif typeClass == "H5T_VLEN": + item_size = "H5T_VARIABLE" + elif typeClass == "H5T_OPAQUE": + if "size" not in typeItem: + raise KeyError("'size' not provided") + item_size = int(typeItem["size"]) + + elif typeClass == "H5T_ARRAY": + if "dims" not in typeItem: + raise KeyError("'dims' must be provided for array types") + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_ENUM": + if "base" not in typeItem: + raise KeyError("'base' must be provided for enum types") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_REFERENCE": + if "length" in typeItem: + item_size = typeItem["length"] + elif "base" in typeItem and typeItem["base"] == "H5T_STD_REF_OBJ": + # obj ref values are in the form: "groups/" or + # "datasets/" or "datatypes/" + item_size = 48 + else: + item_size = 80 # tb: just take a guess at this for now + elif typeClass == "H5T_COMPOUND": + if "fields" not in typeItem: + raise KeyError("'fields' not provided for compound type") + fields = typeItem["fields"] + if not isinstance(fields, list): + raise TypeError("Type Error: expected list type for 'fields'") + if not fields: + raise KeyError("no 'field' elements provided") + # add up the size of each sub-field + for field in fields: + if not isinstance(field, dict): + raise TypeError("Expected dictionary type for field") + if "type" not in field: + raise KeyError("'type' missing from field") + subtype_size = getItemSize(field["type"]) # recursive call + if subtype_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + break # don't need to look at the rest + + item_size += subtype_size + else: + raise TypeError("Invalid type class") + + # calculate array type + if "dims" in typeItem and isinstance(item_size, int): + dims = typeItem["dims"] + for dim in dims: + item_size *= dim + + return item_size + + +def getDtypeItemSize(dtype): + """ Return size of dtype in bytes + For variable length types (e.g. 
variable length strings), + return the string "H5T_VARIABLE + """ + item_size = 0 + if len(dtype) > 0: + # compound dtype + for i in range(len(dtype)): + sub_dt = dtype[i] + sub_dt_size = getDtypeItemSize(sub_dt) + if sub_dt_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" # return variable if any component is variable + break + item_size += sub_dt_size + else: + # primitive type + if dtype.shape: + base_size = getDtypeItemSize(dtype.base) + if base_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + else: + nelements = np.prod(dtype.shape) + item_size = base_size * nelements + else: + if dtype.metadata and "vlen" in dtype.metadata: + item_size = "H5T_VARIABLE" + else: + item_size = dtype.itemsize + return item_size + + def getNumpyTypename(hdf5TypeName, typeClass=None): predefined_int_types = { "H5T_STD_I8": "i1", @@ -332,7 +567,11 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): "H5T_STD_I64": "i8", "H5T_STD_U64": "u8", } - predefined_float_types = {"H5T_IEEE_F32": "f4", "H5T_IEEE_F64": "f8"} + predefined_float_types = { + "H5T_IEEE_F16": "f2", + "H5T_IEEE_F32": "f4", + "H5T_IEEE_F64": "f8", + } if len(hdf5TypeName) < 3: raise Exception("Type Error: invalid typename: ") @@ -356,7 +595,6 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): def createBaseDataType(typeItem): - dtRet = None if isinstance(typeItem, str): # should be one of the predefined types @@ -371,20 +609,32 @@ def createBaseDataType(typeItem): raise KeyError("'class' not provided") typeClass = typeItem["class"] + dims = "" + if "dims" in typeItem: + if typeClass != "H5T_ARRAY": + raise TypeError("'dims' only supported for integer types") + + dims = None + if isinstance(typeItem["dims"], int): + dims = typeItem["dims"] # make into a tuple + elif not isinstance(typeItem["dims"], list) and not isinstance( + typeItem["dims"], tuple + ): + raise TypeError("expected list or integer for dims") + else: + dims = typeItem["dims"] + dims = str(tuple(dims)) + if typeClass == "H5T_INTEGER": if "base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for integer types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_INTEGER") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_FLOAT": if "base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for floating point types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_FLOAT") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_STRING": if "length" not in typeItem: raise KeyError("'length' not provided") @@ -392,8 +642,9 @@ def createBaseDataType(typeItem): raise KeyError("'charSet' not provided") if typeItem["length"] == "H5T_VARIABLE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for variable types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if typeItem["charSet"] == "H5T_CSET_ASCII": dtRet = special_dtype(vlen=bytes) elif typeItem["charSet"] == "H5T_CSET_UTF8": @@ -408,20 +659,25 @@ def createBaseDataType(typeItem): if typeItem["charSet"] == "H5T_CSET_ASCII": type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": - raise TypeError("fixed-width unicode strings are not supported") + # use the same type_code as ascii strings + # (othewise, numpy will reserve bytes for UTF32 representation) + type_code = "S" else: raise TypeError("unexpected 'charSet' value") - 
dtRet = np.dtype(type_code + str(nStrSize)) # fixed size string + # a fixed size string + dtRet = np.dtype(dims + type_code + str(nStrSize)) elif typeClass == "H5T_VLEN": - if "dims" in typeItem: - raise TypeError("'dims' not supported for vlen types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if "base" not in typeItem: raise KeyError("'base' not provided") baseType = createBaseDataType(typeItem["base"]) dtRet = special_dtype(vlen=np.dtype(baseType)) elif typeClass == "H5T_OPAQUE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for opaque types") + if dims: + msg = "Opaque Type is not supported for variable len types" + raise TypeError(msg) if "size" not in typeItem: raise KeyError("'size' not provided") nSize = int(typeItem["size"]) @@ -429,26 +685,19 @@ def createBaseDataType(typeItem): raise TypeError("'size' must be non-negative") dtRet = np.dtype("V" + str(nSize)) elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: + if not dims: raise KeyError("'dims' must be provided for array types") if "base" not in typeItem: raise KeyError("'base' not provided") arrayBaseType = typeItem["base"] - if type(arrayBaseType) is dict: + if isinstance(arrayBaseType, dict): if "class" not in arrayBaseType: raise KeyError("'class' not provided for array base type") - if arrayBaseType["class"] not in ( - "H5T_INTEGER", - "H5T_FLOAT", - "H5T_STRING", - "H5T_COMPOUND", - ): - raise TypeError( - f"{arrayBaseType['class']}: H5T_ARRAY base type not supported." - ) - - dt_base = createDataType(arrayBaseType) - + type_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_COMPOUND", "H5T_ARRAY") + if arrayBaseType["class"] not in type_classes: + msg = "Array Type base type must be integer, float, string, compound or array" + raise TypeError(msg) + baseType = createDataType(arrayBaseType) if isinstance(typeItem["dims"], int): dims = typeItem["dims"] # make into a tuple elif type(typeItem["dims"]) not in (list, tuple): @@ -457,11 +706,17 @@ def createBaseDataType(typeItem): dims = typeItem["dims"] # create an array type of the base type - dtRet = np.dtype((dt_base, dims)) - + dtRet = np.dtype((baseType, dims)) + """ + metadata = None + if baseType.metadata: + metadata = dict(baseType.metadata) + dtRet = np.dtype(dims + baseType.str, metadata=metadata) + else: + dtRet = np.dtype(dims + baseType.str) + return dtRet # return predefined type + """ elif typeClass == "H5T_REFERENCE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for reference types") if "base" not in typeItem: raise KeyError("'base' not provided") if typeItem["base"] == "H5T_STD_REF_OBJ": @@ -470,6 +725,7 @@ def createBaseDataType(typeItem): dtRet = special_dtype(ref=RegionReference) else: raise TypeError("Invalid base type for reference type") + elif typeClass == "H5T_ENUM": if "base" not in typeItem: raise KeyError("Expected 'base' to be provided for enum type") @@ -477,21 +733,36 @@ def createBaseDataType(typeItem): if "class" not in base_json: raise KeyError("Expected class field in base type") if base_json["class"] != "H5T_INTEGER": - raise TypeError("Only integer base types can be used with enum type") - if "members" not in typeItem: - raise KeyError("'members' not provided for enum type") - members = typeItem["members"] - if len(members) == 0: - raise KeyError("empty enum members") + msg = "Only integer base types can be used with enum type" + raise TypeError(msg) + if "mapping" in typeItem: + mapping = typeItem["mapping"] + elif "members" in 
typeItem: + mapping = typeItem["members"] # backward-compatibility for hdf5-json + else: + raise KeyError("'mapping' not provided for enum type") + + if len(mapping) == 0: + raise KeyError("empty enum map") dt = createBaseDataType(base_json) - values_dict = dict((m["name"], m["value"]) for m in members) - if ( - dt.kind == "i" - and dt.name == "int8" - and len(members) == 2 - and "TRUE" in values_dict - and "FALSE" in values_dict + if isinstance(mapping, list): + # convert to a dictionary + values_dict = dict((m["name"], m["value"]) for m in mapping) + elif isinstance(mapping, dict): + # just use as is + values_dict = mapping + else: + raise TypeError("Expected dict or list mapping for enum type") + + if all( + ( + dt.kind == "i", + dt.name == "int8", + len(mapping) == 2, + "TRUE" in values_dict, + "FALSE" in values_dict, + ) ): # convert to numpy boolean type dtRet = np.dtype("bool") @@ -505,14 +776,12 @@ def createBaseDataType(typeItem): return dtRet -""" -Create a numpy datatype given a json type -""" - - def createDataType(typeItem): + """ + Create a numpy datatype given a json type + """ dtRet = None - if isinstance(typeItem, (str, bytes)): + if type(typeItem) in (str, bytes): # should be one of the predefined types dtName = getNumpyTypename(typeItem) dtRet = np.dtype(dtName) @@ -543,20 +812,90 @@ def createDataType(typeItem): if "type" not in field: raise KeyError("'type' missing from field") field_name = field["name"] - if isinstance(field_name, str): - # verify the field name is ascii - try: - field_name.encode("ascii") - except UnicodeDecodeError: - raise TypeError("non-ascii field name not allowed") + if not isinstance(field_name, str): + raise TypeError("field names must be strings") + # verify the field name is ascii + try: + field_name.encode("ascii") + except UnicodeEncodeError: + raise TypeError("non-ascii field name not allowed") dt = createDataType(field["type"]) # recursive call if dt is None: raise Exception("unexpected error") - subtypes.append((field_name, dt)) # append tuple + subtypes.append((field["name"], dt)) # append tuple dtRet = np.dtype(subtypes) - else: dtRet = createBaseDataType(typeItem) # create non-compound dt return dtRet + + +def validateTypeItem(typeItem): + """ + Validate a json type - call createDataType and if no exception, + it's valid + """ + createDataType(typeItem) + # throws KeyError, TypeError, or ValueError + + +def getBaseTypeJson(type_name): + """ + Return JSON representation of a predefined type string + """ + predefined_int_types = ( + "H5T_STD_I8", + "H5T_STD_U8", + "H5T_STD_I16", + "H5T_STD_U16", + "H5T_STD_I32", + "H5T_STD_U32", + "H5T_STD_I64", + "H5T_STD_U64", + ) + predefined_float_types = ("H5T_IEEE_F16", "H5T_IEEE_F32", "H5T_IEEE_F64") + type_json = {} + # predefined typenames start with 'H5T' and end with "LE" or "BE" + if all( + ( + type_name.startswith("H5T_"), + type_name[-1] == "E", + type_name[-2] in ("L", "B"), + ) + ): + # trime of the "BE/"LE" + type_prefix = type_name[:-2] + if type_prefix in predefined_int_types: + type_json["class"] = "H5T_INTEGER" + type_json["base"] = type_name + elif type_prefix in predefined_float_types: + type_json["class"] = "H5T_FLOAT" + type_json["base"] = type_name + else: + raise TypeError("Invalid type name") + else: + raise TypeError("Invalid type name") + return type_json + + +def getSubType(dt_parent, fields): + """ Return a dtype that is a compound type composed of + the fields given in the field_names list + """ + if len(dt_parent) == 0: + raise TypeError("getSubType - parent must be 
compound type") + if not fields: + raise TypeError("null field specification") + if isinstance(fields, str): + fields = [fields,] # convert to a list + + field_names = set(dt_parent.names) + dt_items = [] + for field in fields: + if field not in field_names: + raise TypeError(f"field: {field} is not defined in parent type") + dt_items.append((field, dt_parent[field])) + dt = np.dtype(dt_items) + + return dt diff --git a/src/h5json/objid.py b/src/h5json/objid.py new file mode 100644 index 0000000..598790e --- /dev/null +++ b/src/h5json/objid.py @@ -0,0 +1,481 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HDF (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# objID: +# id (uuid) related functions +# + + +import hashlib +import uuid + +S3_URI = "s3://" +FILE_URI = "file://" +AZURE_URI = "blob.core.windows.net/" # preceded with "https://" +UUID_LEN = 36 # length for uuid strings + + +def _getStorageProtocol(uri): + """ returns 's3://', 'file://', or 'https://...net/' prefix if present. + If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer + (references Azure blob storage), return: https://myaccount.blob.core.windows.net/ + otherwise None """ + + if not uri: + protocol = None + elif uri.startswith(S3_URI): + protocol = S3_URI + elif uri.startswith(FILE_URI): + protocol = FILE_URI + elif uri.startswith("https://") and uri.find(AZURE_URI) > 0: + n = uri.find(AZURE_URI) + len(AZURE_URI) + protocol = uri[:n] + elif uri.find("://") >= 0: + raise ValueError(f"storage uri: {uri} not supported") + else: + protocol = None + return protocol + + +def _getBaseName(uri): + """ Return the part of the URI after the storage protocol (if any) """ + + protocol = _getStorageProtocol(uri) + if not protocol: + return uri + else: + return uri[len(protocol):] + + +def _getPrefixForCollection(collection): + """ Return prefix character for given collection type """ + collection = collection.lower() + + if collection in ("group", "groups"): + return 'g' + elif collection in ("dataset", "datasets"): + return 'd' + elif collection in ("datatype", "datatypes"): + return 't' + elif collection in ("chunk", "chunks"): + return 'c' + else: + raise ValueError(f"unexpected collection type: {collection}") + + +def getIdHash(id): + """Return md5 prefix based on id value""" + m = hashlib.new("md5") + m.update(id.encode("utf8")) + hexdigest = m.hexdigest() + return hexdigest[:5] + + +def isSchema2Id(id): + """return true if this is a v2 id""" + # v1 ids are in the standard UUID format: 8-4-4-4-12 + # v2 ids are in the non-standard: 8-8-4-6-6 + parts = id.split("-") + if len(parts) != 6: + raise ValueError(f"Unexpected id formation for uuid: {id}") + if len(parts[2]) == 8: + return True + else: + return False + + +def getIdHexChars(id): + """get the hex chars of the given id""" + if id[0] == "c": + # don't include chunk index + index = id.index("_") + parts = id[0:index].split("-") + else: + parts = id.split("-") + if len(parts) != 6: + raise ValueError(f"Unexpected id format 
for uuid: {id}") + return "".join(parts[1:]) + + +def hexRot(ch): + """rotate hex character by 8""" + return format((int(ch, base=16) + 8) % 16, "x") + + +def isRootObjId(id): + """returns true if this is a root id (only for v2 schema)""" + if not isSchema2Id(id): + raise ValueError("isRootObjId can only be used with v2 ids") + validateUuid(id) # will throw ValueError exception if not a objid + if id[0] != "g": + return False # not a group + token = getIdHexChars(id) + # root ids will have last 16 chars rotated version of the first 16 + is_root = True + for i in range(16): + if token[i] != hexRot(token[i + 16]): + is_root = False + break + return is_root + + +def getRootObjId(id): + """returns root id for this objid if this is a root id + (only for v2 schema) + """ + if isRootObjId(id): + return id # this is the root id + token = list(getIdHexChars(id)) + # root ids will have last 16 chars rotated version of the first 16 + for i in range(16): + token[i + 16] = hexRot(token[i]) + token = "".join(token) + root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20] + root_id += "-" + token[20:26] + "-" + token[26:32] + + return root_id + + +def createObjId(obj_type=None, root_id=None): + """ create a new objid + + if obj_type is None, return just a bare uuid. + Otherwise a hsds v2 schema obj_id will be created. + In this case obj_type should be one of "groups", + "datasets", "datatypes", "chunks". If rootid is + None, a root group obj_id will be created. Otherwise the + obj_id will be a an id that has root_id as it's root. """ + + prefix = None + if obj_type is None: + # just return a regular uuid + objid = str(uuid.uuid4()) + else: + + prefix = _getPrefixForCollection(obj_type) + # schema v2 + salt = uuid.uuid4().hex + # take a hash to randomize the uuid + token = list(hashlib.sha256(salt.encode()).hexdigest()) + + if root_id: + # replace first 16 chars of token with first 16 chars of root id + root_hex = getIdHexChars(root_id) + token[0:16] = root_hex[0:16] + else: + if obj_type != "groups": + raise ValueError("expected 'groups' obj_type for root group id") + # use only 16 chars, but make it look a 32 char id + for i in range(16): + token[16 + i] = hexRot(token[i]) + # format as a string + token = "".join(token) + objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-" + objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32] + + return objid + + +def getS3Key(id): + """Return s3 key for given id. + + For schema v1: + A md5 prefix is added to the front of the returned key to better + distribute S3 objects. + For schema v2: + The id is converted to the pattern: "db/{rootid[0:16]}" for rootids and + "db/id[0:16]/{prefix}/id[16-32]" for other ids + Chunk ids have the chunk index added after the slash: + "db/id[0:16]/d/id[16:32]/x_y_z + + For domain id's: + Return a key with the .domain suffix and no preceding slash. + For non-default buckets, use the format: /s3_key + If the id has a storage specifier ("s3://", "file://", etc.) + include that along with the bucket name. e.g.: "s3://mybucket/a_folder/a_file.h5" + """ + + base_id = _getBaseName(id) # strip any s3://, etc. 
+ if base_id.find("/") > 0: + # a domain id + domain_suffix = ".domain.json" + index = base_id.find("/") + 1 + key = base_id[index:] + if not key.endswith(domain_suffix): + if key[-1] != "/": + key += "/" + key += domain_suffix + else: + if isSchema2Id(id): + # schema v2 id + hexid = getIdHexChars(id) + prefix = id[0] # one of g, d, t, c + if prefix not in ("g", "d", "t", "c"): + raise ValueError(f"Unexpected id: {id}") + + if isRootObjId(id): + key = f"db/{hexid[0:8]}-{hexid[8:16]}" + else: + partition = "" + if prefix == "c": + # use 'g' so that chunks will show up under their dataset + s3col = "d" + n = id.find("-") + if n > 1: + # extract the partition index if present + partition = "p" + id[1:n] + else: + s3col = prefix + key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}" + key += f"-{hexid[20:26]}-{hexid[26:32]}" + if prefix == "c": + if partition: + key += "/" + key += partition + # add the chunk coordinate + index = id.index("_") # will raise ValueError if not found + n = index + 1 + coord = id[n:] + key += "/" + key += coord + elif prefix == "g": + # add key suffix for group + key += "/.group.json" + elif prefix == "d": + # add key suffix for dataset + key += "/.dataset.json" + else: + # add key suffix for datatype + key += "/.datatype.json" + else: + # v1 id + # schema v1 id + idhash = getIdHash(id) + key = f"{idhash}-{id}" + + return key + + +def getObjId(s3key): + """Return object id given valid s3key""" + if all( + ( + len(s3key) >= 44 and s3key[0:5].isalnum(), + len(s3key) >= 44 and s3key[5] == "-", + len(s3key) >= 44 and s3key[6] in ("g", "d", "c", "t"), + ) + ): + # v1 obj keys + objid = s3key[6:] + elif s3key.endswith("/.domain.json"): + objid = "/" + s3key[: -(len("/.domain.json"))] + elif s3key.startswith("db/"): + # schema v2 object key + parts = s3key.split("/") + chunk_coord = "" # used only for chunk ids + partition = "" # likewise + token = [] + for ch in parts[1]: + if ch != "-": + token.append(ch) + + if len(parts) == 3: + # root id + # last part should be ".group.json" + if parts[2] != ".group.json": + raise ValueError(f"unexpected S3Key: {s3key}") + # add 16 more chars using rotated version of first 16 + for i in range(16): + token.append(hexRot(token[i])) + prefix = "g" + elif len(parts) == 5: + # group, dataset, or datatype or chunk + for ch in parts[3]: + if ch != "-": + token.append(ch) + + if parts[2] == "g" and parts[4] == ".group.json": + prefix = "g" # group json + elif parts[2] == "t" and parts[4] == ".datatype.json": + prefix = "t" # datatype json + elif parts[2] == "d": + if parts[4] == ".dataset.json": + prefix = "d" # dataset json + else: + # chunk object + prefix = "c" + chunk_coord = "_" + parts[4] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + elif len(parts) == 6: + # chunk key with partitioning + for ch in parts[3]: + if ch != "-": + token.append(ch) + if parts[2][0] != "d": + raise ValueError(f"unexpected S3Key: {s3key}") + prefix = "c" + partition = parts[4] + if partition[0] != "p": + raise ValueError(f"unexpected S3Key: {s3key}") + partition = partition[1:] # strip off the p + chunk_coord = "_" + parts[5] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + + token = "".join(token) + objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16] + objid += "-" + token[16:20] + "-" + token[20:26] + "-" + objid += token[26:32] + chunk_coord + else: + msg = f"unexpected S3Key: {s3key}" + raise ValueError(msg) + return objid + + +def isS3ObjKey(s3key): + """ return True if this is a storage key """ + valid = 
False + try: + objid = getObjId(s3key) + if objid: + valid = True + except KeyError: + pass # ignore + except ValueError: + pass # ignore + return valid + + +def getCollectionForId(obj_id): + """return groups/datasets/datatypes based on id""" + if not isinstance(obj_id, str): + raise ValueError("invalid object id") + collection = None + if obj_id.startswith("g-"): + collection = "groups" + elif obj_id.startswith("d-"): + collection = "datasets" + elif obj_id.startswith("t-"): + collection = "datatypes" + else: + raise ValueError("not a collection id") + return collection + + +def validateUuid(id, obj_class=None): + """ verify the UUID is well-formed + schema can be: + None: expecting ordinary UUID + "v1": expecting HSDS v1 format + "v2": expecting HSDS v2 format + if set obj_class can be one of "groups", "datasets", "datatypes" + """ + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) < UUID_LEN: + raise ValueError("id is too short to be an object identifier") + if len(id) == UUID_LEN: + if obj_class: + # expected a prefix + raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}") + else: + # does this have a v1 schema hash tag? + # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + if id[:5].isalnum() and id[5] == '-': + id = id[6:] # trim off the hash tag + # validate prefix + if id[0] not in ("g", "d", "t", "c"): + raise ValueError("Unexpected prefix") + if id[0] != "c" and id[1] != "-": + # chunk ids may have a partition index following the c + raise ValueError("Unexpected prefix") + if obj_class is not None: + obj_class = obj_class.lower() + if id[0] != _getPrefixForCollection(obj_class): + raise ValueError(f"unexpected object id {id} for collection: {obj_class}") + if id[0] == "c": + # trim the type char and any partition id + n = id.find("-") + if n == -1: + raise ValueError("Invalid chunk id") + + # trim the chunk index for chunk ids + m = id.find("_") + if m == -1: + raise ValueError("Invalid chunk id") + n += 1 + id = "c-" + id[n:m] + id = id[2:] + if len(id) != UUID_LEN: + # id should be 36 now + raise ValueError("Unexpected id length") + + for ch in id: + if ch.isalnum(): + continue + if ch == "-": + continue + raise ValueError(f"Unexpected character in uuid: {ch}") + + +def isValidUuid(id, obj_class=None): + try: + validateUuid(id, obj_class) + return True + except ValueError: + return False + + +def isValidChunkId(id): + if not isValidUuid(id): + return False + if id[0] != "c": + return False + return True + + +def getClassForObjId(id): + """return domains/chunks/groups/datasets/datatypes based on id""" + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) == 0: + raise ValueError("Empty string") + if id[0] == "/": + return "domains" + if isValidChunkId(id): + return "chunks" + else: + return getCollectionForId(id) + + +def isObjId(id): + """return true if uuid or domain""" + if not isinstance(id, str) or len(id) == 0: + return False + if id.find("/") > 0: + # domain id is any string in the form / + return True + return isValidUuid(id) + + +def getUuidFromId(id): + """strip off the type prefix ('g-' or 'd-', or 't-') + and return the uuid part""" + if len(id) == UUID_LEN: + # just a uuid + return id + elif len(id) == UUID_LEN + 2: + # 'g-', 'd-', or 't-' prefix + return id[2:] + else: + raise ValueError(f"Unexpected obj_id: {id}") diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 6a310c6..9ac6578 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ 
-841,7 +841,6 @@ def testCreateReferenceAttribute(self): ] db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) item = db.getAttributeItem("groups", root_uuid, "A1") - attr_type = item["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") @@ -1275,7 +1274,6 @@ def testGetEvalStr(self): for query in queries.keys(): eval_str = db._getEvalStr(query, fields) self.assertEqual(eval_str, queries[query]) - # print(query, "->", eval_str) def testBadQuery(self): queries = ( diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index 0f67d7b..dbc806b 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -2,8 +2,8 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. If you do not have access to this file, you may # @@ -12,11 +12,12 @@ import unittest import logging import numpy as np -from h5py import special_dtype -from h5py import check_dtype -from h5py import Reference -from h5py import RegionReference + from h5json import hdf5dtype +from h5json.hdf5dtype import special_dtype +from h5json.hdf5dtype import check_dtype +from h5json.hdf5dtype import Reference +from h5json.hdf5dtype import RegionReference class Hdf5dtypeTest(unittest.TestCase): @@ -26,6 +27,31 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.INFO) + def testGetBaseTypeJson(self): + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F64LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F64LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F16LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F16LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_STD_I32LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_INTEGER") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_STD_I32LE") + + try: + hdf5dtype.getBaseTypeJson("foobar") + self.assertTrue(False) + except TypeError: + pass # expected + def testBaseIntegerTypeItem(self): dt = np.dtype(" 0: + continue # bucket name gets lost when domain ids get converted to s3keys + objid = getObjId(s3key) + self.assertEqual(objid, item) + for item in bad_ids: + self.assertFalse(isValidUuid(item)) + self.assertFalse(isObjId(item)) + + def testGetCollection(self): + group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" + dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" + ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" + bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + try: + getCollectionForId(bad_id) + self.assertTrue(False) + except ValueError: + pass # expected + 
try: + getCollectionForId(None) + self.assertTrue(False) + except ValueError: + pass # expected + + def testSchema2Id(self): + root_id = createObjId("groups") + group_id = createObjId("groups", root_id=root_id) + dataset_id = createObjId("datasets", root_id=root_id) + ctype_id = createObjId("datatypes", root_id=root_id) + + self.assertEqual(getCollectionForId(root_id), "groups") + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + chunk_id = "c" + dataset_id[1:] + "_1_2" + chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" + + for id in (chunk_id, chunk_partition_id): + try: + getCollectionForId(id) + self.assertTrue(False) + except ValueError: + pass # expected + valid_ids = ( + group_id, + dataset_id, + ctype_id, + chunk_id, + chunk_partition_id, + root_id, + ) + s3prefix = getS3Key(root_id) + self.assertTrue(s3prefix.endswith("/.group.json")) + s3prefix = s3prefix[: -(len(".group.json"))] + for oid in valid_ids: + self.assertTrue(len(oid) >= 38) + parts = oid.split("-") + self.assertEqual(len(parts), 6) + self.assertTrue(oid[0] in ("g", "d", "t", "c")) + self.assertTrue(isSchema2Id(oid)) + if oid == root_id: + self.assertTrue(isRootObjId(oid)) + else: + self.assertFalse(isRootObjId(oid)) + + s3key = getS3Key(oid) + self.assertTrue(s3key.startswith(s3prefix)) + self.assertEqual(getObjId(s3key), oid) + self.assertTrue(isS3ObjKey(s3key)) + + +if __name__ == "__main__": + # setup test files + + unittest.main()
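
For reviewers who want to exercise the new id helpers end to end, here is a minimal
sketch of the intended round trip (ids are randomly generated, so the values in the
comments are illustrative; all four functions are exported from h5json per the
__init__.py change above):

    from h5json import createObjId, getCollectionForId, getS3Key, getObjId

    root_id = createObjId("groups")                     # v2 root group id, "g-..."
    dset_id = createObjId("datasets", root_id=root_id)  # shares the root's hex prefix
    assert getCollectionForId(dset_id) == "datasets"

    key = getS3Key(dset_id)  # "db/<root hex>/d/<obj hex>/.dataset.json"
    assert getObjId(key) == dset_id  # storage keys map back to object ids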