Commit e472167

Create SafeTensors reader implementation

1 parent 4bc70df commit e472167

File tree

7 files changed: +1148 −7 lines changed

docs/safetensors.md

Lines changed: 136 additions & 0 deletions
# SafeTensors Reader User Guide

The SafeTensors reader in VirtualiZarr allows you to reference tensors stored in SafeTensors files. This guide explains how to use the reader effectively.

## What is SafeTensors Format?

SafeTensors is a file format for storing tensors (multidimensional arrays) that offers several advantages:

- Safe: no use of pickle, eliminating arbitrary code execution concerns
- Efficient: zero-copy access for fast loading
- Simple: a straightforward binary format with a JSON header
- Language-agnostic: implementations exist for Python, Rust, C++, and JavaScript

The format consists of:

- 8 bytes: a little-endian uint64 giving the size of the header
- JSON header: metadata for every tensor (shape, dtype, byte offsets)
- Binary data: contiguous tensor data
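The three-part layout above can be parsed with nothing but the standard library. A minimal sketch (the tensor name, shape, and offsets are invented for illustration):

```python
import json
import struct

def read_safetensors_header(buf: bytes) -> dict:
    """Parse the JSON header from an in-memory SafeTensors file."""
    # First 8 bytes: header size as a little-endian uint64
    (header_size,) = struct.unpack("<Q", buf[:8])
    return json.loads(buf[8 : 8 + header_size])

# Build a tiny file in memory: one 2x2 float32 tensor (16 bytes of zeros)
header = {"weight": {"dtype": "F32", "shape": [2, 2], "data_offsets": [0, 16]}}
header_bytes = json.dumps(header).encode("utf-8")
blob = struct.pack("<Q", len(header_bytes)) + header_bytes + bytes(16)

print(read_safetensors_header(blob)["weight"]["shape"])  # [2, 2]
```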
## How VirtualiZarr's SafeTensors Reader Works

VirtualiZarr's SafeTensors reader allows you to:

- Work with tensors as xarray DataArrays with named dimensions
- Access specific slices of tensors from cloud storage
- Preserve metadata from the original SafeTensors file
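Conceptually, creating a virtual reference for a tensor amounts to translating its relative `data_offsets` (which count from the start of the binary data section) into an absolute byte range in the file. A sketch, assuming the header has already been parsed (the helper name is hypothetical):

```python
def absolute_byte_range(header_size: int, data_offsets: list) -> tuple:
    # The binary data section starts right after the 8-byte size prefix
    # and the JSON header itself.
    data_start = 8 + header_size
    begin, end = data_offsets
    return (data_start + begin, data_start + end)

# A tensor at relative offsets [0, 16) behind a 120-byte header occupies
# absolute bytes [128, 144) -- the range a reader would request remotely.
print(absolute_byte_range(120, [0, 16]))  # (128, 144)
```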
## Basic Usage

Opening a SafeTensors file is straightforward:

```python
import virtualizarr as vz

# Open a SafeTensors file
vds = vz.open_virtual_dataset("model.safetensors")

# Access tensors as xarray variables
weight = vds["weight"]
bias = vds["bias"]

# Convert to numpy arrays when needed
weight_array = weight.values
bias_array = bias.values
```
## Custom Dimension Names

By default, dimensions are named generically (e.g., "weight_dim_0", "weight_dim_1"). You can provide custom dimension names for better semantics:

```python
# Define custom dimension names
custom_dims = {
    "weight": ["input_dims", "output_dims"],
    "bias": ["output_dims"]
}

# Open with custom dimension names
vds = vz.open_virtual_dataset(
    "model.safetensors",
    virtual_backend_kwargs={"dimension_names": custom_dims}
)

# Now dimensions have meaningful names
print(vds["weight"].dims)  # ('input_dims', 'output_dims')
print(vds["bias"].dims)    # ('output_dims',)
```
## Loading Specific Variables

You can specify which variables to load as eager (in-memory) arrays instead of virtual references:

```python
# Load specific variables as eager arrays
vds = vz.open_virtual_dataset(
    "model_weights.safetensors",
    loadable_variables=["small_tensor1", "small_tensor2"]
)

# These are loaded as regular numpy-backed arrays
small_tensor1 = vds["small_tensor1"]
# Other tensors remain virtual references
large_tensor = vds["large_tensor"]
```
## Working with Remote Files

The SafeTensors reader supports reading from the HuggingFace Hub:

```python
# HuggingFace Hub
vds = vz.open_virtual_dataset(
    "https://huggingface.co/openai-community/gpt2/model.safetensors",
    virtual_backend_kwargs={"revision": "main"}
)
```

It also supports reading from object storage:

```python
# S3
vds = vz.open_virtual_dataset(
    "s3://my-bucket/model.safetensors",
    reader_options={
        "storage_options": {
            "key": "ACCESS_KEY",
            "secret": "SECRET_KEY",
            "region_name": "us-west-2"
        }
    }
)
```
## Accessing Metadata

SafeTensors files can contain metadata at both the file level and the tensor level:

```python
# Access file-level metadata
print(vds.attrs)

# Access tensor-specific metadata
print(vds["weight"].attrs)

# Access the original SafeTensors dtype information
original_dtype = vds["weight"].attrs["original_safetensors_dtype"]
print(f"Original dtype: {original_dtype}")
```
## Known Limitations

### Performance Considerations

- Very large tensors (>1GB) are treated as a single chunk, which may increase memory usage when accessing small slices
- Files with thousands of tiny tensors may incur overhead from metadata handling

## Best Practices

- **For large tensors**: use slicing to access only the portions you need
- **For remote files**: use appropriate credentials and optimize access patterns
- **For many small tensors**: consider loading them eagerly via `loadable_variables`
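To experiment with the examples in this guide without downloading real model weights, you can hand-assemble a tiny valid SafeTensors file using only the standard library (the file name and tensor contents below are illustrative):

```python
import json
import struct

def write_safetensors(path: str, name: str, shape: list, raw: bytes) -> None:
    # Emit the three sections of the format in order: 8-byte size prefix,
    # JSON header, contiguous binary data.
    header = {name: {"dtype": "F32", "shape": shape, "data_offsets": [0, len(raw)]}}
    header_bytes = json.dumps(header).encode("utf-8")
    with open(path, "wb") as f:
        f.write(struct.pack("<Q", len(header_bytes)))
        f.write(header_bytes)
        f.write(raw)

# A 2x2 float32 tensor of zeros named "weight" (4 elements * 4 bytes)
write_safetensors("tiny.safetensors", "weight", [2, 2], bytes(16))
```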

pyproject.toml

Lines changed: 12 additions & 7 deletions
```diff
@@ -51,6 +51,10 @@ hdf = [
     "imagecodecs",
     "imagecodecs-numcodecs==2024.6.1",
 ]
+safetensors = [
+    "safetensors",
+    "ml-dtypes",
+]

 # kerchunk-based readers
 hdf5 = [
@@ -70,6 +74,7 @@ fits = [
 ]
 all_readers = [
     "virtualizarr[hdf]",
+    "virtualizarr[safetensors]",
     "virtualizarr[hdf5]",
     "virtualizarr[netcdf3]",
     "virtualizarr[fits]",
@@ -175,7 +180,7 @@ rust = "*"
 run-mypy = { cmd = "mypy virtualizarr" }
 # Using '--dist loadscope' (rather than default of '--dist load' when '-n auto'
 # is used), reduces test hangs that appear to be macOS-related.
-run-tests = { cmd = "pytest -n auto --dist loadscope --run-network-tests --verbose" }
+run-tests = { cmd = "pytest -n auto --dist loadscope --run-network-tests --verbose --ignore=codemcp" }
 run-tests-no-network = { cmd = "pytest -n auto --verbose" }
 run-tests-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov=term-missing" }
 run-tests-xml-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov-report=xml" }
@@ -185,12 +190,12 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
 [tool.pixi.environments]
 min-deps = ["dev", "test", "hdf", "hdf5", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
 # Inherit from min-deps to get all the test commands, along with optional dependencies
-test = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore"]
-test-py311 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py311"] # test against python 3.11
-test-py312 = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312"] # test against python 3.12
-minio = ["dev", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312", "minio"]
-upstream = ["dev", "test", "hdf", "hdf5", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev"]
-all = ["dev", "test", "remote", "hdf", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "all_readers", "all_writers"]
+test = ["dev", "test", "remote", "hdf", "safetensors", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore"]
+test-py311 = ["dev", "test", "remote", "hdf", "safetensors", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py311"] # test against python 3.11
+test-py312 = ["dev", "test", "remote", "hdf", "safetensors", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312"] # test against python 3.12
+minio = ["dev", "remote", "hdf", "safetensors", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "py312", "minio"]
+upstream = ["dev", "test", "hdf", "safetensors", "hdf5", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev"]
+all = ["dev", "test", "remote", "hdf", "safetensors", "hdf5", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "obstore", "all_readers", "all_writers"]
 docs = ["docs"]

 # Define commands to run within the docs environment
```

virtualizarr/backend.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -27,6 +27,7 @@
     HDFVirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
+    SafeTensorsVirtualBackend,
     TIFFVirtualBackend,
 )
 from virtualizarr.readers.api import VirtualBackend
@@ -45,6 +46,7 @@
     "kerchunk": KerchunkVirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
     "hdf5": HDFVirtualBackend,
+    "safetensors": SafeTensorsVirtualBackend,
     "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
     "netcdf3": NetCDF3VirtualBackend,
@@ -70,6 +72,7 @@ class FileType(AutoName):
     fits = auto()
     dmrpp = auto()
     kerchunk = auto()
+    safetensors = auto()


 def automatically_determine_filetype(
@@ -89,6 +92,8 @@ def automatically_determine_filetype(
     if Path(filepath).suffix == ".zarr":
         # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
         raise NotImplementedError()
+    elif Path(filepath).suffix.lower() == ".safetensors":
+        return FileType.safetensors

     # Read magic bytes from local or remote file
     fpath = _FsspecFSFromFilepath(
```

virtualizarr/manifests/store.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -150,6 +150,10 @@ def default_object_store(filepath: str) -> ObjectStore:
             virtual_hosted_style_request=False,
             region=_find_bucket_region(bucket),
         )
+    if parsed.scheme == "https" and parsed.netloc == "huggingface.co":
+        # TODO: a timeout can be passed via the client_options kwarg, e.g. {"timeout": "30s"}
+        # TODO: how to pass an HF token with obstore? Requires "authorization": f"Bearer {token}" in the header.
+        return obs.store.HTTPStore.from_url(url=f"{parsed.scheme}://{parsed.netloc}")

     raise NotImplementedError(f"{parsed.scheme} is not yet supported")
```

virtualizarr/readers/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 from virtualizarr.readers.hdf5 import HDF5VirtualBackend
 from virtualizarr.readers.kerchunk import KerchunkVirtualBackend
 from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend
+from virtualizarr.readers.safetensors import SafeTensorsVirtualBackend
 from virtualizarr.readers.tiff import TIFFVirtualBackend

 __all__ = [
@@ -13,5 +14,6 @@
     "HDF5VirtualBackend",
     "KerchunkVirtualBackend",
     "NetCDF3VirtualBackend",
+    "SafeTensorsVirtualBackend",
     "TIFFVirtualBackend",
 ]
```
