Skip to content

Commit 005df01

Browse files
committed
Merge commit 'f77d4f5db9cbab4488a85f078dc7b5aaa08d0251' into update-h5z-sperr
2 parents e4abd44 + f77d4f5 commit 005df01

16 files changed

+1660
-137
lines changed

Diff for: src/H5Z-SPERR/.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# vim temp files
2+
*.sw?
3+
14
# Prerequisites
25
*.d
36

Diff for: src/H5Z-SPERR/CMakeLists.txt

+32-2
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
cmake_minimum_required(VERSION 3.14)
22

3-
project(H5Z-MD5 VERSION 0.1.3 LANGUAGES C DESCRIPTION "HDF5 plugin for SPERR compression")
3+
project(H5Z-SPERR VERSION 0.2.3 LANGUAGES C CXX DESCRIPTION "HDF5 plugin for SPERR compression")
44

55
set(CMAKE_C_STANDARD 11)
6+
set(CMAKE_CXX_STANDARD 14)
67
if(NOT CMAKE_BUILD_TYPE)
78
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
89
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
910
endif()
1011

1112
option( BUILD_SHARED_LIBS "Build shared libraries" ON )
1213
option( BUILD_CLI_UTILITIES "Build a set of command line utilities" ON )
14+
option( BUILD_UNIT_TESTS "Build unit tests using GoogleTest" OFF )
1315
option( H5ZPLUGIN_PREFER_RPATH "Set RPATH; this can fight with package managers
1416
so turn off when building for them" ON )
1517
mark_as_advanced(FORCE H5ZPLUGIN_PREFER_RPATH)
@@ -42,11 +44,39 @@ if( BUILD_CLI_UTILITIES )
4244
add_subdirectory( utilities ${CMAKE_BINARY_DIR}/bin )
4345
endif()
4446

47+
#
48+
# Build unit tests
49+
#
50+
if( BUILD_UNIT_TESTS )
51+
# Control internal options of GoogleTest
52+
#
53+
set( INSTALL_GTEST OFF CACHE INTERNAL "Not install GoogleTest")
54+
set( BUILD_GMOCK ON CACHE INTERNAL "Build gmock")
55+
56+
# Let's use the new mechanism to incorporate GoogleTest
57+
#
58+
include(FetchContent)
59+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24")
60+
FetchContent_Declare( googletest
61+
URL https://github.com/google/googletest/archive/refs/heads/main.zip
62+
DOWNLOAD_EXTRACT_TIMESTAMP NEW )
63+
endif()
64+
65+
# Prevent overriding the parent project's compiler/linker settings on Windows
66+
#
67+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
68+
FetchContent_MakeAvailable(googletest)
69+
70+
enable_testing() # calling this function before adding subdirectory to enable
71+
# invoking ctest from the top-level build directory.
72+
add_subdirectory( test_scripts )
73+
endif()
74+
4575
#
4676
# Start installation using GNU installation rules
4777
#
4878
include( GNUInstallDirs )
49-
install( TARGETS h5z-sperr LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
79+
install( TARGETS h5z-sperr h5z-clamp LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
5080

5181
if( BUILD_CLI_UTILITIES )
5282
install( TARGETS generate_cd_values decode_cd_values

Diff for: src/H5Z-SPERR/README.md

+75-8
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,66 @@ export HDF5_PLUGIN_PATH=/path/to/install/this/plugin
2424
```
2525
The user program does not need to link to this plugin or SPERR; it only needs to specify the plugin ID of `32028`.
2626

27+
<!--
28+
## Use in NetCDF-4 APIs
29+
`H5Z-SPERR` also facilitates the application of SPERR compression on
30+
[NetCDF-4 files](https://docs.unidata.ucar.edu/netcdf/NUG/md_filters.html#filters_enable);
31+
one simply needs to define the filter on a variable:
32+
```C
33+
nc_def_var_filter(ncid, varid, 32028, 1, &cd_values);
34+
```
35+
See a complete example [here](https://github.com/NCAR/H5Z-SPERR/blob/main/example/simple_xy_nc4_wr.c).
36+
-->
37+
38+
## Use in Python
39+
`H5Z-SPERR` version `0.1.3` is supported by the Python package [hdf5plugin](https://github.com/silx-kit/hdf5plugin)
40+
since version `5.0.0`.
41+
One can install the package by issuing
42+
```bash
43+
pip install hdf5plugin [--user] # using pip
44+
conda install -c conda-forge hdf5plugin # using conda
45+
```
46+
and use it by importing these two packages:
47+
```python
48+
import h5py # provide general HDF5 support
49+
import hdf5plugin # provide HDF5 plugin support
50+
```
51+
52+
## Handling of Missing Values
53+
Simulation models sometimes use a special value (i.e., missing value) to indicate that there's no meaningful value at that specific location.
54+
For example, in an ocean simulation, all the land area is marked by missing values.
55+
The most often seen missing values are either `NaN`, or an extremely large value, such as `1e35` or `-9.9e35`.
56+
When these missing values participate in compression, they easily introduce numeric error and result in data corruption.
57+
58+
`H5Z-SPERR` can handle common missing values with a little help from the user.
59+
Specifically, a user can indicate that there are potentially missing values,
60+
and `H5Z-SPERR` will use a compact bitmask to keep track of where exactly those missing values are.
61+
`H5Z-SPERR` then replaces them with a value that is friendly to SPERR compression before passing the field to the SPERR compressor.
62+
63+
During decompression, `H5Z-SPERR` fills the original missing value at locations indicated by the compact bitmask.
64+
Specifically, if the original missing values are `NaN`, then `NaN` will be filled. If the original missing values
65+
are values with a magnitude larger than `1e35`, then the *first occurrence* of such values will be used to fill
66+
in all missing value locations.
67+
68+
Users use an integer to indicate the potential existence of missing values:
69+
- Mode `0`: no missing values;
70+
- Mode `1`: there are potential `NaN`s;
71+
- Mode `2`: there are potential values with a magnitude larger than `1e35`.
72+
73+
`H5Z-SPERR` behaves accordingly: (`1e35` denotes the *first occurrence* of such values)
74+
| Mode | Actual Input Data | Filter Behavior |
75+
|-----------|----------------------|------------------|
76+
| 0 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
77+
| 0 | Has `NaN` or `1e35` | :x: Likely numeric error |
78+
| 1 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
79+
| 1 | Has `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression; `NaN` is restored at its exact locations |
80+
| 1 | Regardless of `NaN`, has `1e35` | :x: Likely numeric error |
81+
| 2 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
82+
| 2 | No `NaN`, has `1e35` | :heavy_check_mark: Normal SPERR compression; `1e35` is restored at its exact locations |
83+
| 2 | Has `NaN`, regardless of `1e35` | :x: Likely numeric error |
84+
85+
**Final note:** if a variable is indicated to have missing values, but it actually does not, then no bitmask is involved and thus there is no storage overhead!
86+
2787
## Find `cd_values[]`
2888
To apply SPERR compression using the HDF5 plugin, one needs to specify 1) what compression mode and 2)
2989
what compression quality to use. Supported compression modes and qualities are summarized below:
@@ -42,7 +102,8 @@ and the `Z` rank to be varying the slowest, before the data is passed to the com
42102

43103
The HDF5 library takes in these compression parameters as one or more 32-bit `unsigned int` values,
44104
which are named `cd_values[]` in most HDF5 routines.
45-
In the case of `H5Z-SPERR`, there is exactly one `unsigned int` used to carry this information.
105+
In the case of `H5Z-SPERR`, there is exactly one `unsigned int` used to carry compression-related information, and
106+
possibly one more `unsigned int` to indicate the potential existence of missing values.
46107

47108
### Find `cd_values[]` Using the Programming Interface
48109
Using the HDF5 programming interface, `cd_values[]` carrying the compression parameters are passed
@@ -75,11 +136,17 @@ Please use this value as a single 32-bit unsigned integer in your applications.
75136
Note: an integer produced by `generate_cd_values` can be decoded by another command line tool, `decode_cd_values`,
76137
to show the coded compression parameters.
77138

78-
## Use in NetCDF-4 APIs
79-
`H5Z-SPERR` also facilitates the application of SPERR compression on
80-
[NetCDF-4 files](https://docs.unidata.ucar.edu/netcdf/NUG/md_filters.html#filters_enable);
81-
one simply needs to define the filter on a variable:
82-
```C
83-
nc_def_var_filter(ncid, varid, 32028, 1, &cd_values);
139+
### Examples
140+
Assuming the `nccopy` tool is used:
141+
```Bash
142+
# Compress variable VAR0, using fixed-rate compression, bitrate = 3.3, no special handling of missing values.
143+
nccopy -F "VAR0, 268651725u" <input_file> <output_file>
144+
nccopy -F "VAR0, 268651725u, 0" <input_file> <output_file>
145+
146+
# Compress variable VAR1, using fixed-rate compression, bitrate = 3.3. VAR1 might have NaNs!
147+
nccopy -F "VAR1, 268651725u, 1" <input_file> <output_file>
148+
149+
# Compress variable VAR2, using fixed-rate compression, bitrate = 3.3. VAR2 might have values such as 1e35!
150+
nccopy -F "VAR2, 268651725u, 2" <input_file> <output_file>
84151
```
85-
See a complete example [here](https://github.com/NCAR/H5Z-SPERR/blob/main/example/simple_xy_nc4_wr.c).
152+

Diff for: src/H5Z-SPERR/include/compactor.h

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/* This is a set of functions that compact a bitmask.
2+
* In the intended use case, a bitmask is produced by masking all
3+
* "missing values" or "fill values" in a model output with zero's,
4+
* whereas the locations with valid data points are marked with one's.
5+
* However, this compactor is likely to be effective with any bit patterns
6+
* that have lots of consecutive 0's or 1's.
7+
*
8+
* The bitmask compactor works in the following way:
9+
* 1. Assume that we use 32-bit ints; the compactor encodes 32 bits at a time.
10+
* 2. Every incoming int is encoded in one of three ways:
11+
* 2.1. For an int with all 0's, use a single 0 bit.
12+
* 2.2. For an int with all 1's, use two bits: 10.
13+
* 2.3. For all other ints, use 34 bits: two bits 11 then followed by the
14+
* verbose presentation of the 32-bit int.
15+
* 3. Obviously, it's the most economical to use a single 0 bit to present
16+
* the most frequent pattern (all 0's or all 1's). The encoder thus does
17+
* a test at the beginning and records the test result.
18+
*/
19+
20+
#ifndef COMPACTOR_H
21+
#define COMPACTOR_H
22+
23+
#include <stdlib.h>
24+
#include <stdint.h>
25+
#include <assert.h>
26+
27+
#ifdef __cplusplus
28+
extern "C" {
29+
#endif
30+
31+
/* Return the compaction strategy to use:
32+
* 0: compact with all 0's being the most frequent
33+
* 1: compact with all 1's being the most frequent
34+
* Note: `bytes` has to be a multiple of 8.
35+
*/
36+
int compactor_strategy(const void* buf, size_t bytes);
37+
38+
/* Return the size in bytes of the resulting compacted bitstream, given an input buf.
39+
* Note: `buf_bytes` has to be a multiple of 8.
40+
*/
41+
size_t compactor_comp_size(const void* buf, size_t buf_bytes);
42+
43+
/* Return the number of useful bytes in a compacted bitstream.
44+
* This value is the same as the output of `compactor_comp_size()` during encoding.
45+
*/
46+
size_t compactor_useful_bytes(const void* comp_buf);
47+
48+
/* Return the useful size of the output bitstream,
49+
* which is the same as the output of `compactor_comp_size()`.
50+
* Note 1: the input bitmask length (in bytes) has to be a multiple of 8.
51+
* This requirement is inherited from the bitstream implementation.
52+
* Note 2: the output buffer length should be 1) a multiple of 8, and
53+
* 2) no less than the size returned by `compactor_comp_size()`.
54+
*/
55+
size_t compactor_encode(const void* bitmask,
56+
size_t bitmask_bytes,
57+
void* compact_bitstream,
58+
size_t compact_bitstream_bytes);
59+
60+
/* Return the number of useful bytes in the decoded bitmask.
61+
* Note: The number of useful bytes might be bigger than the number of bytes being
62+
* encoded, because of the word size that the compactor operates on.
63+
* Note: `compact_bitstream_bytes` should be a multiple of 8 that is no less than
64+
* the size returned by `compactor_encode()`.
65+
*/
66+
size_t compactor_decode(const void* compact_bitstream,
67+
size_t compact_bitstream_bytes,
68+
void* decoded_bitmask);
69+
70+
#ifdef __cplusplus
71+
}
72+
#endif
73+
74+
#endif

Diff for: src/H5Z-SPERR/include/h5zsperr_helper.h

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* This file contains a few helper functions for the H5Z-SPERR filter.
3+
*/
4+
5+
#ifndef H5ZSPERR_HELPER_H
6+
#define H5ZSPERR_HELPER_H
7+
8+
#include <stdlib.h>
9+
10+
#define LARGE_MAGNITUDE_F 1e35f
11+
#define LARGE_MAGNITUDE_D 1e35
12+
#define H5ZSPERR_COMPATIBILITY 1
13+
14+
#ifdef __cplusplus
15+
namespace C_API {
16+
extern "C" {
17+
#endif
18+
19+
/*
20+
* Pack and unpack additional information about the input data into an integer.
21+
* It returns the encoded unsigned int, which shouldn't be zero.
22+
* The packing function is called by `set_local()` to prepare information
23+
* for `H5Z_filter_sperr()`, which calls the unpack function to extract such info.
24+
*/
25+
unsigned int h5zsperr_pack_extra_info(int rank, int is_float, int missing_val_mode, int magic_num);
26+
void h5zsperr_unpack_extra_info(unsigned int meta,
27+
int* rank,
28+
int* is_float,
29+
int* missing_val_mode,
30+
int* magic_num);
31+
32+
/*
33+
* Check if an input array really has missing values.
34+
*/
35+
int h5zsperr_has_nan(const void* buf, size_t nelem, int is_float);
36+
int h5zsperr_has_large_mag(const void* buf, size_t nelem, int is_float);
37+
38+
/*
39+
* Produce a compact bitmask.
40+
* `mask_buf` is already allocated with length `mask_bytes`.
41+
* Returns 0 upon success.
42+
*/
43+
int h5zsperr_make_mask_nan(const void* data_buf, size_t nelem, int is_float,
44+
void* mask_buf, size_t mask_bytes, size_t* useful_bytes);
45+
int h5zsperr_make_mask_large_mag(const void* data_buf, size_t nelem, int is_float,
46+
void* mask_buf, size_t mask_bytes, size_t* useful_bytes);
47+
48+
/*
49+
* Replace every missing value in the `data_buf` with the mean of the field.
50+
*/
51+
float h5zsperr_treat_nan_f32(float* data_buf, size_t nelem);
52+
double h5zsperr_treat_nan_f64(double* data_buf, size_t nelem);
53+
float h5zsperr_treat_large_mag_f32(float* data_buf, size_t nelem);
54+
double h5zsperr_treat_large_mag_f64(double* data_buf, size_t nelem);
55+
56+
#ifdef __cplusplus
57+
} /* end of extern "C" */
58+
} /* end of namespace C_API */
59+
#endif
60+
61+
#endif

Diff for: src/H5Z-SPERR/include/icecream.h

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* This is a mimic of the Bitstream class in SPERR:
3+
* https://github.com/NCAR/SPERR/blob/main/include/Bitstream.h
4+
*
5+
* The most significant difference is that bitstream here doesn't manage
6+
* any memory; it reads a bit sequence from a user-provided memory buffer,
7+
* or writes a bit sequence to a user-provided memory buffer.
8+
*
9+
* The "object" is named `icecream` and all functions operating on it
10+
* are named with a prefix `icecream`.
11+
*/
12+
13+
#ifndef ICECREAM_H
14+
#define ICECREAM_H
15+
16+
#include <stdlib.h>
17+
#include <stdint.h>
18+
#include <assert.h>
19+
20+
#ifndef NDEBUG
21+
#include <stdio.h>
22+
#endif
23+
24+
#ifdef __cplusplus
25+
extern "C" {
26+
#endif
27+
28+
typedef struct {
29+
uint64_t* begin; /* begin of the stream */
30+
uint64_t* ptr; /* pointer to the next word to be read/written */
31+
uint64_t buffer; /* incoming/outgoing bits */
32+
int bits; /* number of buffered bits (0 <= bits < 64) */
33+
} icecream;
34+
35+
/*
36+
* Specify a bitstream to use memory provided by users.
37+
* NOTE: the memory length (in bytes) has to be a multiple of 8,
38+
* because the icecream class writes/reads in 64-bit integers.
39+
*/
40+
void icecream_use_mem(icecream* s, void* mem, size_t bytes);
41+
42+
/* Position the bitstream for reading or writing at the beginning. */
43+
void icecream_rewind(icecream* s);
44+
45+
/* Read a bit. Please don't read beyond the end of the stream. */
46+
int icecream_rbit(icecream* s);
47+
48+
/* Write a bit (0 or 1). Please don't write beyond the end of the stream. */
49+
void icecream_wbit(icecream* s, int bit);
50+
51+
/* Return the bit offset to the next bit to be read. */
52+
size_t icecream_rtell(icecream* s);
53+
54+
/* Return the bit offset to the next bit to be written. */
55+
size_t icecream_wtell(icecream* s);
56+
57+
/* Write any remaining buffered bits and align stream on next word boundary. */
58+
void icecream_flush(icecream* s);
59+
60+
#ifdef __cplusplus
61+
}
62+
#endif
63+
64+
#endif

Diff for: src/H5Z-SPERR/src/CMakeLists.txt

+14-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
1-
add_library( h5z-sperr h5z-sperr.c )
1+
#
2+
# The main H5Z-SPERR filter
3+
#
4+
add_library( h5z-sperr h5z-sperr.c
5+
h5zsperr_helper.cpp
6+
icecream.c
7+
compactor.c)
28
target_include_directories( h5z-sperr PUBLIC ${HDF5_INCLUDE_DIR}
39
PUBLIC ${SPERR_INCLUDE_DIRS}
410
PUBLIC ${CMAKE_SOURCE_DIR}/include )
511
target_link_libraries( h5z-sperr PUBLIC ${HDF5_LIBRARIES} PUBLIC PkgConfig::SPERR )
12+
13+
#
14+
# Experimental clamping filter
15+
#
16+
add_library( h5z-clamp h5z-clamp.c )
17+
target_include_directories( h5z-clamp PUBLIC ${HDF5_INCLUDE_DIR} )
18+
target_link_libraries( h5z-clamp PUBLIC ${HDF5_LIBRARIES} )

0 commit comments

Comments
 (0)