Skip to content

Commit f77d4f5

Browse files
committed
Squashed 'src/H5Z-SPERR/' changes from 181c8c2cf..9a825a5ff
9a825a5ff Update version to 0.2.3 7058d2a75 Merge pull request #14 from shaomeng/main 02fd24d65 Merge branch 'NCAR:main' into main 804d0bf66 be able to read binaries produced by version 0.1.x 8775d06fa Merge pull request #13 from shaomeng/main 20ccecb74 Merge branch 'NCAR:main' into main 1e009bcb8 make it an error when compatibility versions don't match d6a744017 Merge pull request #12 from shaomeng/main 13f3f6b73 fix a windows compilation issue c07099a70 Add experimental clamping filter f1d35dcc0 add a mini filter: h5z-clamp, with a filter ID of 45678 18fd6f9ce Update README.md f9f4ee28e Merge pull request #10 from shaomeng/bitmask_compactor 68e7942e3 minor d7604708c bumps version number f25ecfb24 Update README.md e745e5bf4 minor 49fe13695 Update README.md 65ca29cec Update README.md with the filter behavior table 18bd130f6 correct mean calculation 5366f22a6 finish decompression code. Need to run tests 2ea811491 finished compression routine, work on decompression a6b9c5cb5 WIP: compression 9db422acf improve the compactor ca6df4c8f improve the replace functions 01af6d609 implement replace function 743b9d4cc add function h5zsperr_make_mask_nan(), still need to write a unit test for it 2d9e50d88 remove missing_value_mode 3 and 4, so my job is easier now b5a0d05cf check in 69f885ee9 encode the magic number in set_local() 429ee9d61 use C++ to implement helper functions b03cba98e function name change c6de1094e check in f0ff97bf8 finish packing and unpacking cd_values[] 06bc330fd re-work on the pack data type function 4d3954fdd add functions to check if an input array really has the specified type of missing values ea73f7b02 use a separate file h5zsperr_helper to keep helper functions 90ae7d9ed set_local() function considers missing value flag 60148fc3b add another compactor unit test a5eca6efe compactor works with encoding and decoding 6af5b99a6 improve compactor_comp_size() 76a43720e add compactor_comp_size() function 881ea955a change name to be compactor 8b671bfc8 
rename bitstream to icecream 7b9af10cb remove the end pointer fee5981c7 specify and test the memory usage of bitstream 3f262de1f implement rtell() 3672f78a0 finish bitstream class, adding unit tests 20b62230f WIP: bitstream class 4cf8a36d5 start working on bitmaskt compactor, add first function ea0aec35a improve README 4c29f3efd Merge pull request #8 from NCAR/chunk_dim_check 6ea4bf485 check that dataspace dimension can be divided by the chunk dimension e344ce900 add link to hdf5plugin git-subtree-dir: src/H5Z-SPERR git-subtree-split: 9a825a5ff4de84458f092bc63135e466bc68c0cb
1 parent 2f4abee commit f77d4f5

16 files changed

+1660
-137
lines changed

Diff for: .gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# vim temp files
2+
*.sw?
3+
14
# Prerequisites
25
*.d
36

Diff for: CMakeLists.txt

+32-2
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
cmake_minimum_required(VERSION 3.14)
22

3-
project(H5Z-MD5 VERSION 0.1.3 LANGUAGES C DESCRIPTION "HDF5 plugin for SPERR compression")
3+
project(H5Z-SPERR VERSION 0.2.3 LANGUAGES C CXX DESCRIPTION "HDF5 plugin for SPERR compression")
44

55
set(CMAKE_C_STANDARD 11)
6+
set(CMAKE_CXX_STANDARD 14)
67
if(NOT CMAKE_BUILD_TYPE)
78
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
89
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
910
endif()
1011

1112
option( BUILD_SHARED_LIBS "Build shared libraries" ON )
1213
option( BUILD_CLI_UTILITIES "Build a set of command line utilities" ON )
14+
option( BUILD_UNIT_TESTS "Build unit tests using GoogleTest" OFF )
1315
option( H5ZPLUGIN_PREFER_RPATH "Set RPATH; this can fight with package managers
1416
so turn off when building for them" ON )
1517
mark_as_advanced(FORCE H5ZPLUGIN_PREFER_RPATH)
@@ -42,11 +44,39 @@ if( BUILD_CLI_UTILITIES )
4244
add_subdirectory( utilities ${CMAKE_BINARY_DIR}/bin )
4345
endif()
4446

47+
#
48+
# Build unit tests
49+
#
50+
if( BUILD_UNIT_TESTS )
51+
# Control internal options of GoogleTest
52+
#
53+
set( INSTALL_GTEST OFF CACHE INTERNAL "Not install GoogleTest")
54+
set( BUILD_GMOCK ON CACHE INTERNAL "Build gmock")
55+
56+
# Let's use the new mechanism to incorporate GoogleTest
57+
#
58+
include(FetchContent)
59+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24")
60+
FetchContent_Declare( googletest
61+
URL https://github.com/google/googletest/archive/refs/heads/main.zip
62+
DOWNLOAD_EXTRACT_TIMESTAMP NEW )
63+
endif()
64+
65+
# Prevent overriding the parent project's compiler/linker settings on Windows
66+
#
67+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
68+
FetchContent_MakeAvailable(googletest)
69+
70+
enable_testing() # calling this function before adding subdirectory to enable
71+
# invoking ctest from the top-level build directory.
72+
add_subdirectory( test_scripts )
73+
endif()
74+
4575
#
4676
# Start installation using GNU installation rules
4777
#
4878
include( GNUInstallDirs )
49-
install( TARGETS h5z-sperr LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
79+
install( TARGETS h5z-sperr h5z-clamp LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
5080

5181
if( BUILD_CLI_UTILITIES )
5282
install( TARGETS generate_cd_values decode_cd_values

Diff for: README.md

+75-8
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,66 @@ export HDF5_PLUGIN_PATH=/path/to/install/this/plugin
2424
```
2525
The user program does not need to link to this plugin or SPERR; it only needs to specify the plugin ID of `32028`.
2626

27+
<!--
28+
## Use in NetCDF-4 APIs
29+
`H5Z-SPERR` also facilitates the application of SPERR compression on
30+
[NetCDF-4 files](https://docs.unidata.ucar.edu/netcdf/NUG/md_filters.html#filters_enable);
31+
one simply needs to define the filter on a variable:
32+
```C
33+
nc_def_var_filter(ncid, varid, 32028, 1, &cd_values);
34+
```
35+
See a complete example [here](https://github.com/NCAR/H5Z-SPERR/blob/main/example/simple_xy_nc4_wr.c).
36+
-->
37+
38+
## Use in Python
39+
`H5Z-SPERR` version `0.1.3` is supported by the Python package [hdf5plugin](https://github.com/silx-kit/hdf5plugin)
40+
since version `5.0.0`.
41+
One can install the package by issuing
42+
```bash
43+
pip install hdf5plugin [--user] # using pip
44+
conda install -c conda-forge hdf5plugin # using conda
45+
```
46+
and use it by importing these two packages:
47+
```python
48+
import h5py # provide general HDF5 support
49+
import hdf5plugin # provide HDF5 plugin support
50+
```
51+
52+
## Handling of Missing Values
53+
Simulation models sometimes use a special value (i.e., missing value) to indicate that there's no meaningful value at that specific location.
54+
For example, in an ocean simulation, all the land area is marked by missing values.
55+
The most often seen missing values are either `NaN`, or an extremely large value, such as `1e35` or `-9.9e35`.
56+
When these missing values participate in compression, they easily introduce numeric error and result in data corruption.
57+
58+
`H5Z-SPERR` can handle common missing values with a little help from the user.
59+
Specifically, a user can indicate that there are potentially missing values,
60+
and `H5Z-SPERR` will use a compact bitmask to keep track of where exactly those missing values are.
61+
`H5Z-SPERR` then replaces them with a value that is friendly to SPERR compression before passing the field to the SPERR compressor.
62+
63+
During decompression, `H5Z-SPERR` fills the original missing value at locations indicated by the compact bitmask.
64+
Specifically, if the original missing values are `NaN`, then `NaN` will be filled. If the original missing values
65+
are values with a magnitude larger than `1e35`, then the *first occurrence* of such values will be used to fill
66+
in all missing value locations.
67+
68+
Users use an integer to indicate the potential existence of missing values:
69+
- Mode `0`: no missing values;
70+
- Mode `1`: there are potential `NaN`s;
71+
- Mode `2`: there are potential values with a magnitude larger than `1e35`.
72+
73+
`H5Z-SPERR` behaves accordingly: (`1e35` denotes the *first occurrence* of such values)
74+
| Mode | Actual Input Data | Filter Behavior |
75+
|-----------|----------------------|------------------|
76+
| 0 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
77+
| 0 | Has `NaN` or `1e35` | :x: Likely numeric error |
78+
| 1 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
79+
| 1 | Has `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression; `NaN` is restored at its exact locations |
80+
| 1 | Regardless of `NaN`, has `1e35` | :x: Likely numeric error |
81+
| 2 | No `NaN`, no `1e35` | :heavy_check_mark: Normal SPERR compression |
82+
| 2 | No `NaN`, has `1e35` | :heavy_check_mark: Normal SPERR compression; `1e35` is restored at its exact locations |
83+
| 2 | Has `NaN`, regardless of `1e35` | :x: Likely numeric error |
84+
85+
**Final note:** if a variable is indicated to have missing values, but it actually does not, then there are no bitmasks involved and thus no storage overhead!
86+
2787
## Find `cd_values[]`
2888
To apply SPERR compression using the HDF5 plugin, one needs to specify 1) what compression mode and 2)
2989
what compression quality to use. Supported compression modes and qualities are summarized below:
@@ -42,7 +102,8 @@ and the `Z` rank to be varying the slowest, before the data is passed to the com
42102

43103
The HDF5 library takes in these compression parameters as one or more 32-bit `unsigned int` values,
44104
which are named `cd_values[]` in most HDF5 routines.
45-
In the case of `H5Z-SPERR`, there is exactly one `unsigned int` used to carry this information.
105+
In the case of `H5Z-SPERR`, there is exactly one `unsigned int` used to carry compression-related information, and
106+
possibly one more `unsigned int` to indicate the potential existence of missing values.
46107

47108
### Find `cd_values[]` Using the Programming Interface
48109
Using the HDF5 programming interface, `cd_values[]` carrying the compression parameters are passed
@@ -75,11 +136,17 @@ Please use this value as a single 32-bit unsigned integer in your applications.
75136
Note: an integer produced by `generate_cd_values` can be decoded by another command line tool, `decode_cd_values`,
76137
to show the coded compression parameters.
77138

78-
## Use in NetCDF-4 APIs
79-
`H5Z-SPERR` also facilitates the application of SPERR compression on
80-
[NetCDF-4 files](https://docs.unidata.ucar.edu/netcdf/NUG/md_filters.html#filters_enable);
81-
one simply needs to define the filter on a variable:
82-
```C
83-
nc_def_var_filter(ncid, varid, 32028, 1, &cd_values);
139+
### Examples
140+
Assume using the `nccopy` tool:
141+
```Bash
142+
# Compress variable VAR0, using fixed-rate compression, bitrate = 3.3, no special handling of missing values.
143+
nccopy -F "VAR0, 268651725u" <input_file> <output_file>
144+
nccopy -F "VAR0, 268651725u, 0" <input_file> <output_file>
145+
146+
# Compress variable VAR1, using fixed-rate compression, bitrate = 3.3. VAR1 might have NaNs!
147+
nccopy -F "VAR1, 268651725u, 1" <input_file> <output_file>
148+
149+
# Compress variable VAR2, using fixed-rate compression, bitrate = 3.3. VAR2 might have values such as 1e35!
150+
nccopy -F "VAR2, 268651725u, 2" <input_file> <output_file>
84151
```
85-
See a complete example [here](https://github.com/NCAR/H5Z-SPERR/blob/main/example/simple_xy_nc4_wr.c).
152+

Diff for: include/compactor.h

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/* This is a set of functions that compact a bitmask.
2+
* In the intended use case, a bitmask is produced by masking all
3+
* "missing values" or "fill values" in a model output with zero's,
4+
* whereas the locations with valid data points are marked with one's.
5+
* However, this compactor is likely to be effective with any bit patterns
6+
* that have lots of consecutive 0's or 1's.
7+
*
8+
* The bitmask compactor works in the following way:
9+
* 1. Assume that we use 32-bit ints; the compactor encodes 32 bits at a time.
10+
* 2. Every incoming int is encoded in one of three ways:
11+
* 2.1. For an int with all 0's, use a single 0 bit.
12+
* 2.2. For an int with all 1's, use two bits: 10.
13+
* 2.3. For all other ints, use 34 bits: two bits 11 then followed by the
14+
* verbatim representation of the 32-bit int.
15+
* 3. Obviously, it's the most economical to use a single 0 bit to represent
16+
* the most frequent pattern (all 0's or all 1's). The encoder thus does
17+
* a test at the beginning and records the test result.
18+
*/
19+
20+
#ifndef COMPACTOR_H
21+
#define COMPACTOR_H
22+
23+
#include <stdlib.h>
24+
#include <stdint.h>
25+
#include <assert.h>
26+
27+
#ifdef __cplusplus
28+
extern "C" {
29+
#endif
30+
31+
/* Return the compaction strategy to use:
32+
* 0: compact with all 0's being the most frequent
33+
* 1: compact with all 1's being the most frequent
34+
* Note: `bytes` has to be a multiple of 8.
35+
*/
36+
int compactor_strategy(const void* buf, size_t bytes);
37+
38+
/* Return the size in bytes of the resulting compacted bitstream, given an input buf.
39+
* Note: `buf_bytes` has to be a multiple of 8.
40+
*/
41+
size_t compactor_comp_size(const void* buf, size_t buf_bytes);
42+
43+
/* Return the number of useful bytes in a compacted bitstream.
44+
* This value is the same as the output of `compactor_comp_size()` during encoding.
45+
*/
46+
size_t compactor_useful_bytes(const void* comp_buf);
47+
48+
/* Return the useful size of the output bitstream,
49+
* which is the same as the output of `compactor_comp_size()`.
50+
* Note 1: the input bitmask length (in bytes) has to be a multiple of 8.
51+
* This requirement is inherited from the bitstream implementation.
52+
* Note 2: the output buffer length should be 1) a multiple of 8, and
53+
* 2) no less than the size returned by `compactor_comp_size()`.
54+
*/
55+
size_t compactor_encode(const void* bitmask,
56+
size_t bitmask_bytes,
57+
void* compact_bitstream,
58+
size_t compact_bitstream_bytes);
59+
60+
/* Return the number of useful bytes in the decoded bitmask.
61+
* Note: The number of useful bytes might be bigger than the number of bytes being
62+
* encoded, because of the word size that the compactor operates on.
63+
* Note: `compact_bitstream_bytes` should be a multiple of 8 that is no less than
64+
* the size returned by `compactor_encode()`.
65+
*/
66+
size_t compactor_decode(const void* compact_bitstream,
67+
size_t compact_bitstream_bytes,
68+
void* decoded_bitmask);
69+
70+
#ifdef __cplusplus
71+
}
72+
#endif
73+
74+
#endif

Diff for: include/h5zsperr_helper.h

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* This file contains a few helper functions for the H5Z-SPERR filter.
3+
*/
4+
5+
#ifndef H5ZSPERR_HELPER_H
6+
#define H5ZSPERR_HELPER_H
7+
8+
#include <stdlib.h>
9+
10+
#define LARGE_MAGNITUDE_F 1e35f
11+
#define LARGE_MAGNITUDE_D 1e35
12+
#define H5ZSPERR_COMPATIBILITY 1
13+
14+
#ifdef __cplusplus
15+
namespace C_API {
16+
extern "C" {
17+
#endif
18+
19+
/*
20+
* Pack and unpack additional information about the input data into an integer.
21+
* It returns the encoded unsigned int, which shouldn't be zero.
22+
* The packing function is called by `set_local()` to prepare information
23+
* for `H5Z_filter_sperr()`, which calls the unpack function to extract such info.
24+
*/
25+
unsigned int h5zsperr_pack_extra_info(int rank, int is_float, int missing_val_mode, int magic_num);
26+
void h5zsperr_unpack_extra_info(unsigned int meta,
27+
int* rank,
28+
int* is_float,
29+
int* missing_val_mode,
30+
int* magic_num);
31+
32+
/*
33+
* Check if an input array really has missing values.
34+
*/
35+
int h5zsperr_has_nan(const void* buf, size_t nelem, int is_float);
36+
int h5zsperr_has_large_mag(const void* buf, size_t nelem, int is_float);
37+
38+
/*
39+
* Produce a compact bitmask.
40+
* `mask_buf` is already allocated with length `mask_bytes`.
41+
* Returns 0 upon success.
42+
*/
43+
int h5zsperr_make_mask_nan(const void* data_buf, size_t nelem, int is_float,
44+
void* mask_buf, size_t mask_bytes, size_t* useful_bytes);
45+
int h5zsperr_make_mask_large_mag(const void* data_buf, size_t nelem, int is_float,
46+
void* mask_buf, size_t mask_bytes, size_t* useful_bytes);
47+
48+
/*
49+
* Replace every missing value in the `data_buf` with the mean of the field.
50+
*/
51+
float h5zsperr_treat_nan_f32(float* data_buf, size_t nelem);
52+
double h5zsperr_treat_nan_f64(double* data_buf, size_t nelem);
53+
float h5zsperr_treat_large_mag_f32(float* data_buf, size_t nelem);
54+
double h5zsperr_treat_large_mag_f64(double* data_buf, size_t nelem);
55+
56+
#ifdef __cplusplus
57+
} /* end of extern "C" */
58+
} /* end of namespace C_API */
59+
#endif
60+
61+
#endif

Diff for: include/icecream.h

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* This is a mimic of the Bitstream class in SPERR:
3+
* https://github.com/NCAR/SPERR/blob/main/include/Bitstream.h
4+
*
5+
* The most significant difference is that bitstream here doesn't manage
6+
* any memory; it reads a bit sequence from a user-provided memory buffer,
7+
* or writes a bit sequence to a user-provided memory buffer.
8+
*
9+
* The "object" is named `icecream` and all functions operating on it
10+
* are named with a prefix `icecream`.
11+
*/
12+
13+
#ifndef ICECREAM_H
14+
#define ICECREAM_H
15+
16+
#include <stdlib.h>
17+
#include <stdint.h>
18+
#include <assert.h>
19+
20+
#ifndef NDEBUG
21+
#include <stdio.h>
22+
#endif
23+
24+
#ifdef __cplusplus
25+
extern "C" {
26+
#endif
27+
28+
typedef struct {
29+
uint64_t* begin; /* begin of the stream */
30+
uint64_t* ptr; /* pointer to the next word to be read/written */
31+
uint64_t buffer; /* incoming/outgoing bits */
32+
int bits; /* number of buffered bits (0 <= bits < 64) */
33+
} icecream;
34+
35+
/*
36+
* Specify a bitstream to use memory provided by users.
37+
* NOTE: the memory length (in bytes) has to be a multiple of 8,
38+
* because the icecream class writes/reads in 64-bit integers.
39+
*/
40+
void icecream_use_mem(icecream* s, void* mem, size_t bytes);
41+
42+
/* Position the bitstream for reading or writing at the beginning. */
43+
void icecream_rewind(icecream* s);
44+
45+
/* Read a bit. Please don't read beyond the end of the stream. */
46+
int icecream_rbit(icecream* s);
47+
48+
/* Write a bit (0 or 1). Please don't write beyond the end of the stream. */
49+
void icecream_wbit(icecream* s, int bit);
50+
51+
/* Return the bit offset to the next bit to be read. */
52+
size_t icecream_rtell(icecream* s);
53+
54+
/* Return the bit offset to the next bit to be written. */
55+
size_t icecream_wtell(icecream* s);
56+
57+
/* Write any remaining buffered bits and align stream on next word boundary. */
58+
void icecream_flush(icecream* s);
59+
60+
#ifdef __cplusplus
61+
}
62+
#endif
63+
64+
#endif

Diff for: src/CMakeLists.txt

+14-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
1-
add_library( h5z-sperr h5z-sperr.c )
1+
#
2+
# The main H5Z-SPERR filter
3+
#
4+
add_library( h5z-sperr h5z-sperr.c
5+
h5zsperr_helper.cpp
6+
icecream.c
7+
compactor.c)
28
target_include_directories( h5z-sperr PUBLIC ${HDF5_INCLUDE_DIR}
39
PUBLIC ${SPERR_INCLUDE_DIRS}
410
PUBLIC ${CMAKE_SOURCE_DIR}/include )
511
target_link_libraries( h5z-sperr PUBLIC ${HDF5_LIBRARIES} PUBLIC PkgConfig::SPERR )
12+
13+
#
14+
# Experimental clamping filter
15+
#
16+
add_library( h5z-clamp h5z-clamp.c )
17+
target_include_directories( h5z-clamp PUBLIC ${HDF5_INCLUDE_DIR} )
18+
target_link_libraries( h5z-clamp PUBLIC ${HDF5_LIBRARIES} )

0 commit comments

Comments
 (0)