Skip to content

Commit 5ef112e

Browse files
committed
zap: TinyZAP for multi-uint64 entries.
MicroZAP is limited to 1×uint64 values and 49-char keys; any wider entry forces a full FatZAP upgrade. TinyZAP avoids this for the common case of multi-integer values (e.g. Lustre FIDs) and long keys. Introduce TinyZAP, a MicroZAP variant that reuses mzap_phys_t, repurposing the padding bytes after mz_normflags as three independent uint8_t fields: mz_flags (bit 0 = MZAP_FLAG_TINY); mz_chunk_shift (log2(chunk): 6=64B, 7=128B, 8=256B); mz_value_ints (stride / 8, i.e. the number of uint64 values per entry). Geometry is stamped automatically on the first zap_add() based on the observed entry shape; no create-time hint is required. Subsequent adds must match the stamped geometry or a FatZAP upgrade is triggered. All ZAP operations (add, update, remove, lookup, cursor, byteswap, upgrade to FatZAP) dispatch to TinyZAP paths when zap_stride != 0. Signed-off-by: Akash B <akash-b@hpe.com>
1 parent 88656cc commit 5ef112e

30 files changed

Lines changed: 2650 additions & 78 deletions

cmd/zdb/zdb.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
* Copyright (c) 2023, 2024, Klara Inc.
3838
* Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
3939
* Copyright (c) 2026, TrueNAS.
40+
* Copyright (c) 2026, Hewlett Packard Enterprise Development LP.
4041
*/
4142

4243
#include <stdio.h>
@@ -1032,9 +1033,21 @@ dump_zap_stats(objset_t *os, uint64_t object)
10321033

10331034
if (zs.zs_ptrtbl_len == 0) {
10341035
ASSERT(zs.zs_num_blocks == 1);
1035-
(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
1036-
(u_longlong_t)zs.zs_blocksize,
1037-
(u_longlong_t)zs.zs_num_entries);
1036+
if (zs.zs_is_tinyzap) {
1037+
/* TinyZAP */
1038+
(void) printf("\ttinyzap: %llu bytes, %llu entries, "
1039+
"stride %llu chunk=%llu num_chunks=%llu\n",
1040+
(u_longlong_t)zs.zs_blocksize,
1041+
(u_longlong_t)zs.zs_num_entries,
1042+
(u_longlong_t)zs.zs_tinyzap_stride,
1043+
(u_longlong_t)zs.zs_tinyzap_chunk,
1044+
(u_longlong_t)zs.zs_tinyzap_num_chunks);
1045+
} else {
1046+
/* Plain MicroZAP */
1047+
(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
1048+
(u_longlong_t)zs.zs_blocksize,
1049+
(u_longlong_t)zs.zs_num_entries);
1050+
}
10381051
return;
10391052
}
10401053

include/sys/zap.h

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
2626
* Copyright 2017 Nexenta Systems, Inc.
2727
* Copyright (c) 2026, TrueNAS.
28+
* Copyright (c) 2026, Hewlett Packard Enterprise Development LP.
2829
*/
2930

3031
#ifndef _SYS_ZAP_H
@@ -61,11 +62,24 @@
6162
*
6263
* Implementation / Performance Notes:
6364
*
64-
* The ZAP is intended to operate most efficiently on attributes with
65-
* short (49 bytes or less) names and single 8-byte values, for which
66-
* the microzap will be used. The ZAP should be efficient enough so
65+
* The ZAP operates in three modes, selected automatically:
66+
*
67+
* MicroZAP: most efficient for attributes with short names (up to 49
68+
* characters, 50 bytes including NUL ('\0')) and a single 8-byte value.
69+
* Fixed 64-byte chunk layout. The ZAP should be efficient enough
6770
* that the user does not need to cache these attributes.
6871
*
72+
* TinyZAP: used when an entry cannot fit MicroZAP, i.e. when either
73+
* condition is true:
74+
* - num_integers > 1 (value too wide for MicroZAP), OR
75+
* - strlen(key) >= MZAP_NAME_LEN (name too long for MicroZAP)
76+
* AND at least one chunk size (64/128/256 bytes) can accommodate the
77+
* entry. The chunk size and stride are stamped automatically on the
78+
* first zap_add(). No create-time hint is required.
79+
*
80+
* FatZAP: used for all other cases, or when the ZAP grows beyond the
81+
* capacity of a single block. Supports arbitrary name/value sizes.
82+
*
6983
* The ZAP's locking scheme makes its routines thread-safe. Operations
7084
* on different zapobjs will be processed concurrently. Operations on
7185
* the same zapobj which only read data will be processed concurrently.
@@ -181,7 +195,7 @@ int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj,
181195
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
182196

183197
/*
184-
* All operations on a zapobj take either the the objset/objectid pair
198+
* All operations on a zapobj take either the objset/objectid pair
185199
* that "names" the object, or an existing dnode_t for the object. The
186200
* zapobj passed in must be a valid ZAP object.
187201
*/
@@ -262,7 +276,7 @@ int zap_contains(objset_t *os, uint64_t zapobj, const char *name);
262276

263277
/*
264278
* Prefetch the blocks within the ZAP where the given key is stored. The
265-
* prefetch IO will occure in the background.
279+
* prefetch IO will occur in the background.
266280
*/
267281
int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name);
268282

@@ -533,6 +547,20 @@ typedef struct zap_stats {
533547
uint64_t zs_num_entries; /* The number of zap entries */
534548
uint64_t zs_salt; /* salt to stir into hash function */
535549

550+
/*
551+
* TinyZAP statistics. Only meaningful when zs_is_tinyzap is B_TRUE.
552+
*
553+
* zs_is_tinyzap: B_TRUE if MZAP_FLAG_TINYZAP is set.
554+
* zs_tinyzap_stride: value width in bytes (8..255, mult of 8).
555+
* zs_tinyzap_chunk: chunk size in bytes (1 << mz_chunk_shift).
556+
* zs_tinyzap_flags: raw mz_flags uint8 (for zdb diagnostics).
557+
*/
558+
boolean_t zs_is_tinyzap;
559+
uint64_t zs_tinyzap_stride; /* value width: 8..255 bytes */
560+
uint64_t zs_tinyzap_chunk; /* chunk size: 64 / 128 / 256 */
561+
uint64_t zs_tinyzap_num_chunks; /* number of chunks used */
562+
uint64_t zs_tinyzap_flags; /* raw mz_flags for zdb */
563+
536564
/*
537565
* Histograms. For all histograms, the last index
538566
* (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater

0 commit comments

Comments
 (0)