
Commit 910cbeb

fix: allocator EOA, SNOD sorting, B-tree key comparison (Issue #28, #29) (#31)
* fix: allocator EOA, SNOD sorting, B-tree key comparison (Issue #28, #29)
  - Bug A: update allocator after OHDR grows from attribute addition, so superblock EOA covers entire file (h5dump/h5py rejected files)
  - Bug B: B-tree v1 right key now uses lexicographically largest name (strcmp comparison per H5Gnode.c), not numerically largest heap offset
  - Bug C: SNOD entries sorted by name after insertion (per H5G__node_insert)
  - ObjectHeaderSizeFromParsed supports both v1 and v2 headers
  - 4 regression tests covering all three bugs

* chore: fix pre-existing gosec G115 and unused nolint directives
  Suppress safe integer conversions (signed-to-unsigned for binary serialization) and remove stale nolint directives that no longer match current golangci-lint rules.

* style: fix gofmt alignment in messages_write.go
1 parent 2f92dd0 commit 910cbeb

File tree

10 files changed: +337 additions, -23 deletions


CHANGELOG.md

Lines changed: 46 additions & 0 deletions
@@ -7,6 +7,52 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ---
 
+## [v0.13.11] - 2026-03-14
+
+### Bug Fix
+
+#### Write interoperability: attributes, SNOD sorting, B-tree keys (Issue #28, #29)
+
+Fixed 3 bugs that caused files with groups + attributes or multiple root-level datasets
+to be unreadable by h5dump, h5ls, and h5py.
+
+**Bug A: Superblock EOA too small after adding attributes**
+
+Adding an attribute to a dataset inside a group grew the V2 object header beyond the
+allocator's tracked end-of-file. The superblock EOA was not updated, causing h5dump/h5py
+to reject the file with "truncated file" or "actual len exceeds EOA" errors.
+
+Fix: after writing the modified object header, compare its new end address with the
+allocator's EOF and advance the allocator if needed. `ObjectHeaderSizeFromParsed()` now
+supports both v1 and v2 headers for future-proofing.
+
+**Bug B: B-tree v1 right key used numeric offset instead of lexicographic comparison**
+
+Per C reference (`H5Gnode.c:340-373`, `H5G__node_cmp2`), B-tree v1 group node keys are
+compared by looking up strings in the local heap and using `strcmp`. Our code set the right
+key to the numerically largest heap offset, which is not necessarily the offset of the
+lexicographically largest name. This caused h5dump/h5ls to miss entries whose names sorted
+after the right key's name.
+
+Fix: iterate all SNOD entries, resolve names from the local heap, and select the offset of
+the string that sorts last by Go `>` comparison (equivalent to `strcmp`).
+
+**Bug C: Symbol table node entries not sorted by name**
+
+Per C reference (`H5Gnode.c:573-591`, `H5G__node_insert`), the C library uses binary search
+with `strncmp` to find the insertion point, keeping entries sorted at all times. Our code
+appended entries in insertion order via `AddEntry`. When datasets were created in
+non-alphabetical order (e.g., `/uint` before `/float`), h5dump/h5ls could not find them.
+
+Fix: `sort.Slice` SNOD entries by name after each insertion in `linkToParent()`.
+
+**Validation**: 4 regression tests added. All 8 h5dump test scenarios pass. Full test suite
+green (all packages).
+
+Reported by [@vrv-bit](https://github.com/vrv-bit) in Issue #28 and Issue #29.
+
+---
+
 ## [v0.13.10] - 2026-03-06
 
 ### 🐛 Bug Fix

ROADMAP.md

Lines changed: 3 additions & 1 deletion
@@ -3,7 +3,7 @@
 > **Strategic Advantage**: We have official HDF5 C library as reference implementation!
 > **Approach**: Port proven algorithms, not invent from scratch - Senior Go Developer mindset
-**Last Updated**: 2026-03-06 | **Current Version**: v0.13.10 | **Strategy**: HDF5 2.0.0 compatible → security hardened → v1.0.0 LTS | **Milestone**: v0.13.10 RELEASED! (2026-03-06) → v1.0.0 LTS (Q3 2026)
+**Last Updated**: 2026-03-14 | **Current Version**: v0.13.11 | **Strategy**: HDF5 2.0.0 compatible → security hardened → v1.0.0 LTS | **Milestone**: v0.13.11 RELEASED! (2026-03-14) → v1.0.0 LTS (Q3 2026)
 
 ---
 
@@ -58,6 +58,8 @@ v0.13.8 (HOTFIX - EOA compatibility) ✅ RELEASED 2026-03-04
 v0.13.9 (HOTFIX - V2 object header checksum) ✅ RELEASED 2026-03-04
 ↓ (interoperability fix)
 v0.13.10 (BUGFIX - h5dump/h5ls/h5py interop) ✅ RELEASED 2026-03-06
+↓ (attribute + SNOD + B-tree key fixes)
+v0.13.11 (HOTFIX - write interop: attrs, sorting, keys) ✅ RELEASED 2026-03-14
 ↓ (maintenance continues)
 v0.13.x (maintenance phase) → Stable maintenance, bug fixes, minor enhancements
 ↓ (6-9 months production validation)

attribute_write.go

Lines changed: 15 additions & 0 deletions
@@ -294,6 +294,21 @@ func writeCompactAttribute(fw *FileWriter, objectAddr uint64, oh *core.ObjectHea
 		return fmt.Errorf("failed to write object header: %w", err)
 	}
 
+	// 6. Update allocator if the object header grew beyond currently tracked EOF.
+	// Adding an attribute message increases the OHDR size. If the OHDR is at the
+	// end of the file, the extra bytes extend past what the allocator knows about.
+	// Without this, the superblock EOA will be too small and h5dump/h5py will
+	// reject the file ("actual len exceeds EOA").
+	newHeaderSize := core.ObjectHeaderSizeFromParsed(oh)
+	objectHeaderEnd := objectAddr + newHeaderSize
+	allocator := fw.writer.Allocator()
+	if allocator.EndOfFile() < objectHeaderEnd {
+		bytesToAdvance := objectHeaderEnd - allocator.EndOfFile()
+		if _, allocErr := allocator.Allocate(bytesToAdvance); allocErr != nil {
+			return fmt.Errorf("failed to advance allocator past grown object header: %w", allocErr)
+		}
+	}
+
 	return nil
 }

dataset_read_hyperslab.go

Lines changed: 1 addition & 1 deletion
@@ -760,7 +760,7 @@ func generateChunkCoordinates(first, last []uint64) [][]uint64 {
 	// Calculate total number of chunks
 	totalChunks := 1
 	for i := 0; i < ndims; i++ {
-		totalChunks *= int(last[i] - first[i] + 1)
+		totalChunks *= int(last[i] - first[i] + 1) //nolint:gosec // G115: chunk count bounded by dataset dimensions
 	}
 
 	result := make([][]uint64, 0, totalChunks)

dataset_write.go

Lines changed: 11 additions & 11 deletions
@@ -238,7 +238,7 @@ func (h *arrayTypeHandler) GetInfo(config *datasetConfig) (*datatypeInfo, error)
 	// Calculate total array size (product of all dimensions * element size)
 	arraySize := uint32(1)
 	for _, dim := range config.arrayDims {
-		arraySize *= uint32(dim)
+		arraySize *= uint32(dim) //nolint:gosec // G115: array dims bounded by HDF5 format limits
 	}
 	arraySize *= baseInfo.size
 
@@ -320,13 +320,13 @@ func (h *enumTypeHandler) EncodeDatatypeMessage(info *datatypeInfo) ([]byte, err
 		offset := i * int(info.baseType.size)
 		switch info.baseType.size {
 		case 1:
-			valueBytes[offset] = byte(val)
+			valueBytes[offset] = byte(val) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		case 2:
-			binary.LittleEndian.PutUint16(valueBytes[offset:], uint16(val))
+			binary.LittleEndian.PutUint16(valueBytes[offset:], uint16(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		case 4:
-			binary.LittleEndian.PutUint32(valueBytes[offset:], uint32(val))
+			binary.LittleEndian.PutUint32(valueBytes[offset:], uint32(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		case 8:
-			binary.LittleEndian.PutUint64(valueBytes[offset:], uint64(val))
+			binary.LittleEndian.PutUint64(valueBytes[offset:], uint64(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 	}
 
@@ -1349,7 +1349,7 @@ func (dw *DatasetWriter) writeVLen(data interface{}) error {
 		// Convert sequence to bytes (little-endian)
 		seqBytes := make([]byte, len(seq)*4)
 		for j, val := range seq {
-			binary.LittleEndian.PutUint32(seqBytes[j*4:], uint32(val))
+			binary.LittleEndian.PutUint32(seqBytes[j*4:], uint32(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 
 		// Write to global heap
@@ -1369,7 +1369,7 @@ func (dw *DatasetWriter) writeVLen(data interface{}) error {
 	for i, seq := range v {
 		seqBytes := make([]byte, len(seq)*8)
 		for j, val := range seq {
-			binary.LittleEndian.PutUint64(seqBytes[j*8:], uint64(val))
+			binary.LittleEndian.PutUint64(seqBytes[j*8:], uint64(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 
 		heapID, err := dw.fileWriter.globalHeapWriter.WriteToGlobalHeap(seqBytes)
@@ -1650,7 +1650,7 @@ func encode1ByteIntegers(data interface{}, buf []byte) ([]byte, error) {
 	switch v := data.(type) {
 	case []int8:
 		for i, val := range v {
-			buf[i] = byte(val)
+			buf[i] = byte(val) //nolint:gosec // G115: intentional int8-to-byte for serialization
 		}
 	case []uint8:
 		copy(buf, v)
@@ -1665,7 +1665,7 @@ func encode2ByteIntegers(data interface{}, buf []byte) ([]byte, error) {
 	switch v := data.(type) {
 	case []int16:
 		for i, val := range v {
-			binary.LittleEndian.PutUint16(buf[i*2:], uint16(val))
+			binary.LittleEndian.PutUint16(buf[i*2:], uint16(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 	case []uint16:
 		for i, val := range v {
@@ -1682,7 +1682,7 @@ func encode4ByteIntegers(data interface{}, buf []byte) ([]byte, error) {
 	switch v := data.(type) {
 	case []int32:
 		for i, val := range v {
-			binary.LittleEndian.PutUint32(buf[i*4:], uint32(val))
+			binary.LittleEndian.PutUint32(buf[i*4:], uint32(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 	case []uint32:
 		for i, val := range v {
@@ -1699,7 +1699,7 @@ func encode8ByteIntegers(data interface{}, buf []byte) ([]byte, error) {
 	switch v := data.(type) {
 	case []int64:
 		for i, val := range v {
-			binary.LittleEndian.PutUint64(buf[i*8:], uint64(val))
+			binary.LittleEndian.PutUint64(buf[i*8:], uint64(val)) //nolint:gosec // G115: intentional signed-to-unsigned for serialization
 		}
 	case []uint64:
 		for i, val := range v {

group_write.go

Lines changed: 48 additions & 5 deletions
@@ -2,6 +2,7 @@ package hdf5
 
 import (
 	"fmt"
+	"sort"
 	"strings"
 
 	"github.com/scigolib/hdf5/internal/core"
@@ -294,6 +295,8 @@ func parsePath(path string) (parent, name string) {
 //
 // Returns:
 //   - error: If linking fails
+//
+//nolint:gocognit,gocyclo,cyclop // Complex but necessary: sorted insertion + string-based B-tree key update
 func (fw *FileWriter) linkToParent(parentPath, childName string, childAddr uint64) error {
 	// Get parent group metadata
 	var heapAddr, stNodeAddr, btreeAddr uint64
@@ -342,6 +345,26 @@ func (fw *FileWriter) linkToParent(parentPath, childName string, childAddr uint6
 		return fmt.Errorf("add entry to symbol table: %w", err)
 	}
 
+	// Step 4b: Sort SNOD entries by name (HDF5 format requirement).
+	// The C library expects symbol table entries sorted by strcmp on the name
+	// looked up from the local heap. Without sorting, h5dump/h5ls fail when
+	// entries are added in non-alphabetical order.
+	sort.Slice(stNode.Entries, func(i, j int) bool {
+		ni, nj := stNode.Entries[i].LinkNameOffset, stNode.Entries[j].LinkNameOffset
+		var si, sj string
+		if ni == nameOffset {
+			si = childName
+		} else {
+			si, _ = heap.GetString(ni)
+		}
+		if nj == nameOffset {
+			sj = childName
+		} else {
+			sj, _ = heap.GetString(nj)
+		}
+		return si < sj
+	})
+
 	// Step 5: Write updated heap
 	if err := heap.WriteTo(fw.writer, heapAddr); err != nil {
 		return fmt.Errorf("write heap: %w", err)
@@ -353,13 +376,33 @@ func (fw *FileWriter) linkToParent(parentPath, childName string, childAddr uint6
 		return fmt.Errorf("write symbol table: %w", err)
 	}
 
-	// Step 7: Update B-tree right key (key[1]) to reflect max name offset.
-	// Per HDF5 spec, B-tree v1 with N children has N+1 keys.
-	// Key[0] = 0 (left boundary), Key[N] = max name offset (right boundary).
-	// Without this, h5ls/h5dump cannot find children in the group.
+	// Step 7: Update B-tree right key (key[1]) to reflect the lexicographically
+	// largest name's local heap offset. Per HDF5 spec, B-tree v1 group nodes
+	// compare keys by looking up strings in the local heap and using strcmp.
+	// The right key must be the offset of the string that sorts LAST, not the
+	// numerically largest offset. Without this, h5dump/h5ls cannot find entries
+	// whose names sort after the right key's name.
+	//
+	// Note: heap.GetString() reads from heap.Data (on-disk snapshot). The entry
+	// we just added (at nameOffset) is only in heap.strings (not yet flushed to
+	// Data), so we use childName directly for that entry.
 	var maxNameOffset uint64
+	var maxName string
 	for _, e := range stNode.Entries {
-		if e.LinkNameOffset > maxNameOffset {
+		var entryName string
+		if e.LinkNameOffset == nameOffset {
+			// This is the entry we just added — use childName directly
+			// because heap.Data hasn't been updated yet.
+			entryName = childName
+		} else {
+			var nameErr error
+			entryName, nameErr = heap.GetString(e.LinkNameOffset)
+			if nameErr != nil {
+				continue
+			}
+		}
+		if entryName > maxName {
+			maxName = entryName
 			maxNameOffset = e.LinkNameOffset
 		}
 	}

internal/core/messages_write.go

Lines changed: 4 additions & 4 deletions
@@ -141,7 +141,7 @@ func encodeChunkedLayout(chunkDims []uint64, btreeAddress uint64, sb *Superblock
 
 	// Chunk dimensions (each 4 bytes, uint32)
 	for _, dim := range chunkDims {
-		binary.LittleEndian.PutUint32(buf[offset:], uint32(dim))
+		binary.LittleEndian.PutUint32(buf[offset:], uint32(dim)) //nolint:gosec // G115: chunk dims bounded by HDF5 format limits
 		offset += 4
 	}
 
@@ -252,8 +252,8 @@ func encodeDatatypeNumeric(dt *DatatypeMessage) ([]byte, error) {
 	}
 
 	properties = make([]byte, 12)
-	binary.LittleEndian.PutUint16(properties[0:2], 0)                 // bit_offset = 0
-	binary.LittleEndian.PutUint16(properties[2:4], uint16(dt.Size*8)) //nolint:gosec // G115: precision bits
+	binary.LittleEndian.PutUint16(properties[0:2], 0) // bit_offset = 0
+	binary.LittleEndian.PutUint16(properties[2:4], uint16(dt.Size*8))
 	properties[4] = epos
 	properties[5] = esize
 	properties[6] = mpos
@@ -265,7 +265,7 @@ func encodeDatatypeNumeric(dt *DatatypeMessage) ([]byte, error) {
 	// uint16: bit_precision (total bits)
 	properties = make([]byte, 4)
 	binary.LittleEndian.PutUint16(properties[0:2], 0) // bit_offset = 0
-	binary.LittleEndian.PutUint16(properties[2:4], uint16(dt.Size*8)) //nolint:gosec // G115: precision bits
+	binary.LittleEndian.PutUint16(properties[2:4], uint16(dt.Size*8)) //nolint:gosec // G115: dt.Size is 1/2/4/8, max value 64 fits uint16
 	}
 
 	// Build message: header (8 bytes) + properties

internal/core/objectheader_write.go

Lines changed: 25 additions & 0 deletions
@@ -514,6 +514,31 @@ func WriteObjectHeader(w io.WriterAt, addr uint64, oh *ObjectHeader, sb *Superbl
 	return nil
 }
 
+// ObjectHeaderSizeFromParsed calculates the on-disk size of an ObjectHeader
+// (as returned by ReadObjectHeader). This is used to determine how much space
+// the header occupies after modification (e.g., adding attributes).
+// Supports both v1 and v2 object headers.
+func ObjectHeaderSizeFromParsed(oh *ObjectHeader) uint64 {
+	if oh == nil {
+		return 0
+	}
+	if oh.Version != 1 && oh.Version != 2 {
+		return 0
+	}
+	ohw := &ObjectHeaderWriter{
+		Version:  oh.Version,
+		Flags:    oh.Flags,
+		Messages: make([]MessageWriter, len(oh.Messages)),
+	}
+	for i, msg := range oh.Messages {
+		ohw.Messages[i] = MessageWriter{
+			Type: msg.Type,
+			Data: msg.Data,
+		}
+	}
+	return ohw.Size()
+}
+
 // RewriteObjectHeaderV2 rewrites an object header v2 with updated messages.
 // This handles the case where we need to modify an existing object header
 // by reading it, modifying it, and writing it back.

internal/testing/mock_reader.go

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 // Package testing provides test utilities for HDF5 library testing.
-package testing //nolint:revive // internal test utilities, not a public package
+package testing
 
 import "errors"
