perf: replace offset helper functions with OffsetsIterator type

Beforerr · Beforerr · commit 34af43b44eab · 2026-02-03T12:08:53.000-08:00
- Introduced OffsetsIterator struct to encapsulate offset traversal logic
- Replaced get_offsets, get_offsets!, and get_offsets_lazy with an iterator-based approach
- Changed cdf_magic_bytes from Vector to Tuple for immutability
- Optimized attribute name comparison in attrib(cdf, name) by comparing codeunits(name) with adr.Name directly instead of converting each name to a String
diff --git a/src/loading/attribute.jl b/src/loading/attribute.jl
@@ -8,7 +8,7 @@ Load all attribute entries for a given attribute from its AEDRs.
 """
 @inline function load_attribute_entries(buffer::Vector{UInt8}, adr, RecordSizeType, cdf_encoding)
     head = max(adr.AgrEDRhead, adr.AzEDRhead)
-    offsets = get_offsets(buffer, head, RecordSizeType)
+    offsets = OffsetsIterator{RecordSizeType}(buffer, head)
     needs_byte_swap = is_big_endian_encoding(cdf_encoding)
     return map(offsets) do offset
         load_aedr_data(buffer, offset, RecordSizeType, needs_byte_swap)
@@ -24,7 +24,7 @@ function attrib(cdf::CDFDataset; predicate = is_global)
     RecordSizeType = recordsize_type(cdf)
     buffer = cdf.buffer
     cdf_encoding = cdf.cdr.encoding
-    offsets = get_offsets(buffer, cdf.gdr.ADRhead, RecordSizeType)
+    offsets = collect(OffsetsIterator(cdf))
     adrs = map(of -> ADR(buffer, of, RecordSizeType), offsets)
     adrs = filter!(predicate, adrs)
     names = map(adr -> String(adr.Name), adrs)
@@ -43,10 +43,11 @@ function attrib(cdf::CDFDataset, name::String)
     RecordSizeType = recordsize_type(cdf)
     buffer = cdf.buffer
     cdf_encoding = cdf.cdr.encoding
-    offsets = get_offsets_lazy(buffer, cdf.gdr.ADRhead, RecordSizeType)
+    offsets = OffsetsIterator(cdf)
+    cu = codeunits(name)
     for offset in offsets
         adr = ADR(buffer, offset, RecordSizeType)
-        name == String(adr.Name) && return load_attribute_entries(buffer, adr, RecordSizeType, cdf_encoding)
+        cu == adr.Name && return load_attribute_entries(buffer, adr, RecordSizeType, cdf_encoding)
     end
     error("Attribute '$name' not found in CDF file")
 end
@@ -61,7 +62,7 @@ function vattrib(cdf::CDFDataset, varnum::Integer)
     buffer = cdf.buffer
     cdf_encoding = cdf.cdr.encoding
     attributes = Dict{String, Union{String, Vector}}()
-    offsets = get_offsets_lazy(buffer, cdf.gdr.ADRhead, RecordSizeType)
+    offsets = OffsetsIterator(cdf)
     needs_byte_swap = is_big_endian_encoding(cdf_encoding)
     for offset in offsets
         is_global(buffer, offset, RecordSizeType) && continue
@@ -99,7 +100,7 @@ function vattrib(cdf, varnum, name)
     cdf_encoding = cdf.cdr.encoding
 
     # Search for the specific attribute by name first
-    offsets = get_offsets_lazy(buffer, cdf.gdr.ADRhead, RecordSizeType)
+    offsets = OffsetsIterator(cdf)
     name_bytes = codeunits(name)
     needs_byte_swap = is_big_endian_encoding(cdf_encoding)
     for offset in offsets
@@ -141,7 +142,7 @@ function attribnames(cdf::CDFDataset; predicate = is_global)
     names = String[]
     buffer = cdf.buffer
     RecordSizeType = recordsize_type(cdf)
-    offsets = get_offsets_lazy(buffer, cdf.gdr.ADRhead, RecordSizeType)
+    offsets = OffsetsIterator(cdf)
     for offset in offsets
         adr = ADR(buffer, offset, RecordSizeType)
         predicate(adr) && push!(names, String(adr.Name))
diff --git a/src/parsing.jl b/src/parsing.jl
@@ -122,24 +122,6 @@ function readname(buf::Vector{UInt8}, offset::Int)
     return @views buf[offset:(offset + 255)]
 end
 
-@resumable function get_offsets_lazy(buffer::Vector{UInt8}, pos, ::Type{RecordSizeType}) where {RecordSizeType}
-    pos = Int(pos)
-    while pos != 0
-        @yield pos
-        pos = Int(read_be(buffer, pos + 1 + sizeof(RecordSizeType) + 4, RecordSizeType))
-    end
-end
-
-function get_offsets!(offsets, buffer::Vector{UInt8}, pos, FieldSizeType)
-    pos = Int(pos)
-    while pos != 0
-        push!(offsets, pos)
-        pos = Int(read_be(buffer, pos + 1 + sizeof(FieldSizeType) + 4, FieldSizeType))
-    end
-    return offsets
-end
-get_offsets(args...) = get_offsets!(Int[], args...)
-
 """
     is_cdf_v3(magic_bytes)
 
@@ -157,7 +139,7 @@ function is_big_endian_encoding(encoding)
     return encoding in (1, 2, 5, 7, 9, 12, 19)
 end
 
-const cdf_magic_bytes = [0xCDF30001, 0xCDF26002, 0x0000FFFF] # CDF format uses different magic numbers: CDF3.0, CDF2.x versions
+const cdf_magic_bytes = (0xCDF30001, 0xCDF26002, 0x0000FFFF) # CDF format uses different magic numbers: CDF3.0, CDF2.x versions
 
 function validate_cdf_magic(magic_bytes)
     return magic_bytes in cdf_magic_bytes
diff --git a/src/types.jl b/src/types.jl
@@ -5,3 +5,40 @@ struct RInt32 <: ReservedField end
 
 _sizeof(x) = sizeof(x)
 _sizeof(::Type{RInt32}) = sizeof(Int32)
+
+"""
+    OffsetsIterator{RecordSizeType}(buffer, start_pos)
+    OffsetsIterator(cdf::CDFDataset)
+
+Lazily walk a chain of record offsets stored in `buffer`, beginning at `start_pos`.
+
+Each visited offset is yielded as an `Int`. The following offset is read with
+`read_be` as a `RecordSizeType` value at `pos + 1 + sizeof(RecordSizeType) + 4`,
+and the walk stops once an offset of `0` is reached. The total number of
+offsets is not known up front, so `Base.IteratorSize` is `SizeUnknown()`.
+
+The one-argument form starts at `cdf.gdr.ADRhead` and uses `recordsize_type(cdf)`.
+"""
+struct OffsetsIterator{RecordSizeType}
+    buffer::Vector{UInt8}
+    start_pos::Int
+end
+
+# NOTE(review): this method needs `CDFDataset` to be defined when this file is
+# loaded — confirm the include order.
+OffsetsIterator(cdf::CDFDataset) =
+    OffsetsIterator{recordsize_type(cdf)}(cdf.buffer, cdf.gdr.ADRhead)
+
+Base.eltype(::Type{<:OffsetsIterator}) = Int
+Base.IteratorSize(::Type{<:OffsetsIterator}) = Base.SizeUnknown()
+
+# Entry point: begin the walk at the stored head offset.
+Base.iterate(iter::OffsetsIterator) = Base.iterate(iter, iter.start_pos)
+
+# `pos` is both the element to yield and the state; `0` terminates the chain.
+function Base.iterate(iter::OffsetsIterator{T}, pos::Int) where {T}
+    iszero(pos) && return nothing
+    # Presumably skips the leading record-size and record-type fields — TODO confirm.
+    link_at = pos + 1 + sizeof(T) + 4
+    return pos, Int(read_be(iter.buffer, link_at, T))
+end