feat: add CommonDataModel extension and expose CDF data type constants (#10)

Beforerr · web-flow · commit 8c6ea1735c96 · 2025-09-22T18:34:06.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,7 @@
 *.jl.*.cov
 *.jl.cov
 *.jl.mem
-/Manifest*.toml
+Manifest*.toml
 /docs/Manifest*.toml
 /docs/build/
 ref/
diff --git a/Project.toml b/Project.toml
@@ -12,6 +12,12 @@ Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
 StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
 UnixTimes = "ab1a18e7-b408-4913-896c-624bb82ed7f4"
 
+[weakdeps]
+CommonDataModel = "1fbeeb36-5f17-413c-809b-666fb144f157"
+
+[extensions]
+CommonDataFormatCommonDataModelExt = ["CommonDataModel"]
+
 [compat]
 CodecZlib = "0.7"
 Dates = "1"
diff --git a/README.md b/README.md
@@ -4,12 +4,12 @@
 [![Build Status](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/JuliaSpacePhysics/CommonDataFormat.jl/actions/workflows/CI.yml?query=branch%3Amain)
 [![Coverage](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/JuliaSpacePhysics/CommonDataFormat.jl)
 
-A Julia package for reading Common Data Format (CDF) files, widely used in space physics and other scientific domains for storing multidimensional data arrays and metadata.
+A Julia package for reading Common Data Format (CDF) files, widely used in space physics for storing multidimensional data arrays and metadata.
 
 ## Features
 
 - **Pure Julia implementation** - No external dependencies on CDF libraries
-- **Efficient data access** - Lazy loading and memory-mapped access
+- **Efficient data access** - Lazy, memory-mapped access to data and attributes, plus fast decompression via [`LibDeflate`](https://github.com/jakobnissen/LibDeflate.jl)
 
 ## Installation
 
diff --git a/ext/CommonDataFormatCommonDataModelExt.jl b/ext/CommonDataFormatCommonDataModelExt.jl
@@ -0,0 +1,41 @@
+module CommonDataFormatCommonDataModelExt
+
+# Package extension implementing CommonDataModel accessors for CDF datasets/variables.
+using CommonDataFormat
+import CommonDataFormat as CDF
+import CommonDataModel
+import CommonDataModel as CDM
+using CommonDataFormat: CDFDataset, CDFVariable
+
+const SymbolOrString = Union{Symbol, AbstractString}
+
+# Dataset level -----------------------------------------------------------------
+
+CDM.path(ds::CDFDataset) = CDF.filename(ds)
+CDM.varnames(ds::CDFDataset) = keys(ds)
+
+# `name` may be a `Symbol`; it is converted to a `String` before the lookup.
+CDM.variable(ds::CDFDataset, name::SymbolOrString) = CDF.variable(ds, String(name))
+
+CDM.attribnames(ds::CDFDataset) = CDF.attribnames(ds)
+CDM.attrib(ds::CDFDataset, args...) = CDF.attrib(ds, args...)
+
+# Variable level ----------------------------------------------------------------
+
+CDM.name(var::CDFVariable) = var.name
+CDM.dataset(var::CDFVariable) = var.parentdataset
+CDM.attribnames(var::CDFVariable) = keys(CDF.attrib(var))
+CDM.attrib(var::CDFVariable, args...) = CDF.attrib(var, args...)
+
+# Return the `DEPEND_<i-1>` attribute of `var` for dimension `i` (`DEPEND_0` for i == 1).
+# An out-of-range `i` throws `DimensionMismatch` through an explicit check, because
+# `@assert` raises `AssertionError` instead and may be disabled.
+@inline function CDM.dimnames(var::CDFVariable, i)
+    n = ndims(var)
+    1 <= i <= n || throw(DimensionMismatch("dimension $i is outside 1:$n"))
+    # The key is built from the index, so it is not limited to three dimensions.
+    key = string("DEPEND_", i - 1)
+    return CDF.attrib(var, key)
+end
+
+end
diff --git a/src/CommonDataFormat.jl b/src/CommonDataFormat.jl
@@ -12,6 +12,7 @@ using LibDeflate: GzipDecompressResult
 export CDFDataset, CDFVariable
 export Majority, CompressionType, DataType
 export Epoch, Epoch16, TT2000
+export CDF_EPOCH, CDF_EPOCH16, CDF_TIME_TT2000, CDF_CHAR, CDF_UCHAR
 
 include("epochs.jl")
 include("enums.jl")
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -33,7 +33,7 @@ function CDFDataset(filename)
             buffer, compression = decompress_bytes(buffer, RecordSizeType)
         end
         # Parse CDF header
-        cdr = CDR(buffer, 9, RecordSizeType)
+        cdr = CDR(buffer, 8, RecordSizeType)
         gdr = GDR(buffer, cdr.gdr_offset, RecordSizeType)
         return CDFDataset{compression, RecordSizeType}(filename, cdr, gdr, buffer)
     end
@@ -63,10 +63,11 @@ function find_vdr(cdf::CDFDataset, var_name::String)
     gdr = GDR(cdf)
     RecordSizeType = recordsize_type(cdf)
     buffer = cdf.buffer
+    var_name_bytes = codeunits(var_name)
     for current_offset in (gdr.rVDRhead, gdr.zVDRhead)
         while current_offset != 0
             vdr = zVDR(buffer, current_offset, RecordSizeType)
-            if String(vdr.name) == var_name
+            if vdr.name == var_name_bytes
                 return vdr
             end
             current_offset = vdr.vdr_next
diff --git a/src/epochs.jl b/src/epochs.jl
@@ -45,6 +45,10 @@ struct TT2000 <: CDFDateTime
     value::Int64
 end
 
+fillvalue(::Epoch) = -1.0e31  # raw value that marks a missing CDF_EPOCH entry
+fillvalue(::Epoch16) = -1.0e31
+fillvalue(::TT2000) = typemin(Int64)  # CDF's TT2000 fill value is the minimum Int64, not 9999
+
 # Conversion to DateTime
 function Dates.DateTime(epoch::Epoch)
     return DateTime(0) + Millisecond(round(Int64, epoch.value))
@@ -88,7 +92,16 @@ for f in (:year, :month, :day, :hour, :minute, :second, :millisecond)
     @eval Dates.$f(epoch::CDFDateTime) = Dates.$f(DateTime(epoch))
 end
 
-Base.show(io::IO, epoch::CDFDateTime) = print(io, typeof(epoch), "(", DateTime(epoch), ")")
+Dates.value(epoch::CDFDateTime) = epoch.value  # expose the raw stored value through Dates.value
+
+function Base.show(io::IO, epoch::CDFDateTime)
+    # A raw value equal to the type's fill value is shown as a marker, not a date.
+    if epoch.value != fillvalue(epoch)
+        print(io, DateTime(epoch))
+    else
+        print(io, "FILLVAL")
+    end
+end
 Base.promote_rule(::Type{<:CDFDateTime}, ::Type{Dates.DateTime}) = Dates.DateTime
 Base.convert(::Type{Dates.DateTime}, x::CDFDateTime) = Dates.DateTime(x)
 Base.bswap(x::T) where {T <: CDFDateTime} = T(Base.bswap(x.value))
diff --git a/src/loading/variable.jl b/src/loading/variable.jl
@@ -37,14 +37,17 @@ function load_variable_data(source, vxr_head, ::Type{T}, dims, ::Type{RecordSize
     return reshape(data, dims)
 end
 
-function read_variable_data!(data::Vector{T}, source, vvrs, compression, record_size, RecordSizeType; nbuffers = nthreads()) where {T}
+function read_variable_data!(data::Vector{T}, source, vvrs, compression, record_size, ::Type{FieldSizeT}; nbuffers = nthreads()) where {T, FieldSizeT}
     pos = 1
     if compression == NoCompression || first(vvrs).RecordType == VVR_ # vvr records is the ultimative source
         for entry in vvrs
             N = min(length(data) - pos + 1, length(entry) * record_size)
-            load_vvr_data!(data, pos, source, entry.offset, N, RecordSizeType)
+            load_vvr_data!(data, pos, source, entry.offset, N, FieldSizeT)
             pos += N
         end
+    elseif length(vvrs) == 1
+        load_cvvr_data!(data, 1, source, vvrs[1].offset, length(data), FieldSizeT, compression)
+        pos = length(data) + 1
     else
         n_ch = min(nbuffers, length(vvrs))
         chnl = Channel{Decompressor}(n_ch)
@@ -54,7 +57,7 @@ function read_variable_data!(data::Vector{T}, source, vvrs, compression, record_
         Base.@inbounds Threads.@threads for i in eachindex(vvrs)
             decompressor = take!(chnl)
             N = Ns[i]
-            load_cvvr_data!(data, positions[i] + 1, source, vvrs[i].offset, N, RecordSizeType, compression; decompressor)
+            load_cvvr_data!(data, positions[i] + 1, source, vvrs[i].offset, N, FieldSizeT, compression; decompressor)
             put!(chnl, decompressor)
         end
         pos = positions[end] + 1
@@ -69,17 +72,18 @@ function read_vvrs(src, vxr_head, RecordSizeType)
     return entries
 end
 
-function collect_vxr_entries!(entries::Vector{VVREntry}, src::Vector{UInt8}, offset, RecordSizeType)
+function collect_vxr_entries!(entries::Vector{VVREntry}, src::Vector{UInt8}, offset, ::Type{FieldSizeT}) where FieldSizeT
     while offset != 0
-        vxr = VXR(src, offset, RecordSizeType)
-        foreach(vxr.first, vxr.last, vxr.offset) do first_rec, last_rec, raw_offset
-            leaf_offset = Int(raw_offset)
-            record_type = Header(src, leaf_offset + 1, RecordSizeType).record_type
+
+        vxr = VXR(src, offset, FieldSizeT)
+        for (first, last, offset) in vxr
+            leaf_offset = Int(offset)
+            record_type = Header(src, leaf_offset + 1, FieldSizeT).record_type
             @assert record_type in (VVR_, CVVR_, VXR_)
             if record_type == VXR_
-                collect_vxr_entries!(entries, src, leaf_offset, RecordSizeType)
+                collect_vxr_entries!(entries, src, leaf_offset, FieldSizeT)
             else
-                push!(entries, VVREntry(record_type, first_rec, last_rec, leaf_offset))
+                push!(entries, VVREntry(record_type, Int(first), Int(last), leaf_offset))
             end
         end
         offset = Int(vxr.vxr_next)
diff --git a/src/parsing.jl b/src/parsing.jl
@@ -14,6 +14,10 @@
     end
 end
 
+@inline function read_be(p::Ptr{T}, i) where T
+    return ntoh(unsafe_load(p, i))  # element `i` (1-based) of `p`, converted from big-endian
+end
+
 @inline function read_be(v::Vector{UInt8}, i, n, T)
     S = sizeof(T)
     return ntuple(j -> read_be(v, i + (j - 1) * S, T), n)
diff --git a/src/records/cdr.jl b/src/records/cdr.jl
@@ -3,8 +3,8 @@ CDF Descriptor Record (CDR) - the main file header record
 Contains version, encoding, format information, and pointer to GDR
 """
 struct CDR <: Record
-    header::Header
-    gdr_offset::UInt64   # Can be UssInt32 for v2, UInt64 for v3
+    # header::Header
+    gdr_offset::UInt64   # Can be UInt32 for v2, UInt64 for v3
     version::Int32
     release::Int32
     encoding::Int32
@@ -13,7 +13,7 @@ struct CDR <: Record
     rfu_b::Int32       # Reserved field B
     increment::Int32
     identifier::Int32
-    rfu_e::Int32       # Reserved field E
+    # rfu_e::Int32       # Reserved field E
     # Note: copyright string follows but we'll handle it separately
 end
 
@@ -22,17 +22,15 @@ Majority(cdr::CDR) = (cdr.flags & 0x01) != 0 ? Majority(0) : Majority(1)  # Row=
 is_cdf_v3(cdr::CDR) = cdr.version == 3
 
 """
-    CDR(buffer, pos, RecordSizeType)
+    CDR(buffer, offset, FieldSizeT)
 
 Load a CDF Descriptor Record from the IO stream at the specified offset.
 This follows the CDF specification for CDR record structure.
 """
-@inline function CDR(buffer::Vector{UInt8}, pos, RecordSizeType)
-    header = Header(buffer, pos, RecordSizeType)
-    @assert header.record_type == 1 "Invalid CDR record type"
-    pos += sizeof(RecordSizeType) + 4
+@inline function CDR(buffer::Vector{UInt8}, offset, FieldSizeT)
+    pos = check_record_type(1, buffer, offset, FieldSizeT)  # 1 is the CDR record type
     # Read remaining CDR fields in order as per CDF specification
-    gdr_offset, pos = read_be_i(buffer, pos, RecordSizeType)
-    fields = read_be(buffer, pos, 9, Int32)
-    return CDR(header, gdr_offset, fields...)
+    gdr_offset, pos = read_be_i(buffer, pos, FieldSizeT)  # read with the width given by FieldSizeT
+    fields, pos = @read_be_fields(buffer, pos, fieldtypes(CDR)[2:end]...)  # fields after gdr_offset
+    return CDR(gdr_offset, fields...)
 end
diff --git a/src/records/gdr.jl b/src/records/gdr.jl
@@ -3,7 +3,7 @@ Global Descriptor Record (GDR) - contains global information about the CDF file
 Points to variable and attribute descriptor records
 """
 struct GDR
-    header::Header
+    # header::Header
     rVDRhead::Int64    # Offset to first r-variable descriptor record
     zVDRhead::Int64    # Offset to first z-variable descriptor record
     ADRhead::Int64     # Offset to first attribute descriptor record
@@ -13,28 +13,23 @@ struct GDR
     r_max_rec::Int32    # Maximum record number for r-variables
     r_num_dims::Int32   # Number of dimensions for r-variables
     NzVars::Int32      # Number of z-variables
-    uir_head::Int64     # Unused internal record head
-    rfu_c::Int32        # Reserved field C
-    leap_second_last_updated::Int32
-    rfu_e::Int32        # Reserved field E
-    r_dim_sizes::Tuple{Vararg{Int32}}  # Dimension sizes for r-variables
+    # uir_head::Int64     # Unused internal record head
+    # rfu_c::Int32        # Reserved field C
+    # leap_second_last_updated::Int32
+    # rfu_e::Int32        # Reserved field E
+    # r_dim_sizes::Tuple{Vararg{Int32}}  # Dimension sizes for r-variables
 end
 
 
 """
-    GDR(buffer::Vector{UInt8}, pos, RecordSizeType)
+    GDR(buffer::Vector{UInt8}, offset, FieldSizeT)
 
 Load a Global Descriptor Record from the buffer at the specified offset.
 """
-@inline function GDR(buffer::Vector{UInt8}, offset, RecordSizeType)
-    pos = offset + 1
-    header = Header(buffer, pos, RecordSizeType)
-    @assert header.record_type == 2
-    pos += sizeof(RecordSizeType) + 4
-
-    # Read GDR fields
-    fields, pos = @read_be_fields(buffer, pos, fieldtypes(GDR)[2:(end - 1)]...)
-    r_num_dims = fields[8]
-    r_dim_sizes = read_be(buffer, pos, r_num_dims, Int32)
-    return GDR(header, fields..., r_dim_sizes)
+@inline function GDR(buffer::Vector{UInt8}, offset, FieldSizeT)
+    pos = check_record_type(2, buffer, offset, FieldSizeT)  # 2 is the GDR record type
+    fields, pos = @read_be_fields(buffer, pos, fieldtypes(GDR)...)  # values for every GDR field
+    # r_num_dims = fields[8]
+    # r_dim_sizes = read_be(buffer, pos, r_num_dims, Int32)
+    return GDR(fields...)
 end
diff --git a/src/records/records.jl b/src/records/records.jl
@@ -13,18 +13,25 @@ struct Header
     record_type::Int32
 end
 
-@inline function Header(io::IO, RecordSizeType)
-    record_size = Int64(read_be(io, RecordSizeType))
+@inline function Header(io::IO, FieldSizeT)
+    record_size = Int64(read_be(io, FieldSizeT))
     record_type = read_be(io, Int32)
     return Header(record_size, record_type)
 end
 
-@inline function Header(buf::Vector{UInt8}, pos, RecordSizeType)
-    record_size = Int64(read_be(buf, pos, RecordSizeType))
-    record_type = read_be(buf, pos + sizeof(RecordSizeType), Int32)
+@inline function Header(buf::Vector{UInt8}, pos, FieldSizeT)
+    record_size = Int64(read_be(buf, pos, FieldSizeT))
+    record_type = read_be(buf, pos + sizeof(FieldSizeT), Int32)
     return Header(record_size, record_type)
 end
 
+@inline function check_record_type(record_type, buffer, offset, FieldSizeT)
+    pos = offset + sizeof(FieldSizeT) + 1  # the type field follows the record-size field
+    found = read_be(buffer, pos, Int32)  # validated with a throw so it is never compiled out
+    found == record_type || throw(AssertionError("expected record type $record_type, found $found"))
+    return pos + sizeof(Int32)  # position just past the type field
+end
+
 include("cdr.jl")
 include("vdr.jl")
 include("vxr.jl")
@@ -72,8 +79,8 @@ function Base.show(io::IO, cdr::CDR)
     flag_info = decode_cdr_flags(cdr.flags)
 
     println(io, "CDR (CDF Descriptor Record):")
-    println(io, "  Record Size: $(cdr.header.record_size) bytes")
-    println(io, "  Record Type: $(cdr.header.record_type)")
+    # println(io, "  Record Size: $(cdr.header.record_size) bytes")
+    # println(io, "  Record Type: $(cdr.header.record_type)")
     println(io, "  GDR Offset: 0x$(string(cdr.gdr_offset, base = 16, pad = 8))")
     println(io, "  Version: $(cdr.version).$(cdr.release).$(cdr.increment)")
     println(io, "  Encoding: $(cdr.encoding)")
diff --git a/src/records/vxr.jl b/src/records/vxr.jl
@@ -6,9 +6,10 @@ struct VXR
     vxr_next::Int64        # Next VXR in chain
     n_entries::Int32       # Number of entries
     n_used_entries::Int32  # Number of used entries
-    first::Tuple{Vararg{Int32}}   # First record numbers , Unused entries in this array contain 0xFFFFFFFF.
-    last::Tuple{Vararg{Int32}}    # Last record numbers, Unused entries in this array contain 0xFFFFFFFF.
-    offset::Tuple{Vararg{Int64}}  # Offsets to VVR/CVVR records
+    pointer::Ptr{Int32}
+    # first::Tuple{Vararg{Int32}}   # First record numbers , Unused entries in this array contain 0xFFFFFFFF.
+    # last::Tuple{Vararg{Int32}}    # Last record numbers, Unused entries in this array contain 0xFFFFFFFF.
+    # offset::Tuple{Vararg{Int64}}  # Offsets to VVR/CVVR records
 end
 
 
@@ -26,8 +27,16 @@ function VXR(source::Vector{UInt8}, offset, RecordSizeType)
     vxr_next, pos = read_be_i(source, pos, RecordSizeType)
     n_entries, pos = read_be_i(source, pos, Int32)
     n_used_entries, pos = read_be_i(source, pos, Int32)
-    first = read_be(source, pos, n_used_entries, Int32)
-    last = read_be(source, pos + 4 * n_entries, n_used_entries, Int32)
-    offset = read_be(source, pos + 8 * n_entries, n_used_entries, RecordSizeType)
-    return VXR(header, vxr_next, n_entries, n_used_entries, first, last, offset)
+    p = convert(Ptr{Int32}, pointer(source, pos))
+    return VXR(header, vxr_next, n_entries, n_used_entries, p)
 end
+
+function Base.iterate(vxr::VXR, state = 1)
+    state > vxr.n_used_entries && return nothing
+    base, n = vxr.pointer, vxr.n_entries  # layout: First[n], Last[n], then Offset[n]
+    first_rec = read_be(base, state)
+    last_rec = read_be(base, state + n)
+    # NOTE(review): offsets are always read as Int64 here; confirm for 32-bit-offset files.
+    offsets = convert(Ptr{Int64}, base + 2 * n * sizeof(Int32))
+    return ((first_rec, last_rec, read_be(offsets, state)), state + 1)
+end
diff --git a/src/variable.jl b/src/variable.jl
@@ -41,6 +41,8 @@ function Base.haskey(var::CDFVariable, name::String)
     return !isnothing(at)
 end
 
+attrib(var::CDFVariable, name::String) = vattrib(var.parentdataset, var.vdr.num, name)  # attribute `name` of this variable
+
 function CPR(var::CDFVariable)
     vdr = var.vdr
     cdf = var.parentdataset
diff --git a/test/CommonDataModelExt_test.jl b/test/CommonDataModelExt_test.jl
diff --git a/test/Manifest.toml b/test/Manifest.toml
diff --git a/test/Project.toml b/test/Project.toml
diff --git a/test/debug.jl b/test/debug.jl
diff --git a/test/perf_test.jl b/test/perf_test.jl
diff --git a/test/runtests.jl b/test/runtests.jl