Skip to content

Commit d777011

Browse files
authored
feat: add decompression support for CDF files (#8)
1 parent 4d1335f commit d777011

24 files changed

+377
-98
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@
55
/docs/Manifest*.toml
66
/docs/build/
77
ref/
8-
.claude/
8+
.claude/
9+
data/
10+
!data/a_*.cdf
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11

22
There are two main reference implementations in the ref/ folder. When having trouble implementing a related function, refer to the reference implementations to understand the logic and the details.
33
- CDFpp is a C++ implementation.
4-
- cdflib is a python implementation.
4+
- cdflib is a python implementation.
5+
6+
A simple and clear implementation is preferred over a verbose and complex one.

Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,19 @@ version = "0.1.0"
44
authors = ["Beforerr <zzj956959688@gmail.com> and contributors"]
55

66
[deps]
7+
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
78
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
89
Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
10+
LibDeflate = "9255714d-24a7-4b30-8ea3-d46a97f7e13b"
911
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
1012
StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
1113
UnixTimes = "ab1a18e7-b408-4913-896c-624bb82ed7f4"
1214

1315
[compat]
16+
CodecZlib = "0.7"
1417
Dates = "1"
1518
Dictionaries = "0.4"
19+
LibDeflate = "0.4.3"
1620
Mmap = "1"
1721
StaticStrings = "0.2.6"
1822
UnixTimes = "1.7.2"

README.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@ println("Variables: ", keys(cdf))
3636

3737
# Access a variable
3838
var = cdf["temperature"]
39-
println("Variable data: ", var.data)
40-
println("Data type: ", var.data_type)
41-
println("Dimensions: ", var.dimensions)
4239
```
4340

4441
## Elsewhere
42.5 KB
Binary file not shown.

data/a_compressed_cdf.cdf

6.01 KB
Binary file not shown.

data/a_rle_compressed_cdf.cdf

73.1 KB
Binary file not shown.

src/CommonDataFormat.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ using Dates, UnixTimes
44
using Mmap
55
using Dictionaries
66
using StaticStrings
7+
using Base.Threads
8+
using CodecZlib: GzipDecompressor, transcode
9+
using LibDeflate
10+
using LibDeflate: GzipDecompressResult
711

812
export CDFDataset, CDFVariable
913
export Majority, CompressionType, DataType
@@ -12,9 +16,10 @@ export Epoch, Epoch16, TT2000
1216
include("epochs.jl")
1317
include("enums.jl")
1418
include("parsing.jl")
19+
include("decompress.jl")
1520
include("records/records.jl")
16-
include("dataset.jl")
1721
include("variable.jl")
22+
include("dataset.jl")
1823
include("loading/attribute.jl")
1924
include("loading/variable.jl")
2025

src/dataset.jl

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@ function CDFDataset(filename)
2626
magic_bytes = read_be(buffer, 1, UInt32)
2727
@assert validate_cdf_magic(magic_bytes)
2828

29-
# Read compression info
30-
compression_bytes = read_be(buffer, 5, UInt32)
31-
compression = CompressionType(compression_bytes)
32-
RecordSizeType = is_cdf_v3(magic_bytes) ? UInt64 : UInt32
29+
RecordSizeType = is_cdf_v3(magic_bytes) ? Int64 : Int32
30+
compression_flag = read_be(buffer, 5, UInt32)
31+
compression = NoCompression
32+
if compression_flag != 0x0000FFFF
33+
buffer, compression = decompress_bytes(buffer, RecordSizeType)
34+
end
3335
# Parse CDF header
3436
cdr = CDR(buffer, 9, RecordSizeType)
3537
gdr = GDR(buffer, cdr.gdr_offset, RecordSizeType)
@@ -98,3 +100,15 @@ function Base.keys(cdf::CDFDataset)
98100
end
99101

100102
"""
    haskey(cdf::CDFDataset, var_name::String)

Return `true` when `var_name` is one of `keys(cdf)`.
"""
function Base.haskey(cdf::CDFDataset, var_name::String)
    return var_name ∈ keys(cdf)
end
103+
104+
"""
    iterate(cdf::CDFDataset, state = 1)

Iterate over the entries of `cdf` in the order of `keys(cdf)`.

`state` is the 1-based position into `keys(cdf)`. Returns `(cdf[key], state + 1)`,
or `nothing` once `state` exceeds `length(cdf)`.
"""
function Base.iterate(cdf::CDFDataset, state = 1)
    state > length(cdf) && return nothing
    name = keys(cdf)[state]
    return cdf[name], state + 1
end
105+
106+
"""
    show(io::IO, ::MIME"text/plain", cdf::CDFDataset)

Print a multi-line summary of `cdf` to `io`: its type and `filename`, a `variables`
heading followed by one line per key, and finally `cdf.cdr`.
"""
function Base.show(io::IO, ::MIME"text/plain", cdf::CDFDataset)
    println(io, typeof(cdf), ":", cdf.filename)
    println(io, "variables")
    foreach(keys(cdf)) do name
        println(io, " $name")
    end
    println(io, cdf.cdr)
    return nothing
end

src/decompress.jl

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
include("decompress/rle.jl")
2+
include("decompress/gzip.jl")
3+
4+
5+
"""
    decompress_bytes(buffer::Vector{UInt8}, RecordSizeType)

Rebuild a compressed CDF file image as an uncompressed byte vector.

The CCR is parsed at offset 8 and the CPR at `ccr.cpr_offset`. The payload returned
by `data_view(ccr, buffer)` is decompressed with the compression type stored in the
CPR, using `ccr.uncompressed_size` as the expected size.

The new buffer is laid out as:
- bytes 1-4: copied from `buffer`;
- bytes 5-8: `0x00 0x00 0xFF 0xFF` (the value `CDFDataset` compares against before
  deciding whether to decompress);
- bytes 9-end: the decompressed payload.

Returns the tuple `(new_buffer, compression)`.
"""
function decompress_bytes(buffer::Vector{UInt8}, RecordSizeType)
    ccr = CCR(buffer, 8, RecordSizeType)
    cpr = CPR(buffer, Int(ccr.cpr_offset), RecordSizeType)
    compression = CompressionType(cpr.compression_type)

    body = decompress_bytes(
        data_view(ccr, buffer), compression;
        expected_bytes = Int(ccr.uncompressed_size),
    )
    nbody = length(body)

    out = Vector{UInt8}(undef, 8 + nbody)
    copyto!(out, 1, buffer, 1, 4)
    out[5:8] .= (0x00, 0x00, 0xFF, 0xFF)
    copyto!(out, 9, body, 1, nbody)
    return out, compression
end
22+
23+
"""
    decompress_bytes(data::AbstractVector{UInt8}, compression::CompressionType; expected_bytes = nothing)

Decompress `data` according to `compression` and return the resulting bytes.

- `NoCompression`: `data` itself is returned (no copy, no size check).
- `GzipCompression`: decoded with `transcode(GzipDecompressor, ...)` on a `Vector{UInt8}`
  copy of `data`.
- `RLECompression`: decoded with `_rle_decompress(data, expected_bytes)`.

# Keywords
- `expected_bytes`: expected decompressed size in bytes. Mandatory for RLE. When given,
  the length of the result is checked against it.

# Throws
- `ArgumentError` if `compression` is not supported, if RLE is requested without
  `expected_bytes`, or if the decompressed length differs from `expected_bytes`.
"""
function decompress_bytes(
        data::AbstractVector{UInt8}, compression::CompressionType;
        expected_bytes::Union{Nothing, Int} = nothing,
    )
    compression == NoCompression && return data

    # Exhaustive branch with an explicit error: unlike `@assert`, this check cannot be
    # compiled out, so `result` is always defined past this point.
    result = if compression == GzipCompression
        transcode(GzipDecompressor, Vector{UInt8}(data))
    elseif compression == RLECompression
        isnothing(expected_bytes) &&
            throw(ArgumentError("RLE decompression requires expected size"))
        _rle_decompress(data, expected_bytes)
    else
        throw(ArgumentError("Unsupported compression type: $(compression)"))
    end

    if !isnothing(expected_bytes) && length(result) != expected_bytes
        throw(ArgumentError("Decompressed payload size mismatch (expected $(expected_bytes), got $(length(result)))"))
    end
    return result
end
37+
38+
"""
    decompress_bytes!(decompressor, dest, doffs, src, soffs, N, n_in, compression)

Fill `N` elements of `dest`, starting at element index `doffs`, from the bytes of `src`
starting at index `soffs`.

- `NoCompression`: delegates to `_copy_to!(dest, doffs, src, soffs, N)`.
- `GzipCompression`: decompresses `n_in` input bytes straight into `dest` with
  `_unsafe_gzip_decompress!` and `decompressor`.
- `RLECompression`: decodes `n_in` input bytes with `_rle_decompress` and copies the
  result into `dest`.

For the compressed cases the number of output bytes is `N * sizeof(eltype(dest))`.

Returns `nothing`.

# Throws
- `ArgumentError` for an unsupported `compression`, or when the RLE output size is not
  the expected number of bytes.
- An error if the gzip decompressor reports a `LibDeflateError`.
"""
function decompress_bytes!(decompressor, dest, doffs, src::AbstractVector{UInt8}, soffs, N, n_in, compression::CompressionType)
    if compression == NoCompression
        _copy_to!(dest, doffs, src, soffs, N)
        return nothing
    end

    n_out = N * sizeof(eltype(dest))
    if compression == GzipCompression
        # Keep both arrays rooted while raw pointers into them are in use.
        out = GC.@preserve dest src begin
            _unsafe_gzip_decompress!(
                decompressor, pointer(dest, doffs), n_out, pointer(src, soffs), n_in,
            )
        end
        # Explicit check instead of `@assert`, which is not guaranteed to run.
        out isa LibDeflateError && error("Gzip decompression failed: $(out)")
    elseif compression == RLECompression
        # Previously this branch was empty and left `dest` untouched.
        # NOTE(review): assumes `_rle_decompress` returns a dense byte vector, as its use
        # in `decompress_bytes` suggests — confirm against decompress/rle.jl.
        decoded = _rle_decompress(view(src, soffs:(soffs + n_in - 1)), Int(n_out))
        length(decoded) == n_out ||
            throw(ArgumentError("Decompressed payload size mismatch (expected $(n_out), got $(length(decoded)))"))
        GC.@preserve dest decoded begin
            unsafe_copyto!(Ptr{UInt8}(pointer(dest, doffs)), pointer(decoded), n_out)
        end
    else
        throw(ArgumentError("Unsupported compression type: $(compression)"))
    end
    return nothing
end

0 commit comments

Comments
 (0)