Skip to content

Commit 6ff465c

Browse files
committed
persistent DuckDB connection to avoid Windows heap corruption
1 parent 560b930 commit 6ff465c

5 files changed

Lines changed: 297 additions & 614 deletions

File tree

.github/workflows/julia_fix.yml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
name: Julia Fix CI
2+
3+
# Scoped to the julia_fix branch so it doesn't fire on every push.
4+
# Workflow_dispatch lets you replay it manually after a force-push.
5+
on:
6+
push:
7+
branches: [julia_fix]
8+
pull_request:
9+
branches: [julia_fix]
10+
workflow_dispatch:
11+
12+
permissions:
13+
contents: read
14+
15+
16+
jobs:
17+
build-lib:
18+
name: libcozip (${{ matrix.name }})
19+
strategy:
20+
fail-fast: false
21+
matrix:
22+
include:
23+
- name: linux-x86_64
24+
runner: ubuntu-24.04
25+
container: quay.io/pypa/manylinux_2_28_x86_64
26+
lib_ext: so
27+
shell: bash
28+
- name: macos-universal
29+
runner: macos-14
30+
container: ''
31+
lib_ext: dylib
32+
shell: bash
33+
cmake_extra: >-
34+
"-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64"
35+
-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
36+
- name: windows-x86_64
37+
runner: windows-2022
38+
container: ''
39+
lib_ext: dll
40+
shell: pwsh
41+
runs-on: ${{ matrix.runner }}
42+
container: ${{ matrix.container }}
43+
defaults:
44+
run:
45+
shell: ${{ matrix.shell }}
46+
steps:
47+
- uses: actions/checkout@v4
48+
49+
# manylinux ships gcc/cmake but not ninja.
50+
- if: startsWith(matrix.name, 'linux')
51+
run: yum install -y ninja-build
52+
- if: runner.os == 'macOS'
53+
run: brew install ninja
54+
- if: runner.os == 'Windows'
55+
run: choco install ninja -y
56+
57+
# cl.exe + nmake on PATH for cmake.
58+
- if: runner.os == 'Windows'
59+
uses: ilammy/msvc-dev-cmd@v1
60+
with:
61+
arch: x64
62+
63+
- name: Configure
64+
run: >
65+
cmake -B build -S core -G Ninja
66+
-DCMAKE_BUILD_TYPE=Release
67+
-DCMAKE_INSTALL_PREFIX=install
68+
-DCMAKE_INSTALL_LIBDIR=lib
69+
-DBUILD_SHARED_LIBS=ON
70+
${{ matrix.cmake_extra }}
71+
72+
- run: cmake --build build --config Release --parallel
73+
- run: cmake --install build --config Release
74+
75+
# Julia's _resolve_lib_path expects the shared lib at the root of
76+
# the artifact (COZIP_LIB_PATH points straight at it). No header
77+
# or license needed for the test step.
78+
- name: Stage lib (Unix)
79+
if: runner.os != 'Windows'
80+
run: |
81+
mkdir -p staged
82+
cp install/lib/cozip.${{ matrix.lib_ext }} staged/
83+
84+
- name: Stage lib (Windows)
85+
if: runner.os == 'Windows'
86+
run: |
87+
New-Item -ItemType Directory -Force -Path staged | Out-Null
88+
Copy-Item install\bin\cozip.dll staged\
89+
90+
- uses: actions/upload-artifact@v4
91+
with:
92+
name: libcozip-${{ matrix.name }}
93+
path: staged/
94+
if-no-files-found: error
95+
96+
97+
build-julia-package:
98+
name: Julia package (${{ matrix.name }})
99+
needs: build-lib
100+
strategy:
101+
fail-fast: false
102+
matrix:
103+
include:
104+
- name: linux-x86_64
105+
runner: ubuntu-24.04
106+
lib_ext: so
107+
- name: macos-universal
108+
runner: macos-14
109+
lib_ext: dylib
110+
- name: windows-x86_64
111+
runner: windows-2022
112+
lib_ext: dll
113+
runs-on: ${{ matrix.runner }}
114+
steps:
115+
- uses: actions/checkout@v4
116+
117+
- uses: julia-actions/setup-julia@v2
118+
with:
119+
version: '1'
120+
121+
- uses: julia-actions/cache@v2
122+
123+
- uses: actions/download-artifact@v4
124+
with:
125+
name: libcozip-${{ matrix.name }}
126+
path: libcozip-bin/
127+
128+
# Override Artifacts.toml so the test uses the just-built lib,
129+
# not whatever the last published release points at.
130+
- name: Set COZIP_LIB_PATH (Unix)
131+
if: runner.os != 'Windows'
132+
run: echo "COZIP_LIB_PATH=$(pwd)/libcozip-bin/cozip.${{ matrix.lib_ext }}" >> "$GITHUB_ENV"
133+
134+
- name: Set COZIP_LIB_PATH (Windows)
135+
if: runner.os == 'Windows'
136+
shell: pwsh
137+
run: |
138+
$p = Join-Path (Get-Location) "libcozip-bin\cozip.dll"
139+
Add-Content -Path $env:GITHUB_ENV -Value "COZIP_LIB_PATH=$p"
140+
141+
- name: Test
142+
run: julia --project=julia -e 'using Pkg; Pkg.instantiate(); Pkg.test()'

julia/src/Cozip.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ include("Reader.jl")
66

77
const write = create
88

9+
# Open the shared DuckDB connection once at module load and register
10+
# an atexit hook to close it before Julia unloads DLLs.
11+
function __init__()
12+
_init_duckdb!()
13+
atexit(_close_duckdb!)
14+
end
15+
916
export stage_metadata, stage_create, create
1017

1118
end # module

julia/src/Reader.jl

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,26 @@ using DuckDB
66
function read end
77

88

9+
# Cozip and httpfs are loaded lazily on the first Reader.read call,
10+
# not in Cozip.__init__, so `using Cozip` stays fully offline. The
11+
# writer path doesn't need either extension and shouldn't pay for
12+
# the community-registry roundtrip on cold cache.
13+
const _DUCKDB_EXTS_LOADED = Ref(false)
14+
15+
16+
# Always called while holding _DUCKDB_LOCK (from Writer.jl), so the
17+
# check-then-set is race-free without extra locking.
18+
function _ensure_duckdb_extensions!(con)
19+
_DUCKDB_EXTS_LOADED[] && return nothing
20+
DBInterface.execute(con, "INSTALL httpfs")
21+
DBInterface.execute(con, "LOAD httpfs")
22+
DBInterface.execute(con, "INSTALL cozip FROM community")
23+
DBInterface.execute(con, "LOAD cozip")
24+
_DUCKDB_EXTS_LOADED[] = true
25+
return nothing
26+
end
27+
28+
929
"""
1030
read(source; columns=nothing, gdal_vsi=true) -> DataFrame
1131
@@ -33,22 +53,16 @@ function read(
3353
throw(ArgumentError("cozip: `columns` entries must be non-empty"))
3454
end
3555

36-
db = DuckDB.DB()
37-
con = DBInterface.connect(db)
38-
try
39-
DBInterface.execute(con, "INSTALL httpfs")
40-
DBInterface.execute(con, "LOAD httpfs")
41-
DBInterface.execute(con, "INSTALL cozip FROM community")
42-
DBInterface.execute(con, "LOAD cozip")
56+
con = _duckdb_con()
57+
58+
return lock(_DUCKDB_LOCK) do
59+
_ensure_duckdb_extensions!(con)
4360

4461
sql = string(
4562
"SELECT ", _build_select(columns, gdal_vsi),
4663
" FROM read_cozip(?, gdal_vsi := ", gdal_vsi ? "true" : "false", ")",
4764
)
48-
return DataFrame(DBInterface.execute(con, sql, [src]))
49-
finally
50-
DBInterface.close!(con)
51-
DBInterface.close!(db)
65+
DataFrame(DBInterface.execute(con, sql, [src]))
5266
end
5367
end
5468

julia/src/Writer.jl

Lines changed: 74 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,49 @@ const _RESERVED_INPUT_COLUMNS = Set(["offset", "size"])
1818
const _REQUIRED_METADATA_COLUMNS = Set(["name", "offset", "size"])
1919

2020

21+
# Shared DuckDB connection used by every parquet helper here and by
22+
# Reader.read.
23+
24+
const _DUCKDB_DB = Ref{Union{Nothing,DuckDB.DB}}(nothing)
25+
const _DUCKDB_CON = Ref{Union{Nothing,DuckDB.Connection}}(nothing)
26+
const _DUCKDB_LOCK = ReentrantLock()
27+
28+
29+
function _init_duckdb!()
30+
_DUCKDB_CON[] === nothing || return nothing
31+
_DUCKDB_DB[] = DuckDB.DB()
32+
_DUCKDB_CON[] = DBInterface.connect(_DUCKDB_DB[])
33+
return nothing
34+
end
35+
36+
37+
function _close_duckdb!()
38+
if _DUCKDB_CON[] !== nothing
39+
try
40+
DBInterface.close!(_DUCKDB_CON[])
41+
catch
42+
end
43+
_DUCKDB_CON[] = nothing
44+
end
45+
if _DUCKDB_DB[] !== nothing
46+
try
47+
DBInterface.close!(_DUCKDB_DB[])
48+
catch
49+
end
50+
_DUCKDB_DB[] = nothing
51+
end
52+
return nothing
53+
end
54+
55+
56+
function _duckdb_con()
57+
con = _DUCKDB_CON[]
58+
con === nothing &&
59+
error("cozip: DuckDB connection not initialized; Cozip.__init__ must run first")
60+
return con
61+
end
62+
63+
2164
"""
2265
stage_metadata(table) -> (metadata, paths)
2366
@@ -305,31 +348,25 @@ function _alloc_user_entries(
305348
end
306349

307350

308-
# Cached: short-lived connections trigger Windows heap corruption (duckdb#10787).
309-
const _DUCKDB_DB = Ref{Any}(nothing)
310-
const _DUCKDB_CON = Ref{Any}(nothing)
311-
312-
function _duckdb_con()
313-
if _DUCKDB_CON[] === nothing
314-
_DUCKDB_DB[] = DuckDB.DB()
315-
_DUCKDB_CON[] = DBInterface.connect(_DUCKDB_DB[])
351+
function _read_parquet_columns(path::String)::Vector{String}
352+
con = _duckdb_con()
353+
return lock(_DUCKDB_LOCK) do
354+
result = DBInterface.execute(con,
355+
"SELECT * FROM read_parquet('$(_sql_escape(path))') LIMIT 0")
356+
names(DataFrame(result))
316357
end
317-
return _DUCKDB_CON[]
318358
end
319359

320360

321-
function _read_parquet_columns(path::String)::Vector{String}
322-
result = DBInterface.execute(_duckdb_con(),
323-
"SELECT * FROM read_parquet('$(_sql_escape(path))') LIMIT 0")
324-
return names(DataFrame(result))
325-
end
326-
327361
function _read_parquet_subset(path::String, columns::Vector{String})::DataFrame
328-
# `offset` and `size` are reserved in DuckDB SQL.
329-
cols_sql = join(["\"$c\"" for c in columns], ", ")
330-
result = DBInterface.execute(_duckdb_con(),
331-
"SELECT $cols_sql FROM read_parquet('$(_sql_escape(path))')")
332-
return DataFrame(result)
362+
con = _duckdb_con()
363+
return lock(_DUCKDB_LOCK) do
364+
# `offset` and `size` are reserved in DuckDB SQL.
365+
cols_sql = join(["\"$c\"" for c in columns], ", ")
366+
result = DBInterface.execute(con,
367+
"SELECT $cols_sql FROM read_parquet('$(_sql_escape(path))')")
368+
DataFrame(result)
369+
end
333370
end
334371

335372

@@ -390,22 +427,27 @@ function _write_metadata_parquet(df::DataFrame, out_path::AbstractString)
390427
cols = names(df)
391428
col_types = [string("\"", c, "\" ", _julia_to_duckdb_type(eltype(df[!, c])))
392429
for c in cols]
393-
tbl = "tmp_cozip_$(getpid())_$(time_ns())"
430+
394431
con = _duckdb_con()
395432

396-
try
397-
DBInterface.execute(con,
398-
"CREATE TABLE $tbl ($(join(col_types, ", ")))")
399-
for i in 1:nrow(df)
400-
vals = join((_sql_literal(df[i, c]) for c in cols), ", ")
401-
DBInterface.execute(con, "INSERT INTO $tbl VALUES ($vals)")
433+
lock(_DUCKDB_LOCK) do
434+
try
435+
DBInterface.execute(con,
436+
"CREATE OR REPLACE TEMP TABLE tmp_cozip ($(join(col_types, ", ")))")
437+
438+
for i in 1:nrow(df)
439+
vals = join((_sql_literal(df[i, c]) for c in cols), ", ")
440+
DBInterface.execute(con, "INSERT INTO tmp_cozip VALUES ($vals)")
441+
end
442+
443+
kv_sql = _kv_metadata_sql(_table_metadata(df))
444+
DBInterface.execute(con,
445+
"COPY tmp_cozip TO '$(_sql_escape(String(out_path)))' (FORMAT parquet$kv_sql)")
446+
finally
447+
DBInterface.execute(con, "DROP TABLE IF EXISTS tmp_cozip")
402448
end
403-
kv_sql = _kv_metadata_sql(_table_metadata(df))
404-
DBInterface.execute(con,
405-
"COPY $tbl TO '$(_sql_escape(String(out_path)))' (FORMAT parquet$kv_sql)")
406-
finally
407-
DBInterface.execute(con, "DROP TABLE IF EXISTS $tbl")
408449
end
450+
409451
return nothing
410452
end
411453

0 commit comments

Comments
 (0)