Skip to content

Add support for zero-copy conversions from an xarray to a DimArray #972

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions docs/src/xarray.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,27 @@ converting these Xarray types to their DimensionalData equivalent:
```julia
import PythonCall: pyconvert

# By default this will share the underlying array
my_dimarray = pyconvert(DimArray, my_dataarray)

my_dimstack = pyconvert(DimStack, my_dataset)
```

Note that:
- The current implementation will make a copy of the underlying arrays.
Here are some things to keep in mind when converting:
- `pyconvert(DimArray, x)` is zero-copy by default, i.e. it will share the
underlying array with Python and register itself with Python's GC to ensure
that the memory isn't garbage-collected prematurely. If you want to make a
copy, you can call it like `pyconvert(DimArray, x; copy=true)`.
- When doing a zero-copy conversion from `x` to `x_jl`, `parent(x_jl)` will be a
[PyArray](https://juliapy.github.io/PythonCall.jl/stable/pythoncall-reference/#PythonCall.Wrap.PyArray). In
most situations there should be no overhead from this, but note that a
`PyArray` is not a `DenseArray`, so some operations that dispatch on
`DenseArray` may not be performant, e.g. BLAS calls. See these issues for more
information:
- https://github.com/JuliaPy/PythonCall.jl/issues/319
- https://github.com/JuliaPy/PythonCall.jl/issues/182

When `copy=true`, `parent(x_jl)` will always be a standard `Array`.
- Python stores arrays in row-major order whereas Julia stores them in
column-major order, hence the dimensions on a converted `DimArray` will be in
reverse order from the original `DataArray`. This is done to ensure that the
Expand Down
76 changes: 26 additions & 50 deletions ext/DimensionalDataPythonCall.jl
Original file line number Diff line number Diff line change
@@ -1,41 +1,12 @@
module DimensionalDataPythonCall

using DimensionalData
import DimensionalData as DD
import PythonCall
import PythonCall: Py, pyis, pyconvert, pytype, pybuiltins
import DimensionalData.Lookups: NoLookup
import PythonCall: Py, PyArray, pyis, pyconvert, pytype, pybuiltins, pylen

"""
    dtype2type(dtype::String)

Return the Julia element type that corresponds to the dtype name `dtype`.

Recognised names are `float16`/`float32`/`float64`, `int8` through `int64`,
`uint8` through `uint64`, and `bool`. Any other name raises an error that
reports the unsupported dtype.
"""
function dtype2type(dtype::String)
    # Name => type table; scanned linearly, which is fine for a dozen entries.
    known_dtypes = (
        "float16" => Float16,
        "float32" => Float32,
        "float64" => Float64,
        "int8" => Int8,
        "int16" => Int16,
        "int32" => Int32,
        "int64" => Int64,
        "uint8" => UInt8,
        "uint16" => UInt16,
        "uint32" => UInt32,
        "uint64" => UInt64,
        "bool" => Bool,
    )

    for (dtype_name, julia_type) in known_dtypes
        dtype_name == dtype && return julia_type
    end

    # No entry matched: report the offending name to the caller.
    error("Unsupported dtype: '$dtype'")
end

function PythonCall.pyconvert(::Type{DimArray}, x::Py, d=nothing)
function PythonCall.pyconvert(::Type{DimArray}, x::Py, d=nothing; copy=false)
x_pytype = string(pytype(x).__name__)
if x_pytype != "DataArray"
if isnothing(d)
Expand All @@ -46,36 +17,30 @@ function PythonCall.pyconvert(::Type{DimArray}, x::Py, d=nothing)
end

# Transpose here so that the fast axis remains the same in the Julia array
data_npy = x.data.T
data_type = dtype2type(string(data_npy.dtype.name))
data_ndim = pyconvert(Int, data_npy.ndim)
data = pyconvert(Array{data_type, data_ndim}, data_npy)
data_py = PyArray(x.data.T; copy=false)
data = copy ? pyconvert(Array, data_py) : data_py

dim_names = Symbol.(collect(x.dims))
coord_names = Symbol.(collect(x.coords.keys()))
lookups_vec = Pair{Symbol, Any}[]
new_dims = Dim[]
for dim in reverse(dim_names) # Iterate in reverse order because of row/col major
if dim in coord_names
coord = getproperty(x, dim).data
coord_type = dtype2type(string(coord.dtype.name))
coord_ndim = pyconvert(Int, coord.ndim)

push!(lookups_vec, dim => pyconvert(Array{coord_type, coord_ndim}, coord))
coord_py = PyArray(getproperty(x, dim).data; copy=false)
coord = copy ? pyconvert(Array, coord_py) : coord_py
push!(new_dims, Dim{dim}(coord))
else
push!(lookups_vec, dim => NoLookup())
push!(new_dims, Dim{dim}())
end
end

lookups = NamedTuple(lookups_vec)

metadata = pyconvert(Dict, x.attrs)
metadata = pylen(x.attrs) == 0 ? DD.NoMetadata() : pyconvert(Dict, x.attrs)

array_name = pyis(x.name, pybuiltins.None) ? nothing : string(x.name)
array_name = pyis(x.name, pybuiltins.None) ? DD.NoName() : string(x.name)

return DimArray(data, lookups; name=array_name, metadata)
return DimArray(data, Tuple(new_dims); name=array_name, metadata)
end

function PythonCall.pyconvert(::Type{DimStack}, x::Py, d=nothing)
function PythonCall.pyconvert(::Type{DimStack}, x::Py, d=nothing; copy=false)
x_pytype = string(pytype(x).__name__)
if x_pytype != "Dataset"
if isnothing(d)
Expand All @@ -88,12 +53,23 @@ function PythonCall.pyconvert(::Type{DimStack}, x::Py, d=nothing)
variable_names = Symbol.(collect(x.data_vars.keys()))
arrays = Dict{Symbol, DimArray}()
for name in variable_names
arrays[name] = pyconvert(DimArray, getproperty(x, name))
arrays[name] = pyconvert(DimArray, getproperty(x, name); copy)
end

metadata = pyconvert(Dict, x.attrs)

return DimStack(NamedTuple(arrays); metadata)
end

# Precompile the main entry point `pyconvert(DimArray, ::Py)` so the first
# conversion does not pay the compilation cost at call time.
# First signature: the plain positional call with no keyword arguments.
precompile(Tuple{typeof(PythonCall.Core.pyconvert), Type{DimensionalData.DimArray{T, N, D, R, A, Na, Me} where Me where Na where A<:AbstractArray{T, N} where R<:Tuple where D<:Tuple where N where T}, PythonCall.Core.Py})
# Second signature: the keyword form, expressed through `Core.kwcall` with a
# `(; copy::Bool)` NamedTuple. It is keyed on the keyword's type, so one
# statement covers both `copy=true` and `copy=false`.
precompile(Tuple{typeof(Core.kwcall), NamedTuple{(:copy,), Tuple{Bool}}, typeof(PythonCall.Core.pyconvert), Type{DimensionalData.DimArray{T, N, D, R, A, Na, Me} where Me where Na where A<:AbstractArray{T, N} where R<:Tuple where D<:Tuple where N where T}, PythonCall.Core.Py})

# Precompile lower-level `pyconvert(Array{T, N}, ::Py)` conversion calls for
# common element types and for 1 to 5 dimensions.
# NOTE(review): these signatures take a concrete `Array{T, N}` target and a `Py`
# source — confirm they match the conversion calls the extension actually makes.
for T in (Int32, Int64, UInt32, UInt64, Float32, Float64)
for N in (1, 2, 3, 4, 5)
precompile(Tuple{typeof(PythonCall.Core.pyconvert), Type{Array{T, N}}, PythonCall.Core.Py})
end
end

end
18 changes: 17 additions & 1 deletion test/xarray.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
ENV["JULIA_CONDAPKG_ENV"] = "@dimensionaldata-tests"
ENV["JULIA_CONDAPKG_BACKEND"] = "MicroMamba"

# If you've already run the tests once to create the test Python environment,
# you can comment out the lines above and uncomment the lines below. That will
# re-use the environment without re-resolving it, which is a bit faster.
# ENV["JULIA_PYTHONCALL_EXE"] = joinpath(Base.DEPOT_PATH[1], "conda_environments", "dimensionaldata-tests", "bin", "python")
# ENV["JULIA_CONDAPKG_BACKEND"] = "Null"

using DimensionalData, Test, PythonCall
import DimensionalData.Dimensions: NoLookup, NoMetadata

Expand Down Expand Up @@ -36,11 +42,21 @@ x2 = xr.DataArray(data2,
"pos" => 0.48,
"foo" => [1, 2, 3])

# Test the zero-copy support
y[1, 1] = 42f0
@test parent(y) isa PyArray
@test pyconvert(Float32, x[0, 0].item()) == 42f0

# Test copying
y_copy = pyconvert(DimArray, x; copy=true)
@test y == y_copy
@test parent(y_copy) isa Array

@test_throws ArgumentError pyconvert(DimArray, xr)
@test pyconvert(DimArray, xr, 42) == 42

# Sanity test for higher-dimensional arrays
x3 = xr.DataArray(rand(2, 5, 5, 3),
x3 = xr.DataArray(np.random.rand(2, 5, 5, 3).astype(np.float32),
dims=("w", "x", "y", "z"),
coords=Dict("w" => [1, 2], "z" => [1, 2, 3]))
y = pyconvert(DimArray, x3)
Expand Down
Loading