Skip to content

Add ingestion of matrices #1150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/include/typesr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ enum class RTypeId {
LOGICAL,
INTEGER,
NUMERIC,
COMPLEX,
STRING,
FACTOR,
DATE,
Expand All @@ -43,12 +44,14 @@ enum class RTypeId {
// No RType equivalent
BYTE,
LIST,
MATRIX,
STRUCT,
};

struct RType {
RType();
RType(RTypeId id); // NOLINT: Allow implicit conversion from `RTypeId`
RType(RTypeId id, R_len_t size); // Two-argument form: `size` is kept in `size_` (RType::MATRIX passes the column count)
RType(const RType &other);
RType(RType &&other) noexcept;

Expand All @@ -57,12 +60,14 @@ struct RType {
// Copy assignment: duplicates the type id, the size and the child list of `other`.
inline RType &operator=(const RType &other) {
	size_ = other.size_;
	id_ = other.id_;
	aux_ = other.aux_;
	return *this;
}
// Move assignment: takes over the id and size of `other` and exchanges the child lists,
// which leaves `other` holding this object's previous children.
inline RType &operator=(RType &&other) noexcept {
	size_ = other.size_;
	id_ = other.id_;
	std::swap(aux_, other.aux_);
	return *this;
}
Expand All @@ -76,6 +81,7 @@ struct RType {
static constexpr const RTypeId LOGICAL = RTypeId::LOGICAL;
static constexpr const RTypeId INTEGER = RTypeId::INTEGER;
static constexpr const RTypeId NUMERIC = RTypeId::NUMERIC;
static constexpr const RTypeId COMPLEX = RTypeId::COMPLEX;
static constexpr const RTypeId STRING = RTypeId::STRING;
static constexpr const RTypeId DATE = RTypeId::DATE;
static constexpr const RTypeId DATE_INTEGER = RTypeId::DATE_INTEGER;
Expand Down Expand Up @@ -105,8 +111,13 @@ struct RType {
static RType STRUCT(child_list_t<RType> &&children);
child_list_t<RType> GetStructChildTypes() const;

static RType MATRIX(const RType &child, R_len_t ncols);
RType GetMatrixElementType() const;
R_len_t GetMatrixNcols() const;

private:
RTypeId id_;
R_len_t size_;
child_list_t<RType> aux_;
};

Expand Down
84 changes: 84 additions & 0 deletions src/scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using namespace duckdb;
using namespace cpp11;

static
data_ptr_t GetColDataPtr(const RType &rtype, SEXP coldata) {
switch (rtype.id()) {
case RType::LOGICAL:
Expand Down Expand Up @@ -51,6 +52,7 @@ data_ptr_t GetColDataPtr(const RType &rtype, SEXP coldata) {
return (data_ptr_t)DATAPTR_RO(coldata);
case RTypeId::LIST:
return (data_ptr_t)DATAPTR_RO(coldata);
case RTypeId::MATRIX:
case RTypeId::STRUCT:
// Will bind child columns dynamically. Could also optimize by descending early and recording.
return (data_ptr_t)coldata;
Expand Down Expand Up @@ -83,6 +85,7 @@ static void AppendColumnSegment(SRC *source_data, idx_t sexp_offset, Vector &res
}
}

static
void AppendListColumnSegment(const RType &rtype, SEXP *source_data, idx_t sexp_offset, Vector &result, idx_t count) {
source_data += sexp_offset;
auto &result_mask = FlatVector::Validity(result);
Expand All @@ -104,9 +107,84 @@ void AppendListColumnSegment(const RType &rtype, SEXP *source_data, idx_t sexp_o
}
}

template <class SRC, class DST, class RTYPE>
static inline
void AppendMatrixSegmentAtomic(SRC *src_ptr, int nrows, int ncols, idx_t sexp_offset,
Vector &child_vector, idx_t count) {
auto child_data = FlatVector::GetData<DST>(child_vector);
auto &child_mask = FlatVector::Validity(child_vector);
idx_t vector_idx = 0;
for (idx_t i = 0; i < count; i++) {
auto matrix_elt_idx = sexp_offset + i;
for (idx_t k = 0; k < ncols; k++) {
auto val = src_ptr[matrix_elt_idx];
if (RTYPE::IsNull(val)) {
child_mask.SetInvalid(vector_idx++);
} else {
child_data[vector_idx++] = RTYPE::Convert(val);
}
matrix_elt_idx += nrows;
}
}
}

static
void AppendMatrixColumnSegment(const RType &rtype, bool experimental, SEXP source_data, idx_t sexp_offset, Vector &result, idx_t count) {
auto element_rtype = rtype.GetMatrixElementType();
auto nrows = Rf_nrows(source_data);
auto ncols = Rf_ncols(source_data);
auto &child_vector = ArrayVector::GetEntry(result);

switch (element_rtype.id()) {
case RType::LOGICAL: //LGLSXP
AppendMatrixSegmentAtomic<int, bool, RBooleanType>(LOGICAL_POINTER(source_data),
nrows, ncols, sexp_offset, child_vector, count);
break;

case RType::INTEGER: //INTSXP
AppendMatrixSegmentAtomic<int, int, RIntegerType>(INTEGER_POINTER(source_data),
nrows, ncols, sexp_offset, child_vector, count);
break;

case RType::INTEGER64: //REALSXP
AppendMatrixSegmentAtomic<int64_t, int64_t, RInteger64Type>((int64_t *)NUMERIC_POINTER(source_data),
nrows, ncols, sexp_offset, child_vector, count);
break;

case RType::NUMERIC: //REALSXP
AppendMatrixSegmentAtomic<double, double, RDoubleType>(NUMERIC_POINTER(source_data),
nrows, ncols, sexp_offset, child_vector, count);
break;

case RType::COMPLEX: //CPLXSXP
cpp11::stop("Matrix with complex numbers are not supported.");
break;

case RTypeId::BYTE: // RAWSXP
cpp11::stop("Matrix of type raw is not supported.");
break;

case RType::STRING: //STRSXP
if (experimental) {
D_ASSERT(result.GetType().id() == LogicalTypeId::POINTER);
AppendMatrixSegmentAtomic<SEXP, uintptr_t, DedupPointerEnumType>((SEXP *)DATAPTR_RO(source_data),
nrows, ncols, sexp_offset, child_vector, count);
} else {
AppendMatrixSegmentAtomic<SEXP, string_t, RStringSexpType>((SEXP *)DATAPTR_RO(source_data),
nrows, ncols, sexp_offset, child_vector, count);
}
break;

default:
cpp11::stop("AppendMatrixColumnSegment: Unsupported matrix type for scan");
}
}

// Forward declaration; the definition follows further down in this file.
static
void AppendAnyColumnSegment(const RType &rtype, bool experimental, data_ptr_t coldata_ptr, idx_t sexp_offset, Vector &v,
idx_t this_count);

static
void AppendStructColumnSegment(const RType &rtype, bool experimental, SEXP source_data, idx_t sexp_offset,
Vector &result, idx_t count) {
// No NULL values for STRUCTs.
Expand All @@ -120,6 +198,7 @@ void AppendStructColumnSegment(const RType &rtype, bool experimental, SEXP sourc
}
}

static
void AppendAnyColumnSegment(const RType &rtype, bool experimental, data_ptr_t coldata_ptr, idx_t sexp_offset, Vector &v,
idx_t this_count) {
switch (rtype.id()) {
Expand Down Expand Up @@ -253,6 +332,11 @@ void AppendAnyColumnSegment(const RType &rtype, bool experimental, data_ptr_t co
AppendListColumnSegment(rtype, data_ptr, sexp_offset, v, this_count);
break;
}
case RTypeId::MATRIX: {
auto data_ptr = (SEXP)coldata_ptr;
AppendMatrixColumnSegment(rtype, experimental, data_ptr, sexp_offset, v, this_count);
break;
}
case RTypeId::STRUCT: {
auto data_ptr = (SEXP)coldata_ptr;
AppendStructColumnSegment(rtype, experimental, data_ptr, sexp_offset, v, this_count);
Expand Down
50 changes: 46 additions & 4 deletions src/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,24 @@ using namespace duckdb;
// Default-constructed types are UNKNOWN. size_ is zeroed as well so that copying or comparing
// such an object never reads an indeterminate value.
RType::RType() : id_(RTypeId::UNKNOWN), size_(0) {
}

RType::RType(RTypeId id) : id_(id) {
RType::RType(RTypeId id) : id_(id), size_(0) {
}

RType::RType(const RType &other) : id_(other.id_), aux_(other.aux_) {
RType::RType(RTypeId id, R_len_t size) : id_(id), size_(size) {
}

RType::RType(RType &&other) noexcept : id_(other.id_), aux_(std::move(other.aux_)) {
RType::RType(const RType &other) : id_(other.id_), size_(other.size_), aux_(other.aux_) {
}

RType::RType(RType &&other) noexcept : id_(other.id_), size_(other.size_), aux_(std::move(other.aux_)) {
}

// Returns the type id stored in this RType.
RTypeId RType::id() const {
return id_;
}

bool RType::operator==(const RType &rhs) const {
return id_ == rhs.id_ && aux_ == rhs.aux_;
return id_ == rhs.id_ && size_ == rhs.size_ && aux_ == rhs.aux_;
}

RType RType::FACTOR(cpp11::strings levels) {
Expand Down Expand Up @@ -74,6 +77,22 @@ RType RType::GetListChildType() const {
return aux_.front().second;
}

// Creates a MATRIX type: `ncols` is stored as the type's size and `child` becomes the single,
// unnamed entry of the child list.
RType RType::MATRIX(const RType &child, R_len_t ncols) {
	RType matrix_type(RTypeId::MATRIX, ncols);
	matrix_type.aux_.push_back(std::make_pair("", child));
	return matrix_type;
}

// Returns a copy of the type held by the first child entry, which RType::MATRIX fills with the
// element type. Only valid for MATRIX types (checked by assertion).
RType RType::GetMatrixElementType() const {
	D_ASSERT(id_ == RTypeId::MATRIX);
	const auto &element_entry = aux_.front();
	return element_entry.second;
}

// Returns the stored size, which RType::MATRIX sets to the number of columns.
// Only valid for MATRIX types (checked by assertion).
R_len_t RType::GetMatrixNcols() const {
D_ASSERT(id_ == RTypeId::MATRIX);
return size_;
}

RType RType::STRUCT(child_list_t<RType> &&children) {
RType out = RType(RTypeId::STRUCT);
std::swap(out.aux_, children);
Expand Down Expand Up @@ -132,6 +151,23 @@ RType RApiTypes::DetectRType(SEXP v, bool integer64) {
}
} else if (Rf_isFactor(v) && TYPEOF(v) == INTSXP) {
return RType::FACTOR(GET_LEVELS(v));
} else if (Rf_isMatrix(v)) {
if (TYPEOF(v) == LGLSXP) {
return RType::MATRIX(RType::LOGICAL, Rf_ncols(v));
} else if (TYPEOF(v) == INTSXP) {
return RType::MATRIX(RType::INTEGER, Rf_ncols(v));
} else if (TYPEOF(v) == REALSXP) {
if (integer64 && Rf_inherits(v, "integer64")) {
return RType::MATRIX(RType::INTEGER64, Rf_ncols(v));
}
return RType::MATRIX(RType::NUMERIC, Rf_ncols(v));
} else if (TYPEOF(v) == CPLXSXP) {
return RType::MATRIX(RType::COMPLEX, Rf_ncols(v));
} else if (TYPEOF(v) == STRSXP) {
return RType::MATRIX(RType::STRING, Rf_ncols(v));
} else {
return RType::UNKNOWN;
}
} else if (TYPEOF(v) == LGLSXP) {
return RType::LOGICAL;
} else if (TYPEOF(v) == INTSXP) {
Expand All @@ -145,6 +181,8 @@ RType RApiTypes::DetectRType(SEXP v, bool integer64) {
return RType::NUMERIC;
} else if (TYPEOF(v) == STRSXP) {
return RType::STRING;
} else if (TYPEOF(v) == CPLXSXP) {
return RType::COMPLEX;
} else if (TYPEOF(v) == VECSXP) {
if (Rf_inherits(v, "blob")) {
return RType::BLOB;
Expand Down Expand Up @@ -211,6 +249,8 @@ LogicalType RApiTypes::LogicalTypeFromRType(const RType &rtype, bool experimenta
return LogicalType::DOUBLE;
case RType::INTEGER64:
return LogicalType::BIGINT;
case RType::COMPLEX:
return LogicalType::ARRAY(LogicalType::DOUBLE, 2);
case RTypeId::FACTOR: {
auto duckdb_levels = rtype.GetFactorLevels();
return LogicalType::ENUM(duckdb_levels, rtype.GetFactorLevelsCount());
Expand Down Expand Up @@ -244,6 +284,8 @@ LogicalType RApiTypes::LogicalTypeFromRType(const RType &rtype, bool experimenta
return LogicalType::BLOB;
case RTypeId::LIST:
return LogicalType::LIST(RApiTypes::LogicalTypeFromRType(rtype.GetListChildType(), experimental));
case RTypeId::MATRIX:
return LogicalType::ARRAY(RApiTypes::LogicalTypeFromRType(rtype.GetMatrixElementType(), experimental), rtype.GetMatrixNcols());
case RTypeId::STRUCT: {
child_list_t<LogicalType> children;
for (const auto &child : rtype.GetStructChildTypes()) {
Expand Down
3 changes: 3 additions & 0 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ R_len_t RApiTypes::GetVecSize(RType rtype, SEXP coldata) {
D_ASSERT(TYPEOF(coldata) == VECSXP);
coldata = VECTOR_ELT(coldata, 0);
}
if (rtype.id() == RTypeId::MATRIX) {
return Rf_nrows(coldata);
}
// This still isn't quite accurate, but good enough for the types we support.
return Rf_length(coldata);
}
Expand Down
11 changes: 11 additions & 0 deletions tests/testthat/_snaps/array.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,14 @@
Error in `duckdb_result()`:
! Use `dbConnect(array = "matrix")` to enable arrays to be returned to R.

# array errors when writing matrix of complex numbers

Code
dbWriteTable(con, "tbl", df)
Condition
Error in `duckdb_result()`:
! Matrix with complex numbers are not supported.
Error in `duckdb_result()`:
! rapi_execute: Failed to run query
Error: Invalid Error: std::exception

Loading
Loading