Skip to content

Commit bc9a296

Browse files
authored
Merge branch 'apache:main' into main
2 parents 7e4c3f1 + 6110f43 commit bc9a296

23 files changed

Lines changed: 1838 additions & 9 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,4 @@ __pycache__
5151
dev/release/.env
5252

5353
/.luarc.json
54+
.positai

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ repos:
3131
- id: mixed-line-ending
3232
args: [--fix=lf]
3333
- id: trailing-whitespace
34+
# R snapshot test files record test output verbatim and may legitimately contain trailing whitespace
35+
exclude: "_snaps"
3436

3537
- repo: https://github.com/codespell-project/codespell
3638
rev: v2.4.1

r/sedonadb/.Rbuildignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@
1212
^README\.Rmd$
1313
^bootstrap\.R$
1414
^\.lintr\.R$
15+
^\.positai$
16+
^\.claude$

r/sedonadb/NAMESPACE

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ S3method(print,"sedonadb::SedonaDBExprFactory__bundle")
3939
S3method(print,"sedonadb::SedonaDBExpr__bundle")
4040
S3method(print,SedonaDBExpr)
4141
S3method(print,sedonadb_dataframe)
42+
S3method(print,sedonadb_join_by)
43+
S3method(print,sedonadb_join_select)
44+
S3method(print,sedonadb_join_select_default)
4245
export(.fns)
4346
export(as_sd_expr)
4447
export(as_sedonadb_dataframe)
@@ -66,9 +69,14 @@ export(sd_expr_column)
6669
export(sd_expr_factory)
6770
export(sd_expr_literal)
6871
export(sd_expr_negative)
72+
export(sd_expr_parse_binary)
6973
export(sd_expr_scalar_function)
7074
export(sd_filter)
7175
export(sd_group_by)
76+
export(sd_join)
77+
export(sd_join_by)
78+
export(sd_join_select)
79+
export(sd_join_select_default)
7280
export(sd_preview)
7381
export(sd_read_parquet)
7482
export(sd_read_sf)

r/sedonadb/R/000-wrappers.R

Lines changed: 37 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/sedonadb/R/dataframe.R

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -460,22 +460,22 @@ sd_ungroup <- function(.data) {
460460
#' @param ... Aggregate expressions. These are evaluated in the same way as
461461
#' [dplyr::summarise()] except the outer expression must be an aggregate
462462
#' expression (e.g., `sum(x) + 1` is not currently possible).
463+
#' @param .env The environment passed to the expression context. Defaults to the calling environment; supply it explicitly for programmatic usage.
463464
#'
464465
#' @returns An object of class sedonadb_dataframe
465466
#' @export
466467
#'
467468
#' @examples
468469
#' data.frame(x = c(10:1, NA)) |> sd_summarise(x = sum(x, na.rm = TRUE))
469470
#'
470-
sd_summarise <- function(.data, ...) {
471+
sd_summarise <- function(.data, ..., .env = parent.frame()) {
471472
.data <- as_sedonadb_dataframe(.data)
472473

473474
expr_quos <- rlang::enquos(...)
474-
env <- parent.frame()
475475

476-
expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), env, ctx = .data$ctx)
476+
expr_ctx <- sd_expr_ctx(infer_nanoarrow_schema(.data), .env, ctx = .data$ctx)
477477
r_exprs <- expr_quos |> rlang::quos_auto_name() |> lapply(rlang::quo_get_expr)
478-
sd_exprs <- lapply(r_exprs, sd_eval_expr, expr_ctx = expr_ctx, env = env)
478+
sd_exprs <- lapply(r_exprs, sd_eval_expr, expr_ctx = expr_ctx)
479479

480480
# Ensure inputs are given aliases to account for the expected column name
481481
exprs_names <- names(r_exprs)
@@ -492,8 +492,88 @@ sd_summarise <- function(.data, ...) {
492492

493493
#' @rdname sd_summarise
494494
#' @export
495-
sd_summarize <- function(.data, ...) {
496-
sd_summarise(.data, ...)
495+
sd_summarize <- function(.data, ..., .env = parent.frame()) {
496+
sd_summarise(.data, ..., .env = .env)
497+
}
498+
499+
#' Join two SedonaDB DataFrames
500+
#'
501+
#' Perform a join operation between two dataframes. Use [sd_join_by()] to
502+
#' specify join conditions using `x$column` and `y$column` syntax to
503+
#' reference columns from the left and right tables respectively.
504+
#'
505+
#' @param x The left dataframe
506+
#' @param y The right dataframe (converted using the same context as `x`)
507+
#' @param by Join specification. One of:
508+
#' - A `sedonadb_join_by` object from [sd_join_by()]
509+
#' - A character vector of column names to join on in both tables
510+
#' - A named character vector mapping left-table column names to
511+
#' right-table column names, e.g. `c(x_val = "y_val")`
512+
#' - `NULL` for a natural join on columns with matching names
513+
#' @param join_type The type of join to perform. One of "inner", "left", "right",
514+
#' "full", "leftsemi", "rightsemi", "leftanti", "rightanti", "leftmark",
515+
#' or "rightmark".
516+
#' @param select Post-join column selection. One of
517+
#' - `NULL` for no modification, which may result in duplicate (unqualified)
518+
#' column names. Such columns may still be
519+
#' referred to with a qualifier in advanced usage using [sd_expr_column()].
520+
#' - [sd_join_select_default()] for dplyr-like behaviour (equi-join keys
521+
#' removed, intersecting names suffixed)
522+
#' - [sd_join_select()] for a custom selection
523+
#'
524+
#' @returns An object of class sedonadb_dataframe
525+
#' @export
526+
#'
527+
#' @examples
528+
#' df1 <- data.frame(x = letters[1:10], y = 1:10)
529+
#' df2 <- data.frame(y = 10:1, z = LETTERS[1:10])
530+
#' df1 |> sd_join(df2)
531+
#'
532+
sd_join <- function(
533+
x,
534+
y,
535+
by = NULL,
536+
join_type = "inner",
537+
select = sd_join_select_default()
538+
) {
539+
x <- as_sedonadb_dataframe(x)
540+
y <- as_sedonadb_dataframe(y, ctx = x$ctx)
541+
542+
x_schema <- infer_nanoarrow_schema(x)
543+
y_schema <- infer_nanoarrow_schema(y)
544+
join_expr_ctx <- sd_join_expr_ctx(x_schema, y_schema, ctx = x$ctx)
545+
join_conditions <- sd_build_join_conditions(join_expr_ctx, by, ctx = x$ctx)
546+
547+
df <- x$df$join(y$df, join_conditions, join_type, left_alias = "x", right_alias = "y")
548+
out <- new_sedonadb_dataframe(x$ctx, df)
549+
550+
# Apply post-join column selection if needed
551+
if (is.null(select)) {
552+
projection <- NULL
553+
} else if (inherits(select, "sedonadb_join_select_default")) {
554+
# Default select: remove duplicate equijoin keys, apply suffixes
555+
projection <- sd_build_default_select(
556+
join_expr_ctx,
557+
join_conditions,
558+
select$suffix,
559+
join_type
560+
)
561+
} else if (inherits(select, "sedonadb_join_select")) {
562+
# Custom select: evaluate user expressions
563+
projection <- sd_eval_join_select_exprs(select, join_expr_ctx)
564+
} else {
565+
stop(
566+
"`select` must be NULL, sd_join_select_default(), or sd_join_select()",
567+
call. = FALSE
568+
)
569+
}
570+
571+
# A NULL projection (from `select = NULL` or returned by the helpers above) means no extra projection is needed
572+
if (is.null(projection)) {
573+
out
574+
} else {
575+
sd_transmute(out, !!!projection)
576+
}
497577
}
498578

499579
#' Write DataFrame to (Geo)Parquet files

r/sedonadb/R/expression.R

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,19 @@ sd_expr_alias <- function(expr, alias, factory = sd_expr_factory()) {
126126
expr$alias(alias)
127127
}
128128

129+
#' @rdname sd_expr_column
130+
#' @export
131+
sd_expr_parse_binary <- function(expr) {
132+
result <- expr$parse_binary()
133+
if (is.null(result)) {
134+
return(NULL)
135+
}
136+
137+
result$left <- .savvy_wrap_SedonaDBExpr(result$left)
138+
result$right <- .savvy_wrap_SedonaDBExpr(result$right)
139+
result
140+
}
141+
129142
#' @rdname sd_expr_column
130143
#' @export
131144
as_sd_expr <- function(x, factory = sd_expr_factory()) {
@@ -321,6 +334,11 @@ sd_expr_ctx <- function(schema = NULL, env = parent.frame(), ctx = NULL) {
321334

322335
schema <- nanoarrow::as_nanoarrow_schema(schema)
323336
data_names <- as.character(names(schema$children))
337+
338+
# Duplicate names can't be referred to with the mask. We could install these
339+
# as active bindings that raise an informative error when they are referenced.
340+
ambiguous_names <- unique(data_names[duplicated(data_names)])
341+
data_names <- setdiff(data_names, ambiguous_names)
324342
data <- lapply(data_names, sd_expr_column)
325343
names(data) <- data_names
326344

0 commit comments

Comments
 (0)