Skip to content

Commit d9ee741

Browse files
GH-39811: [R] better documentation for col_types argument in open_delim_dataset (#45719)
### Rationale for this change Hi, can you please consider this tiny update to the docs? In the current documentation, it's misleading how to specify col_types when a delimited file is scanned using `open_csv_dataset`, `open_delim_dataset`, etc. Reading what is currently written, one may assume that they can declare column types by providing the compact string representation that `readr` uses. https://github.com/apache/arrow/blob/3c8fe098c7f5e0e40bd06bc6afca8412eb81f56e/r/man/open_delim_dataset.Rd#L164-L165 But it doesn't work. See reprex below ```r library(arrow) #> #> Attaching package: 'arrow' #> The following object is masked from 'package:utils': #> #> timestamp tf <- tempfile() dir.create(tf) df <- data.frame(x = c("1", "2", "NULL")) file_path <- file.path(tf, "file1.txt") write.table(df, file_path, sep = ",", row.names = FALSE) open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "c") #> Error: #> ! Unsupported `col_types` specification. #> ℹ `col_types` must be NULL, or a <Schema>. unlink(tf) ``` ### What changes are included in this PR? The current PR provides a clearer explanation of what should be passed to the `col_types` argument, along with a basic example for the `open_csv_dataset()`. ### Are these changes tested? Not needed, as only the R documentation has been updated ### Are there any user-facing changes? Only the R documentation has been updated Lead-authored-by: Anatolii Tsyplenkov <[email protected]> Co-authored-by: Nic Crane <[email protected]> Signed-off-by: Nic Crane <[email protected]>
1 parent fd1919f commit d9ee741

File tree

7 files changed

+133
-41
lines changed

7 files changed

+133
-41
lines changed

r/R/csv.R

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -842,35 +842,7 @@ readr_to_csv_convert_options <- function(na,
842842
include_columns <- character()
843843

844844
if (is.character(col_types)) {
845-
if (length(col_types) != 1L) {
846-
abort("`col_types` is a character vector that is not of size 1")
847-
}
848-
n <- nchar(col_types)
849-
specs <- substring(col_types, seq_len(n), seq_len(n))
850-
if (!is_bare_character(col_names, n)) {
851-
abort("Compact specification for `col_types` requires `col_names`")
852-
}
853-
854-
col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
855-
switch(.x,
856-
"c" = utf8(),
857-
"i" = int32(),
858-
"n" = float64(),
859-
"d" = float64(),
860-
"l" = bool(),
861-
"f" = dictionary(),
862-
"D" = date32(),
863-
"T" = timestamp(unit = "ns"),
864-
"t" = time32(),
865-
"_" = null(),
866-
"-" = null(),
867-
"?" = NULL,
868-
abort("Unsupported compact specification: '", .x, "' for column '", .y, "'")
869-
)
870-
}))
871-
# To "guess" types, omit them from col_types
872-
col_types <- keep(col_types, ~ !is.null(.x))
873-
col_types <- schema(col_types)
845+
col_types <- parse_compact_col_spec(col_types, col_names)
874846
}
875847

876848
if (!is.null(col_types)) {

r/R/dataset-format.R

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,6 @@ JsonFileFormat$create <- function(...) {
191191
#' @export
192192
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
193193
CsvFileFormat$create <- function(..., partitioning = NULL) {
194-
195194
dots <- list(...)
196195

197196
options <- check_csv_file_format_args(dots, partitioning = partitioning)
@@ -202,7 +201,6 @@ CsvFileFormat$create <- function(..., partitioning = NULL) {
202201

203202
# Check all arguments are valid
204203
check_csv_file_format_args <- function(args, partitioning = NULL) {
205-
206204
options <- list(
207205
parse_options = args$parse_options,
208206
convert_options = args$convert_options,
@@ -220,18 +218,24 @@ check_csv_file_format_args <- function(args, partitioning = NULL) {
220218
options$parse_options <- do.call(csv_parse_options, args$parse_options)
221219
}
222220

223-
if (is.null(args$convert_options)) {
224-
options$convert_options <- do.call(csv_file_format_convert_opts, args)
225-
} else if (is.list(args$convert_options)) {
226-
options$convert_options <- do.call(csv_convert_options, args$convert_options)
227-
}
228-
221+
# Set up read_options before convert_options since convert_options needs column names
229222
if (is.null(args$read_options)) {
230223
options$read_options <- do.call(csv_file_format_read_opts, c(args, list(partitioning = partitioning)))
231224
} else if (is.list(args$read_options)) {
232225
options$read_options <- do.call(csv_read_options, args$read_options)
233226
}
234227

228+
# If col_names is provided, add it to read_options
229+
if ("col_names" %in% names(args)) {
230+
args$read_options <- list(col_names = args$col_names)
231+
}
232+
233+
if (is.null(args$convert_options)) {
234+
options$convert_options <- do.call(csv_file_format_convert_opts, c(args, list(read_options = options$read_options)))
235+
} else if (is.list(args$convert_options)) {
236+
options$convert_options <- do.call(csv_convert_options, args$convert_options)
237+
}
238+
235239
options
236240
}
237241

@@ -458,11 +462,32 @@ csv_file_format_convert_opts <- function(...) {
458462
opts[["quoted_na"]] <- NULL
459463
}
460464

465+
# Handle readr-style col_types specification
466+
if ("col_types" %in% names(opts) && is.character(opts[["col_types"]])) {
467+
# Get column names from read_options if available
468+
col_names <- if (!is.null(opts[["read_options"]])) {
469+
if (!is.null(opts[["read_options"]]$column_names)) {
470+
opts[["read_options"]]$column_names
471+
} else if (!is.null(opts[["read_options"]]$col_names)) {
472+
opts[["read_options"]]$col_names
473+
} else {
474+
abort("Compact specification for `col_types` requires column names in read_options")
475+
}
476+
} else if ("col_names" %in% names(opts)) {
477+
opts[["col_names"]]
478+
} else {
479+
abort("Compact specification for `col_types` requires column names")
480+
}
481+
482+
opts[["col_types"]] <- parse_compact_col_spec(opts[["col_types"]], col_names)
483+
}
484+
485+
# Remove read_options from opts before calling csv_convert_options
486+
opts[["read_options"]] <- NULL
461487
do.call(csv_convert_options, opts)
462488
}
463489

464490
csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) {
465-
466491
opts <- list(...)
467492
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
468493
arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")

r/R/dataset.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ open_dataset <- function(sources,
255255
#'
256256
#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
257257
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
258+
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = schema(list(x = int32())))
259+
#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i", col_names = "y", skip = 1)
258260
#'
259261
#' unlink(tf)
260262
#' @seealso [open_dataset()]

r/R/util.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,50 @@ check_named_cols <- function(df) {
248248
)
249249
}
250250
}
251+
252+
#' Parse a compact column type specification into Arrow schema
#'
#' Splits a readr-style compact string (one character per column) into
#' individual type codes, pairs each code with the corresponding column name,
#' and builds a schema from the columns whose type was given explicitly.
#'
#' @param col_types A single character string where each character represents
#' a column type, like in readr
#' @param col_names Character vector of column names (must match the length of
#' col_types characters)
#' @return A Schema object. Columns whose code is `"?"` are omitted from it so
#' that their type is not fixed by the schema.
#'
#' @examples
#' parse_compact_col_spec("ci", col_names = c("x", "y"))
#'
#' @keywords internal
parse_compact_col_spec <- function(col_types, col_names) {
  # Reject anything that cannot be split into per-column codes: vectors of the
  # wrong size, non-character input, and a missing string.
  if (!is.character(col_types) || length(col_types) != 1L || is.na(col_types)) {
    abort("`col_types` must be a character vector of size 1")
  }

  # One single-character code per column
  n <- nchar(col_types)
  specs <- substring(col_types, seq_len(n), seq_len(n))

  if (!is_bare_character(col_names, n)) {
    abort("Compact specification for `col_types` requires `col_names` of matching length")
  }

  col_types <- set_names(
    map2(specs, col_names, col_type_from_compact),
    nm = col_names
  )
  # To "guess" types, omit them from col_types
  col_types <- keep(col_types, ~ !is.null(.x))
  schema(col_types)
}
280+
281+
# Translate one compact type code into an Arrow data type.
#
# `x` is the single-character code and `y` is the name of the column it
# applies to (only used in the error message). Returns the data type for the
# code, or NULL for "?"; parse_compact_col_spec() drops NULL entries so those
# columns are left out of the schema. Any other code aborts.
col_type_from_compact <- function(x, y) {
  # "?" carries no explicit type
  if (identical(x, "?")) {
    return(NULL)
  }

  # Constructors are stored uncalled so only the requested type is created
  constructors <- list(
    c = utf8,
    i = int32,
    n = float64,
    d = float64,
    l = bool,
    f = dictionary,
    D = date32,
    T = function() timestamp(unit = "ns"),
    t = time32,
    `_` = null,
    `-` = null
  )

  make_type <- constructors[[x]]
  if (is.null(make_type)) {
    abort(paste0("Unsupported compact specification: '", x, "' for column '", y, "'"))
  }
  make_type()
}

r/man/open_delim_dataset.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/tests/testthat/test-dataset-csv.R

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,21 @@ test_that("open_delim_dataset params passed through to open_dataset", {
523523
ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema)
524524
expect_equal(ds_strings$schema, schema(a = string(), b = string()))
525525

526+
# col_types - as compact schema
527+
compact_schema <- schema(
528+
int = int32(), dbl = float64(), lgl = bool(), chr = utf8(),
529+
fct = dictionary(), ts = timestamp(unit = "ns")
530+
)
531+
532+
ds <- open_csv_dataset(
533+
csv_dir,
534+
col_names = c("int", "dbl", "lgl", "chr", "fct", "ts"),
535+
col_types = "idlcfT",
536+
skip = 1
537+
)
538+
539+
expect_equal(schema(ds), compact_schema)
540+
526541
# skip_empty_rows
527542
tf <- tempfile()
528543
writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf)
@@ -553,7 +568,7 @@ test_that("open_delim_dataset params passed through to open_dataset", {
553568
ds <- open_csv_dataset(
554569
csv_dir,
555570
schema = schema(
556-
int = int64(), dbl = int64(), lgl = bool(), chr = utf8(),
571+
int = int64(), dbl = float64(), lgl = bool(), chr = utf8(),
557572
fct = utf8(), ts = timestamp(unit = "s")
558573
),
559574
skip = 1

r/tests/testthat/test-util.R

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,34 @@ test_that("all_funs() identifies namespace-qualified and unqualified functions",
7070
c("other_fun", "fun", "sum", "base::log")
7171
)
7272
})
73+
74+
test_that("parse_compact_col_spec() converts string specs to schema", {
  # One column per supported code. `n` and `d` both map to float64(), and the
  # trailing `?` column is expected to be absent from the resulting schema.
  compact_schema <- parse_compact_col_spec(
    col_types = "cindlDTtf_-?",
    col_names = c("c", "i", "n", "d", "l", "D", "T", "t", "f", "_", "-", "?")
  )

  expect_equal(
    compact_schema,
    schema(
      c = utf8(), i = int32(), n = float64(), d = float64(), l = bool(),
      D = date32(), T = timestamp(unit = "ns"), t = time32(unit = "ms"),
      f = dictionary(), `_` = null(), `-` = null()
    )
  )

  # More than one spec string
  expect_error(
    parse_compact_col_spec(c("i", "d"), c("a", "b")),
    "`col_types` must be a character vector of size 1"
  )

  # Number of codes does not match the number of column names
  expect_error(
    parse_compact_col_spec("idc", c("a", "b")),
    "Compact specification for `col_types` requires `col_names` of matching length"
  )

  # Unknown type code
  expect_error(
    parse_compact_col_spec("y", "a"),
    "Unsupported compact specification: 'y' for column 'a'"
  )
})

0 commit comments

Comments
 (0)