GH-39811: [R] better documentation for col_types argument in open_delim_dataset (#45719)

atsyplenkov · thisisnic · web-flow · commit d9ee741b19c9 · 2025-04-11T10:05:28.000+01:00
### Rationale for this change Hi, can you please consider this tiny update to the docs? In the current documentation, it's misleading how to specify col_types when a delimited file is scanned using `open_csv_dataset`, `open_delim_dataset`, etc. Reading what is currently written, one may assume that they can declare column types by providing the compact string representation that `readr` uses. https://github.com/apache/arrow/blob/3c8fe098c7f5e0e40bd06bc6afca8412eb81f56e/r/man/open_delim_dataset.Rd#L164-L165 But it doesn't work. See reprex below ```r library(arrow) #> #> Attaching package: 'arrow' #> The following object is masked from 'package:utils': #> #> timestamp tf <- tempfile() dir.create(tf) df <- data.frame(x = c("1", "2", "NULL")) file_path <- file.path(tf, "file1.txt") write.table(df, file_path, sep = ",", row.names = FALSE) open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "c") #> Error: #> ! Unsupported `col_types` specification. #> ℹ `col_types` must be NULL, or a <Schema>. unlink(tf) ``` ### What changes are included in this PR? The current PR provides a clearer explanation of what should be passed to the `col_types` argument, along with a basic example for the `open_csv_dataset()`. ### Are these changes tested? Not needed, as only the R documentation has been updated ### Are there any user-facing changes? Only the R documentation has been updated Lead-authored-by: Anatolii Tsyplenkov <atsyplenkov@fastmail.com> Co-authored-by: Nic Crane <thisisnic@gmail.com> Signed-off-by: Nic Crane <thisisnic@gmail.com>
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -842,35 +842,7 @@ readr_to_csv_convert_options <- function(na,
   include_columns <- character()
 
   if (is.character(col_types)) {
-    if (length(col_types) != 1L) {
-      abort("`col_types` is a character vector that is not of size 1")
-    }
-    n <- nchar(col_types)
-    specs <- substring(col_types, seq_len(n), seq_len(n))
-    if (!is_bare_character(col_names, n)) {
-      abort("Compact specification for `col_types` requires `col_names`")
-    }
-
-    col_types <- set_names(nm = col_names, map2(specs, col_names, ~ {
-      switch(.x,
-        "c" = utf8(),
-        "i" = int32(),
-        "n" = float64(),
-        "d" = float64(),
-        "l" = bool(),
-        "f" = dictionary(),
-        "D" = date32(),
-        "T" = timestamp(unit = "ns"),
-        "t" = time32(),
-        "_" = null(),
-        "-" = null(),
-        "?" = NULL,
-        abort("Unsupported compact specification: '", .x, "' for column '", .y, "'")
-      )
-    }))
-    # To "guess" types, omit them from col_types
-    col_types <- keep(col_types, ~ !is.null(.x))
-    col_types <- schema(col_types)
+    col_types <- parse_compact_col_spec(col_types, col_names)
   }
 
   if (!is.null(col_types)) {
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
@@ -191,7 +191,6 @@ JsonFileFormat$create <- function(...) {
 #' @export
 CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
 CsvFileFormat$create <- function(..., partitioning = NULL) {
-
   dots <- list(...)
 
   options <- check_csv_file_format_args(dots, partitioning = partitioning)
@@ -202,7 +201,6 @@ CsvFileFormat$create <- function(..., partitioning = NULL) {
 
 # Check all arguments are valid
 check_csv_file_format_args <- function(args, partitioning = NULL) {
-
   options <- list(
     parse_options = args$parse_options,
     convert_options = args$convert_options,
@@ -220,18 +218,24 @@ check_csv_file_format_args <- function(args, partitioning = NULL) {
     options$parse_options <- do.call(csv_parse_options, args$parse_options)
   }
 
-  if (is.null(args$convert_options)) {
-    options$convert_options <- do.call(csv_file_format_convert_opts, args)
-  } else if (is.list(args$convert_options)) {
-    options$convert_options <- do.call(csv_convert_options, args$convert_options)
-  }
-
+  # Set up read_options before convert_options since convert_options needs column names
   if (is.null(args$read_options)) {
     options$read_options <- do.call(csv_file_format_read_opts, c(args, list(partitioning = partitioning)))
   } else if (is.list(args$read_options)) {
     options$read_options <- do.call(csv_read_options, args$read_options)
   }
 
+  # If col_names is provided, add it to read_options
+  if ("col_names" %in% names(args)) {
+    args$read_options <- list(col_names = args$col_names)
+  }
+
+  if (is.null(args$convert_options)) {
+    options$convert_options <- do.call(csv_file_format_convert_opts, c(args, list(read_options = options$read_options)))
+  } else if (is.list(args$convert_options)) {
+    options$convert_options <- do.call(csv_convert_options, args$convert_options)
+  }
+
   options
 }
 
@@ -458,11 +462,32 @@ csv_file_format_convert_opts <- function(...) {
     opts[["quoted_na"]] <- NULL
   }
 
+  # Handle readr-style col_types specification
+  if ("col_types" %in% names(opts) && is.character(opts[["col_types"]])) {
+    # Get column names from read_options if available
+    col_names <- if (!is.null(opts[["read_options"]])) {
+      if (!is.null(opts[["read_options"]]$column_names)) {
+        opts[["read_options"]]$column_names
+      } else if (!is.null(opts[["read_options"]]$col_names)) {
+        opts[["read_options"]]$col_names
+      } else {
+        abort("Compact specification for `col_types` requires column names in read_options")
+      }
+    } else if ("col_names" %in% names(opts)) {
+      opts[["col_names"]]
+    } else {
+      abort("Compact specification for `col_types` requires column names")
+    }
+
+    opts[["col_types"]] <- parse_compact_col_spec(opts[["col_types"]], col_names)
+  }
+
+  # Remove read_options from opts before calling csv_convert_options
+  opts[["read_options"]] <- NULL
   do.call(csv_convert_options, opts)
 }
 
 csv_file_format_read_opts <- function(schema = NULL, partitioning = NULL, ...) {
-
   opts <- list(...)
   # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
   arrow_opts <- c(names(formals(csv_parse_options)), "parse_options")
diff --git a/r/R/dataset.R b/r/R/dataset.R
@@ -255,6 +255,8 @@ open_dataset <- function(sources,
 #'
 #' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
 #' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = schema(list(x = int32())))
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_types = "i", col_names = "y", skip = 1)
 #'
 #' unlink(tf)
 #' @seealso [open_dataset()]
diff --git a/r/R/util.R b/r/R/util.R
@@ -248,3 +248,50 @@ check_named_cols <- function(df) {
     )
   }
 }
+
+#' Parse a compact column type specification into an Arrow schema
+#'
+#' @param col_types A single string in which each character gives the type of
+#' one column, in the style of readr's compact `col_types` (for example `"ci"`)
+#' @param col_names Character vector of column names, with exactly one name per
+#' character of `col_types`
+#' @return A Schema object; columns whose spec is `?` are left out of it
+#'
+#' @examples
+#' parse_compact_col_spec("ci", col_names = c("x", "y"))
+#'
+#' @keywords internal
+parse_compact_col_spec <- function(col_types, col_names) {
+  if (length(col_types) != 1L) {
+    abort("`col_types` must be a character vector of size 1")
+  }
+  n <- nchar(col_types) # one spec character per column
+  specs <- substring(col_types, seq_len(n), seq_len(n)) # split into single characters
+
+  if (!is_bare_character(col_names, n)) {
+    abort("Compact specification for `col_types` requires `col_names` of matching length")
+  }
+
+  col_types <- set_names(nm = col_names, map2(specs, col_names, ~ col_type_from_compact(.x, .y)))
+  # A NULL entry (spec "?") means "guess this type": drop it before building the schema
+  col_types <- keep(col_types, ~ !is.null(.x))
+  schema(col_types)
+}
+
+col_type_from_compact <- function(x, y) { # x: one spec character; y: its column name
+  switch(x,
+    "c" = utf8(),
+    "i" = int32(),
+    "n" = float64(), # "n" and "d" both give float64()
+    "d" = float64(),
+    "l" = bool(),
+    "f" = dictionary(),
+    "D" = date32(),
+    "T" = timestamp(unit = "ns"),
+    "t" = time32(),
+    "_" = null(), # "_" and "-" both give the null() type
+    "-" = null(),
+    "?" = NULL, # R NULL rather than a type: no type is returned for "?"
+    abort(paste0("Unsupported compact specification: '", x, "' for column '", y, "'")) # any other character
+  )
+}
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
@@ -523,6 +523,21 @@ test_that("open_delim_dataset params passed through to open_dataset", {
   ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema)
   expect_equal(ds_strings$schema, schema(a = string(), b = string()))
 
+  # col_types - as compact schema
+  compact_schema <- schema(
+    int = int32(), dbl = float64(), lgl = bool(), chr = utf8(),
+    fct = dictionary(), ts = timestamp(unit = "ns")
+  )
+
+  ds <- open_csv_dataset(
+    csv_dir,
+    col_names = c("int", "dbl", "lgl", "chr", "fct", "ts"),
+    col_types = "idlcfT",
+    skip = 1
+  )
+
+  expect_equal(schema(ds), compact_schema)
+
   # skip_empty_rows
   tf <- tempfile()
   writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf)
@@ -553,7 +568,7 @@ test_that("open_delim_dataset params passed through to open_dataset", {
   ds <- open_csv_dataset(
     csv_dir,
     schema = schema(
-      int = int64(), dbl = int64(), lgl = bool(), chr = utf8(),
+      int = int64(), dbl = float64(), lgl = bool(), chr = utf8(),
       fct = utf8(), ts = timestamp(unit = "s")
     ),
     skip = 1
diff --git a/r/tests/testthat/test-util.R b/r/tests/testthat/test-util.R
@@ -70,3 +70,34 @@ test_that("all_funs() identifies namespace-qualified and unqualified functions",
     c("other_fun", "fun", "sum", "base::log")
   )
 })
+
+test_that("parse_compact_col_spec() converts string specs to schema", {
+  compact_schema <- parse_compact_col_spec(
+    col_types = "cidlDTtf_-?", # one spec character per column name below
+    col_names = c("c", "i", "d", "l", "D", "T", "t", "f", "_", "-", "?")
+  )
+
+  expect_equal(
+    compact_schema,
+    schema(
+      c = utf8(), i = int32(), d = float64(), l = bool(), D = date32(),
+      T = timestamp(unit = "ns"), t = time32(unit = "ms"), f = dictionary(),
+      `_` = null(), `-` = null() # the column given as "?" is absent from the expected schema
+    )
+  )
+
+  expect_error( # col_types given as two strings instead of one
+    parse_compact_col_spec(c("i", "d"), c("a", "b")),
+    "`col_types` must be a character vector of size 1"
+  )
+
+  expect_error( # three spec characters but only two column names
+    parse_compact_col_spec("idc", c("a", "b")),
+    "Compact specification for `col_types` requires `col_names` of matching length"
+  )
+
+  expect_error( # "y" is not one of the handled spec characters
+    parse_compact_col_spec("y", "a"),
+    "Unsupported compact specification: 'y' for column 'a'"
+  )
+})