|
22 | 22 |
|
23 | 23 | #' Read and write fst files. |
24 | 24 | #' |
25 | | -#' Read and write data frames from and to a fast-storage (fst) file. |
| 25 | +#' Read and write data frames from and to a fast-storage (`fst`) file. |
26 | 26 | #' Allows for compression and (file level) random access of stored data, even for compressed datasets. |
27 | | -#' When using a \code{data.table} object for \code{x}, the key (if any) is preserved, |
| 27 | +#' Multiple threads are used to obtain high (de-)serialization speeds but all background threads are |
| 28 | +#' re-joined before `write_fst` and `read_fst` return (reads and writes are stable). |
| 29 | +#' When using a `data.table` object for `x`, the key (if any) is preserved, |
28 | 30 | #' allowing storage of sorted data. |
29 | | -#' Methods \code{read_fst} and \code{write_fst} are equivalent to \code{read.fst} and \code{write.fst} (but the |
| 31 | +#' Methods `read_fst` and `write_fst` are equivalent to `read.fst` and `write.fst` (but the |
30 | 32 | #' former syntax is preferred). |
31 | 33 | #' |
32 | 34 | #' @param x a data frame to write to disk |
33 | 35 | #' @param path path to fst file |
34 | 36 | #' @param compress value in the range 0 to 100, indicating the amount of compression to use. |
35 | | -#' Lower values mean larger file sizes. |
36 | | -#' @param uniform_encoding If TRUE, all character vectors will be assumed to have elements with equal encoding. |
| 37 | +#' Lower values mean larger file sizes. The default compression is set to 50. |
| 38 | +#' @param uniform_encoding If `TRUE`, all character vectors will be assumed to have elements with equal encoding. |
37 | 39 | #' The encoding (latin1, UTF8 or native) of the first non-NA element will be used as the encoding for the whole column. |
38 | 40 | #' This will be a correct assumption for most use cases. |
39 | | -#' If \code{uniform.encoding} is set to FALSE, no such assumption will be made and all elements will be converted |
| 41 | +#' If `uniform_encoding` is set to `FALSE`, no such assumption will be made and all elements will be converted |
40 | 42 | #' to the same encoding. The latter is a relatively expensive operation and will reduce write performance for |
41 | 43 | #' character columns. |
42 | | -#' @return \code{read_fst} returns a data frame with the selected columns and rows. \code{read_fst} |
43 | | -#' invisibly returns \code{x} (so you can use this function in a pipeline). |
| 44 | +#' @return `read_fst` returns a data frame with the selected columns and rows. `write_fst` |
| 45 | +#' invisibly returns `x` (so you can use this function in a pipeline). |
44 | 46 | #' @examples |
45 | 47 | #' # Sample dataset |
46 | 48 | #' x <- data.frame(A = 1:10000, B = sample(c(TRUE, FALSE, NA), 10000, replace = TRUE)) |
47 | 49 | #' |
48 | | -#' # Uncompressed |
49 | | -#' write_fst(x, "dataset.fst") # filesize: 41 KB |
50 | | -#' y <- read_fst("dataset.fst") # read uncompressed data |
| 50 | +#' # Default compression |
| 51 | +#' write_fst(x, "dataset.fst") # filesize: 17 KB |
| 52 | +#' y <- read_fst("dataset.fst") # read fst file |
51 | 53 | #' |
52 | | -#' # Compressed |
| 54 | +#' # Maximum compression |
53 | 55 | #' write_fst(x, "dataset.fst", 100) # filesize: 4 KB |
54 | | -#' y <- read_fst("dataset.fst") # read compressed data |
| 56 | +#' y <- read_fst("dataset.fst") # read fst file |
55 | 57 | #' |
56 | 58 | #' # Random access |
57 | 59 | #' y <- read_fst("dataset.fst", "B") # read selection of columns |
58 | 60 | #' y <- read_fst("dataset.fst", "A", 100, 200) # read selection of columns and rows |
59 | 61 | #' @export |
| 62 | +#' @md |
60 | 63 | write_fst <- function(x, path, compress = 50, uniform_encoding = TRUE) { |
61 | 64 | if (!is.character(path)) stop("Please specify a correct path.") |
62 | 65 |
|
@@ -156,7 +159,7 @@ print.fstmetadata <- function(x, ...) { |
156 | 159 | #' |
157 | 160 | #' @export |
158 | 161 | read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = FALSE, old_format = FALSE) { |
159 | | - fileName <- normalizePath(path, mustWork = TRUE) |
| 162 | + fileName <- normalizePath(path, mustWork = FALSE) |
160 | 163 |
|
161 | 164 | if (!is.null(columns)) { |
162 | 165 | if (!is.character(columns)) { |
@@ -200,8 +203,21 @@ read_fst <- function(path, columns = NULL, from = 1, to = NULL, as.data.table = |
200 | 203 | return(res) |
201 | 204 | } |
202 | 205 |
|
203 | | - as.data.frame(res$resTable, row.names = NULL, stringsAsFactors = FALSE, |
204 | | - optional = TRUE) |
| 206 | + # use setters from data.table to improve performance |
| 207 | + if (requireNamespace("data.table")) { |
| 208 | + |
| 209 | + data.table::setattr(res$resTable, "class", "data.frame") |
| 210 | + data.table::setattr(res$resTable, "row.names", 1:length(res$resTable[[1]])) |
| 211 | + |
| 212 | + return(res$resTable) |
| 213 | + } |
| 214 | + |
| 215 | + res_table <- res$resTable |
| 216 | + |
| 217 | + class(res_table) <- "data.frame" |
| 218 | + attr(res_table, "row.names") <- 1:length(res$resTable[[1]]) |
| 219 | + |
| 220 | + res_table |
205 | 221 | } |
206 | 222 |
|
207 | 223 |
|
|
0 commit comments