Skip to content

Commit 7c93517

Browse files
committed
Use air
1 parent fc28fbc commit 7c93517

36 files changed

+1462
-855
lines changed

.Rbuildignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@
1111
^_pkgdown.yml$
1212
^vignettes/articles$
1313
^src/Makevars$
14+
^[\.]?air\.toml$
15+
^\.vscode$

.vscode/extensions.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"recommendations": [
3+
"Posit.air-vscode"
4+
]
5+
}

.vscode/settings.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,9 @@
55
},
66
"clangd.arguments": [
77
"-header-insertion=never"
8-
]
8+
],
9+
"[r]": {
10+
"editor.formatOnSave": true,
11+
"editor.defaultFormatter": "Posit.air-vscode"
12+
}
913
}

R/arrow-schema.R

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,14 @@ read_arrow_schema <- function(file) {
77
}
88
}
99

10-
apply_arrow_schema <- function(tab, file, arrow_schema, dicts, types,
11-
col_select) {
10+
apply_arrow_schema <- function(
11+
tab,
12+
file,
13+
arrow_schema,
14+
dicts,
15+
types,
16+
col_select
17+
) {
1218
if (is.na(arrow_schema)) {
1319
return(tab)
1420
}
@@ -19,7 +25,9 @@ apply_arrow_schema <- function(tab, file, arrow_schema, dicts, types,
1925
}
2026
for (idx in spec$difftime) {
2127
# only if INT64, otherwise hms, probably
22-
if (types[[idx]] != 2) next
28+
if (types[[idx]] != 2) {
29+
next
30+
}
2331
mult <- switch(
2432
spec$columns$type[[idx]]$unit,
2533
SECOND = 1,
@@ -37,10 +45,13 @@ arrow_find_special <- function(asch, file, col_select = NULL) {
3745
amd <- tryCatch(
3846
parse_arrow_schema(asch)$columns,
3947
error = function(e) {
40-
warning(sprintf(
41-
"Failed to parse Arrow schema from parquet file at '%s'",
42-
file
43-
), call. = TRUE)
48+
warning(
49+
sprintf(
50+
"Failed to parse Arrow schema from parquet file at '%s'",
51+
file
52+
),
53+
call. = TRUE
54+
)
4455
NULL
4556
}
4657
)
@@ -98,7 +109,6 @@ float_precision_names <- c(
98109
date_unit_names <- c(
99110
DAY = 0L,
100111
MILLISECOND = 1L
101-
102112
)
103113

104114
time_unit_names <- c(
@@ -174,8 +184,12 @@ encode_arrow_schema_r <- function(df, schema) {
174184
dates <- vapply(df, function(c) inherits(c, "Date"), logical(1))
175185
hmss <- vapply(df, function(c) inherits(c, "hms"), logical(1))
176186
psxcts <- vapply(df, function(c) inherits(c, "POSIXct"), logical(1))
177-
fctrs <- vapply(df, function(c) inherits(c, "factor"), logical(1))
178-
dfts <- vapply(df, function(c) !inherits(c, "hms") && inherits(c, "difftime"), logical(1))
187+
fctrs <- vapply(df, function(c) inherits(c, "factor"), logical(1))
188+
dfts <- vapply(
189+
df,
190+
function(c) !inherits(c, "hms") && inherits(c, "difftime"),
191+
logical(1)
192+
)
179193
typemap <- c(
180194
"integer" = "Int",
181195
"double" = "FloatingPoint",
@@ -233,7 +247,8 @@ encode_arrow_schema_r <- function(df, schema) {
233247

234248
# Replace strings with numeric IDs, so we can use them in C++
235249
fill_arrow_schema_enums_type <- function(type_type, type) {
236-
switch(type_type,
250+
switch(
251+
type_type,
237252
"FloatingPoint" = {
238253
type$precision <- float_precision_names[type$precision]
239254
},
@@ -260,7 +275,7 @@ fill_arrow_schema_enums_type <- function(type_type, type) {
260275

261276
fill_arrow_schema_enums_dict <- function(dict) {
262277
if (!is.null(dict)) {
263-
dict$dictionary_kind <-dict_kind_names[dict$dictionary_kind]
278+
dict$dictionary_kind <- dict_kind_names[dict$dictionary_kind]
264279
}
265280
dict
266281
}
@@ -283,7 +298,7 @@ fill_arrow_schema_enums <- function(schema) {
283298
schema
284299
}
285300

286-
encode_arrow_schema<- function(df) {
301+
encode_arrow_schema <- function(df) {
287302
schema <- encode_arrow_schema_r(df)
288303
schema <- fill_arrow_schema_enums(schema)
289304
rawenc <- .Call(nanoparquet_encode_arrow_schema, schema)
@@ -295,11 +310,11 @@ encode_arrow_schema<- function(df) {
295310
# Arrow only supports 8, 16, 32 and 64.
296311
factor_bits <- function(x) {
297312
l <- length(levels(x))
298-
if (l < 2^(8-1)) {
313+
if (l < 2^(8 - 1)) {
299314
8L
300-
} else if (l < 2^(16-1)) {
315+
} else if (l < 2^(16 - 1)) {
301316
16L
302-
} else if (l < 2^(32-1)) {
317+
} else if (l < 2^(32 - 1)) {
303318
32L
304319
} else {
305320
64L

R/infer-parquet-schema.R

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,31 +12,31 @@
1212
#' @export
1313

1414
infer_parquet_schema <- function(df, options = parquet_options()) {
15-
types <- .Call(rf_nanoparquet_map_to_parquet_types, df, options)
16-
lt <- unname(lapply(types, function(x) x[[3]]))
17-
ct <- lapply(lt, function(x) if (!is.null(x)) logical_to_converted(x))
18-
type_tab <- data.frame(
19-
file_name = rep(NA_character_, length(df)),
20-
name = names(df),
21-
r_type = vapply(types, function(x) x[[2]], ""),
22-
type = vapply(types, function(x) x[[1]], ""),
23-
type_length = rep(NA_integer_, length(df)),
24-
repetition_type = ifelse(vapply(df, anyNA, TRUE), "OPTIONAL", "REQUIRED"),
25-
converted_type = map_chr(ct, function(x) {
26-
x[["converted_type"]] %||% NA_character_
27-
}),
28-
logical_type = I(lt),
29-
num_children = rep(NA_integer_, length(df)),
30-
scale = map_int(ct, function(x) {
31-
x[["scale"]] %||% NA_integer_
32-
}),
33-
precision = map_int(ct, function(x) {
34-
x[["precision"]] %||% NA_integer_
35-
}),
36-
field_id = rep(NA_integer_, length(df))
37-
)
15+
types <- .Call(rf_nanoparquet_map_to_parquet_types, df, options)
16+
lt <- unname(lapply(types, function(x) x[[3]]))
17+
ct <- lapply(lt, function(x) if (!is.null(x)) logical_to_converted(x))
18+
type_tab <- data.frame(
19+
file_name = rep(NA_character_, length(df)),
20+
name = names(df),
21+
r_type = vapply(types, function(x) x[[2]], ""),
22+
type = vapply(types, function(x) x[[1]], ""),
23+
type_length = rep(NA_integer_, length(df)),
24+
repetition_type = ifelse(vapply(df, anyNA, TRUE), "OPTIONAL", "REQUIRED"),
25+
converted_type = map_chr(ct, function(x) {
26+
x[["converted_type"]] %||% NA_character_
27+
}),
28+
logical_type = I(lt),
29+
num_children = rep(NA_integer_, length(df)),
30+
scale = map_int(ct, function(x) {
31+
x[["scale"]] %||% NA_integer_
32+
}),
33+
precision = map_int(ct, function(x) {
34+
x[["precision"]] %||% NA_integer_
35+
}),
36+
field_id = rep(NA_integer_, length(df))
37+
)
3838

39-
rownames(type_tab) <- NULL
40-
class(type_tab) <- c("tbl", class(type_tab))
41-
type_tab
39+
rownames(type_tab) <- NULL
40+
class(type_tab) <- c("tbl", class(type_tab))
41+
type_tab
4242
}

R/options.R

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,16 @@ parquet_options <- function(
6666
class = getOption("nanoparquet.class", "tbl"),
6767
compression_level = getOption("nanoparquet.compression_level", NA_integer_),
6868
keep_row_groups = FALSE,
69-
num_rows_per_row_group = getOption("nanoparquet.num_rows_per_row_group", 10000000L),
69+
num_rows_per_row_group = getOption(
70+
"nanoparquet.num_rows_per_row_group",
71+
10000000L
72+
),
7073
use_arrow_metadata = getOption("nanoparquet.use_arrow_metadata", TRUE),
7174
write_arrow_metadata = getOption("nanoparquet.write_arrow_metadata", TRUE),
72-
write_data_page_version = getOption("nanoparquet.write_data_page_version", 1L),
75+
write_data_page_version = getOption(
76+
"nanoparquet.write_data_page_version",
77+
1L
78+
),
7379
write_minmax_values = getOption("nanoparquet.write_minmax_values", TRUE)
7480
) {
7581
stopifnot(is.character(class))
@@ -78,9 +84,9 @@ parquet_options <- function(
7884
stopifnot(is_flag(write_arrow_metadata))
7985
stopifnot(
8086
identical(write_data_page_version, 1) ||
81-
identical(write_data_page_version, 2) ||
82-
identical(write_data_page_version, 1L) ||
83-
identical(write_data_page_version, 2L)
87+
identical(write_data_page_version, 2) ||
88+
identical(write_data_page_version, 1L) ||
89+
identical(write_data_page_version, 2L)
8490
)
8591
stopifnot(is_flag(write_minmax_values))
8692
num_rows_per_row_group <- as_count(
@@ -89,12 +95,17 @@ parquet_options <- function(
8995
)
9096
if (identical(compression_level, Inf)) {
9197
compression_level <- 100000L
92-
} else if (identical(compression_level, NA) ||
93-
identical(compression_level, NA_integer_) ||
94-
identical(compression_level, NA_real_)) {
98+
} else if (
99+
identical(compression_level, NA) ||
100+
identical(compression_level, NA_integer_) ||
101+
identical(compression_level, NA_real_)
102+
) {
95103
compression_level <- NA_integer_
96104
} else {
97-
compression_level <- as_integer_scalar(compression_level, "compression_level")
105+
compression_level <- as_integer_scalar(
106+
compression_level,
107+
"compression_level"
108+
)
98109
}
99110

100111
list(

0 commit comments

Comments
 (0)