Commit 01c574e

Use bigrquerystorage for downloads (#604)
1 parent 3a93d82 commit 01c574e

20 files changed (+508, -132 lines)

DESCRIPTION (+8, -7)

@@ -12,9 +12,9 @@ Description: Easily talk to Google's 'BigQuery' database from R.
 License: MIT + file LICENSE
 URL: https://bigrquery.r-dbi.org, https://github.com/r-dbi/bigrquery
 BugReports: https://github.com/r-dbi/bigrquery/issues
-Depends:
+Depends:
     R (>= 4.0)
-Imports:
+Imports:
     bit64,
     brio,
     cli,
@@ -29,8 +29,9 @@ Imports:
     prettyunits,
     rlang (>= 1.1.0),
     tibble,
-    nanoparquet (> 0.3.1)
+    nanoparquet (>= 0.3.1)
 Suggests:
+    bigrquerystorage (>= 1.1.0.9000),
     blob,
     covr,
     dbplyr (>= 2.4.0),
@@ -41,9 +42,7 @@ Suggests:
     testthat (>= 3.1.5),
     wk (>= 0.3.2),
     withr
-Remotes:
-    r-lib/nanoparquet
-LinkingTo:
+LinkingTo:
     cli,
     cpp11,
     rapidjsonr
@@ -54,7 +53,7 @@ Config/testthat/start-first: bq-table, dplyr
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
-Collate:
+Collate:
     'bigrquery-package.R'
     'bq-auth.R'
     'bq-dataset.R'
@@ -84,3 +83,5 @@ Collate:
     'import-standalone-types-check.R'
     'utils.R'
     'zzz.R'
+Remotes:
+    meztez/bigrquerystorage
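
bigrquerystorage is added to Suggests with a Remotes entry pointing at GitHub, so at the time of this commit it is an optional, non-CRAN dependency. A minimal install sketch (pak is just one option and is not prescribed by the commit; remotes::install_github() works too):

    # Sketch: install the optional bigrquerystorage dependency from GitHub,
    # matching the Remotes entry added above.
    # install.packages("pak")
    pak::pak("meztez/bigrquerystorage")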

NAMESPACE (+1)

@@ -88,6 +88,7 @@ export(bq_perform_extract)
 export(bq_perform_load)
 export(bq_perform_query)
 export(bq_perform_query_dry_run)
+export(bq_perform_query_schema)
 export(bq_perform_upload)
 export(bq_project_datasets)
 export(bq_project_jobs)

NEWS.md (+6)

@@ -1,6 +1,12 @@
 # bigrquery (development version)
 
+* If the bigrquerystorage package is installed, `bq_table_download()` (and
+  hence `collect()`, `dbGetQuery()` and `dbFetch()`) will use it. This will
+  drastically improve the speed of downloading large datasets. A big thanks
+  to @meztez for creating the bigrquerystorage package!
+
 * The `bq_perform_upload()` function now allows users to choose the transmission format (JSON or PARQUET) for data sent to BigQuery (@apalacio9502, #608).
+
 * bigrquery now requires R 4.0, in line with our version support principles.
 
 # bigrquery 1.5.1
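
As the NEWS entry says, the speedup applies transparently to dplyr's `collect()` and to DBI. A rough sketch (the billing project is a placeholder; dbplyr must be installed for `tbl()` to work on a DBI connection):

    library(bigrquery)
    library(dplyr)

    con <- DBI::dbConnect(
      bigquery(),
      project = "bigquery-public-data",
      dataset = "usa_names",
      billing = "my-billing-project"  # placeholder
    )

    # With bigrquerystorage installed, collect() downloads via the faster arrow path.
    names_recent <- tbl(con, "usa_1910_current") |>
      filter(year >= 2010) |>
      collect()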

R/bq-download.R (+84, -25)

@@ -1,61 +1,77 @@
 #' Download table data
 #'
-#' This retrieves rows in chunks of `page_size`. It is most suitable for results
-#' of smaller queries (<100 MB, say). For larger queries, it is better to
-#' export the results to a CSV file stored on google cloud and use the
-#' bq command line tool to download locally.
+#' @description
+#' This function provides two ways to download data from BigQuery, transferring
+#' data using either JSON or arrow, depending on the `api` argument. If
+#' bigrquerystorage is installed, `api = "arrow"` will be used (because it's
+#' so much faster, but see the limitations below); otherwise you can select
+#' deliberately by using `api = "json"` or `api = "arrow"`.
 #'
-#' @section Complex data:
-#' bigrquery will retrieve nested and repeated columns in to list-columns
+#' ## Arrow API
+#'
+#' The arrow API is much faster, but has heavier dependencies: bigrquerystorage
+#' requires the arrow package, which can be tricky to compile on Linux (but you
+#' should usually be able to get a binary from
+#' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/)).
+#'
+#' There's one known limitation of `api = "arrow"`: when querying public data,
+#' you'll now need to provide a `billing` project.
+#'
+#' ## JSON API
+#'
+#' The JSON API retrieves rows in chunks of `page_size`. It is most suitable
+#' for results of smaller queries (<100 MB, say). Unfortunately, due to
+#' limitations in the BigQuery API, you may need to vary this parameter
+#' depending on the complexity of the underlying data.
+#'
+#' The JSON API will convert nested and repeated columns into list-columns
 #' as follows:
 #'
 #' * Repeated values (arrays) will become a list-column of vectors.
 #' * Records will become list-columns of named lists.
 #' * Repeated records will become list-columns of data frames.
 #'
-#' @section Larger datasets:
-#' In my timings, this code takes around 1 minute per 100 MB of data.
-#' If you need to download considerably more than this, I recommend:
-#'
-#' * Export a `.csv` file to Cloud Storage using [bq_table_save()].
-#' * Use the `gsutil` command line utility to download it.
-#' * Read the csv file into R with `readr::read_csv()` or `data.table::fread()`.
-#'
-#' Unfortunately you can not export nested or repeated formats into CSV, and
-#' the formats that BigQuery supports (arvn and ndjson) that allow for
-#' nested/repeated values, are not well supported in R.
-#'
 #' @return Because data retrieval may generate list-columns and the `data.frame`
 #'   print method can have problems with list-columns, this method returns
 #'   a tibble. If you need a `data.frame`, coerce the results with
 #'   [as.data.frame()].
 #' @param x A [bq_table]
 #' @param n_max Maximum number of results to retrieve. Use `Inf` to retrieve all
 #'   rows.
-#' @param page_size The number of rows requested per chunk. It is recommended to
-#'   leave this unspecified until you have evidence that the `page_size`
-#'   selected automatically by `bq_table_download()` is problematic.
+#' @param page_size (JSON only) The number of rows requested per chunk. It is
+#'   recommended to leave this unspecified until you have evidence that the
+#'   `page_size` selected automatically by `bq_table_download()` is problematic.
 #'
 #'   When `page_size = NULL` bigrquery determines a conservative, natural chunk
 #'   size empirically. If you specify the `page_size`, it is important that each
 #'   chunk fits on one page, i.e. that the requested row limit is low enough to
 #'   prevent the API from paginating based on response size.
-#' @param start_index Starting row index (zero-based).
-#' @param max_connections Number of maximum simultaneous connections to
-#'   BigQuery servers.
+#' @param start_index (JSON only) Starting row index (zero-based).
+#' @param max_connections (JSON only) Number of maximum simultaneous
+#'   connections to BigQuery servers.
+#' @param api Which API to use? The `"json"` API works wherever bigrquery
+#'   does, but is slow and can require fiddling with the `page_size` parameter.
+#'   The `"arrow"` API is faster and more reliable, but only works if you
+#'   have also installed the bigrquerystorage package.
+#'
+#'   Because the `"arrow"` API is so much faster, it will be used automatically
+#'   if the bigrquerystorage package is installed.
 #' @inheritParams api-job
 #' @param bigint The R type that BigQuery's 64-bit integer types should be
 #'   mapped to. The default is `"integer"`, which returns R's `integer` type,
 #'   but results in `NA` for values above/below +/- 2147483647. `"integer64"`
 #'   returns a [bit64::integer64], which allows the full range of 64 bit
 #'   integers.
+#' @param billing (Arrow only) Project to bill; defaults to the project of `x`,
+#'   and typically only needs to be specified if you're working with public
+#'   datasets.
+#' @param max_results `r lifecycle::badge("deprecated")` Deprecated. Please use
 #'   `n_max` instead.
 #' @section Google BigQuery API documentation:
 #' * [list](https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list)
 #' @export
 #' @examplesIf bq_testable()
-#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000)
+#' df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project())
 bq_table_download <-
   function(x,
            n_max = Inf,
@@ -64,20 +80,55 @@ bq_table_download <-
            max_connections = 6L,
            quiet = NA,
            bigint = c("integer", "integer64", "numeric", "character"),
+           api = c("json", "arrow"),
+           billing = x$project,
            max_results = deprecated()) {
     x <- as_bq_table(x)
     check_number_whole(n_max, min = 0, allow_infinite = TRUE)
     check_number_whole(start_index, min = 0)
     check_number_whole(max_connections, min = 1)
     quiet <- check_quiet(quiet)
     bigint <- arg_match(bigint)
+    api <- check_api(api)
+
     if (lifecycle::is_present(max_results)) {
       lifecycle::deprecate_warn(
         "1.4.0", "bq_table_download(max_results)", "bq_table_download(n_max)"
       )
       n_max <- max_results
     }
 
+    if (api == "arrow") {
+      check_installed("bigrquerystorage", "required to download using arrow API")
+      if (!missing(page_size)) {
+        cli::cli_warn(
+          '{.arg page_size} is ignored when {.code api == "arrow"}',
+          call = environment()
+        )
+      }
+      if (!missing(start_index)) {
+        cli::cli_warn(
+          '{.arg start_index} is ignored when {.code api == "arrow"}',
+          call = environment()
+        )
+      }
+      if (!missing(max_connections)) {
+        cli::cli_warn(
+          '{.arg max_connections} is ignored when {.code api == "arrow"}',
+          call = environment()
        )
+      }
+
+      return(bigrquerystorage::bqs_table_download(
+        x = toString(x),
+        parent = billing,
+        n_max = n_max,
+        quiet = quiet,
+        bigint = bigint,
+        as_tibble = TRUE
+      ))
+    }
+
     params <- set_row_params(
       nrow = bq_table_nrow(x),
       n_max = n_max,
@@ -202,6 +253,14 @@ bq_table_download <-
     parse_postprocess(table_data, bigint = bigint)
   }
 
+check_api <- function(api = c("json", "arrow"), error_call = caller_env()) {
+  if (identical(api, c("json", "arrow"))) {
+    if (has_bigrquerystorage()) "arrow" else "json"
+  } else {
+    arg_match(api, error_call = error_call)
+  }
+}
+
 # This function is a modified version of
 # https://github.com/r-dbi/RPostgres/blob/master/R/PqResult.R
 parse_postprocess <- function(df, bigint) {
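
A short usage sketch of the two transports added above; the table is the one used in the roxygen example and the billing project is a placeholder:

    # Arrow / Storage Read API: needs bigrquerystorage, plus a billing project
    # when reading public data ("my-project" is a placeholder).
    df1 <- bq_table_download(
      "publicdata.samples.natality",
      n_max = 35000,
      api = "arrow",
      billing = "my-project"
    )

    # JSON API: no extra dependencies; page_size, start_index, and
    # max_connections only apply here.
    df2 <- bq_table_download(
      "publicdata.samples.natality",
      n_max = 35000,
      api = "json",
      page_size = 5000
    )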

R/bq-perform.R (+58, -14)

@@ -210,7 +210,7 @@ export_json <- function(values) {
 #'   Google Cloud.
 #'
 #'   For Google Cloud Storage URIs: Each URI can contain one
-#'   `'*'`` wildcard character and it must come after the 'bucket' name.
+#'   `'*'` wildcard character and it must come after the 'bucket' name.
 #'   Size limits related to load jobs apply to external data sources.
 #'
 #'   For Google Cloud Bigtable URIs: Exactly one URI can be specified and
@@ -358,21 +358,13 @@ bq_perform_query_dry_run <- function(query, billing,
                                      parameters = NULL,
                                      use_legacy_sql = FALSE) {
 
-  check_string(query)
-  check_string(billing)
-  check_bool(use_legacy_sql)
 
-  query <- list(
-    query = unbox(query),
-    useLegacySql = unbox(use_legacy_sql)
+  query <- bq_perform_query_data(
+    query = query,
+    default_dataset = default_dataset,
+    parameters = parameters,
+    use_legacy_sql = use_legacy_sql
   )
-  if (!is.null(parameters)) {
-    parameters <- as_bq_params(parameters)
-    query$queryParameters <- as_json(parameters)
-  }
-  if (!is.null(default_dataset)) {
-    query$defaultDataset <- datasetReference(default_dataset)
-  }
 
   url <- bq_path(billing, jobs = "")
   body <- list(configuration = list(query = query, dryRun = unbox(TRUE)))
@@ -386,6 +378,58 @@ bq_perform_query_dry_run <- function(query, billing,
   structure(bytes, class = "bq_bytes")
 }
 
+#' @export
+#' @rdname api-perform
+bq_perform_query_schema <- function(query, billing,
+                                    ...,
+                                    default_dataset = NULL,
+                                    parameters = NULL) {
+
+  query <- bq_perform_query_data(
+    query = query,
+    default_dataset = default_dataset,
+    parameters = parameters,
+    use_legacy_sql = FALSE
+  )
+
+  url <- bq_path(billing, jobs = "")
+  body <- list(configuration = list(query = query, dryRun = unbox(TRUE)))
+
+  res <- bq_post(
+    url,
+    body = bq_body(body, ...),
+    query = list(fields = "statistics")
+  )
+  # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableSchema
+  res$statistics$query$schema$fields
+}
+
+bq_perform_query_data <- function(query,
+                                  ...,
+                                  default_dataset = NULL,
+                                  parameters = NULL,
+                                  use_legacy_sql = FALSE,
+                                  call = caller_env()) {
+  check_string(query, error_call = call)
+  check_bool(use_legacy_sql, error_call = call)
+
+  query <- list(
+    query = unbox(query),
+    useLegacySql = unbox(use_legacy_sql)
+  )
+  if (!is.null(parameters)) {
+    parameters <- as_bq_params(parameters)
+    query$queryParameters <- as_json(parameters)
+  }
+  if (!is.null(default_dataset)) {
+    query$defaultDataset <- datasetReference(default_dataset)
+  }
+
+  query
+}
+
+
+
 #' @export
 #' @rdname api-perform
 bq_perform_copy <- function(src, dest,
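
A brief sketch of how the newly exported `bq_perform_query_schema()` might be used; the query and billing project are placeholders, and the returned list follows the TableSchema format linked in the code:

    # Dry-run a query and inspect the schema it would return
    # ("my-project" is a placeholder billing project).
    fields <- bq_perform_query_schema(
      "SELECT word, word_count FROM `publicdata.samples.shakespeare`",
      billing = "my-project"
    )
    # Each element describes one column (name, type, mode, ...)
    vapply(fields, function(f) f$name, character(1))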

R/dbi-connection.R (+1, -1)

@@ -318,7 +318,7 @@ setMethod("dbCreateTable", "BigQueryConnection", dbCreateTable_bq)
 
 dbReadTable_bq <- function(conn, name, ...) {
   tb <- as_bq_table(conn, name)
-  bq_table_download(tb, ...)
+  bq_table_download(tb, ..., api = "json")
 }
 
 #' @rdname DBI
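
`dbReadTable()` now pins the JSON transport, presumably so that `...` arguments such as `page_size` keep working. A sketch of how a caller wanting the arrow path could bypass it (project, dataset, and table names are placeholders):

    # dbReadTable() always uses the JSON API; to use the arrow path, download
    # the table directly.
    con <- DBI::dbConnect(bigquery(), project = "my-project", dataset = "my_dataset")
    df_json  <- DBI::dbReadTable(con, "my_table")
    df_arrow <- bq_table_download(bq_table("my-project", "my_dataset", "my_table"), api = "arrow")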

R/dbi-result.R (+22, -9)

@@ -100,18 +100,31 @@ setMethod(
   "dbFetch", "BigQueryResult",
   function(res, n = -1, ...) {
     check_number_whole(n, min = -1, allow_infinite = TRUE)
+    if (n == -1) n <- Inf
 
-    if (n == -1 || n == Inf) {
+    if (has_bigrquerystorage() && n == Inf && res@cursor$cur() == 0) {
+      # https://github.com/meztez/bigrquerystorage/issues/48
       n <- res@cursor$left()
+
+      # If possible, download complete dataset using arrow
+      data <- bq_table_download(res@bq_table,
+        n_max = n,
+        bigint = res@bigint,
+        quiet = res@quiet,
+        api = "arrow"
+      )
+    } else {
+      # Otherwise, fall back to slower JSON API
+      data <- bq_table_download(res@bq_table,
+        n_max = n,
+        start_index = res@cursor$cur(),
+        page_size = res@page_size,
+        bigint = res@bigint,
+        quiet = res@quiet,
+        api = "json"
+      )
     }
-
-    data <- bq_table_download(res@bq_table,
-      n_max = n,
-      start_index = res@cursor$cur(),
-      page_size = res@page_size,
-      bigint = res@bigint,
-      quiet = res@quiet
-    )
+
     res@cursor$adv(nrow(data))
 
     data
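
The net effect for DBI users: fetching a whole, untouched result set can go through the arrow path, while partial fetches keep using the JSON API. A sketch (connection details and the query are placeholders):

    con <- DBI::dbConnect(bigquery(), project = "my-project", dataset = "my_dataset")

    res <- DBI::dbSendQuery(con, "SELECT * FROM my_table")
    # Fetching everything from a fresh result can use bigrquerystorage (arrow);
    # a partial fetch such as dbFetch(res, n = 10) would use the JSON API instead.
    df <- DBI::dbFetch(res, n = -1)
    DBI::dbClearResult(res)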
