#' Download table data
#'
- #' This retrieves rows in chunks of `page_size`. It is most suitable for results
- #' of smaller queries (<100 MB, say). For larger queries, it is better to
- #' export the results to a CSV file stored on google cloud and use the
- #' bq command line tool to download locally.
+ #' @description
+ #' This function provides two ways to download data from BigQuery, transferring
+ #' data using either JSON or arrow, depending on the `api` argument. If
+ #' bigrquerystorage is installed, `api = "arrow"` will be used (because it's
+ #' so much faster, but see the limitations below); otherwise you can select
+ #' deliberately by using `api = "json"` or `api = "arrow"`.
#'
- #' @section Complex data:
- #' bigrquery will retrieve nested and repeated columns in to list-columns
+ #' ## Arrow API
+ #'
+ #' The arrow API is much faster, but has heavier dependencies: bigrquerystorage
+ #' requires the arrow package, which can be tricky to compile on Linux (but you
+ #' should usually be able to get a binary from
+ #' [Posit Public Package Manager](https://posit.co/products/cloud/public-package-manager/)).
+ #'
+ #' There's one known limitation of `api = "arrow"`: when querying public data,
+ #' you'll now need to provide a `billing` project.
+ #'
+ #' ## JSON API
+ #'
+ #' The JSON API retrieves rows in chunks of `page_size`. It is most suitable
+ #' for results of smaller queries (<100 MB, say). Unfortunately, due to
+ #' limitations in the BigQuery API, you may need to vary this parameter
+ #' depending on the complexity of the underlying data.
+ #'
+ #' The JSON API will convert nested and repeated columns into list-columns
#' as follows:
#'
#' * Repeated values (arrays) will become a list-column of vectors.
#' * Records will become list-columns of named lists.
#' * Repeated records will become list-columns of data frames.
#'
- #' @section Larger datasets:
- #' In my timings, this code takes around 1 minute per 100 MB of data.
- #' If you need to download considerably more than this, I recommend:
- #'
- #' * Export a `.csv` file to Cloud Storage using [bq_table_save()].
- #' * Use the `gsutil` command line utility to download it.
- #' * Read the csv file into R with `readr::read_csv()` or `data.table::fread()`.
- #'
- #' Unfortunately you can not export nested or repeated formats into CSV, and
- #' the formats that BigQuery supports (arvn and ndjson) that allow for
- #' nested/repeated values, are not well supported in R.
- #'
#' @return Because data retrieval may generate list-columns and the `data.frame`
#'   print method can have problems with list-columns, this method returns
#'   a tibble. If you need a `data.frame`, coerce the results with
#'   [as.data.frame()].
#' @param x A [bq_table]
#' @param n_max Maximum number of results to retrieve. Use `Inf` to retrieve all
#'   rows.
- # ' @param page_size The number of rows requested per chunk. It is recommended to
36
- # ' leave this unspecified until you have evidence that the `page_size`
37
- # ' selected automatically by `bq_table_download()` is problematic.
41
+ # ' @param page_size (JSON only) The number of rows requested per chunk. It is
42
+ # ' recommended to leave this unspecified until you have evidence that the
43
+ # ' `page_size` selected automatically by `bq_table_download()` is problematic.
#'
#'   When `page_size = NULL` bigrquery determines a conservative, natural chunk
#'   size empirically. If you specify the `page_size`, it is important that each
#'   chunk fits on one page, i.e. that the requested row limit is low enough to
#'   prevent the API from paginating based on response size.
- #' @param start_index Starting row index (zero-based).
- #' @param max_connections Number of maximum simultaneous connections to
- #'   BigQuery servers.
+ #' @param start_index (JSON only) Starting row index (zero-based).
+ #' @param max_connections (JSON only) Number of maximum simultaneous
+ #'   connections to BigQuery servers.
+ #' @param api Which API to use? The `"json"` API works wherever bigrquery
+ #'   does, but is slow and can require fiddling with the `page_size` parameter.
+ #'   The `"arrow"` API is faster and more reliable, but only works if you
+ #'   have also installed the bigrquerystorage package.
+ #'
+ #'   Because the `"arrow"` API is so much faster, it will be used automatically
+ #'   if the bigrquerystorage package is installed.
#' @inheritParams api-job
#' @param bigint The R type that BigQuery's 64-bit integer types should be
#'   mapped to. The default is `"integer"`, which returns R's `integer` type,
#'   but results in `NA` for values above/below +/- 2147483647. `"integer64"`
#'   returns a [bit64::integer64], which allows the full range of 64 bit
#'   integers.
+ #' @param billing (Arrow only) Project to bill; defaults to the project of `x`,
+ #'   and typically only needs to be specified if you're working with public
+ #'   datasets.
#' @param max_results `r lifecycle::badge("deprecated")` Deprecated. Please use
#'   `n_max` instead.
#' @section Google BigQuery API documentation:
#' * [list](https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list)
#' @export
#' @examplesIf bq_testable()
- #' df <- bq_table_download("publicdata.samples.natality", n_max = 35000)
+ #' df <- bq_table_download("publicdata.samples.natality", n_max = 35000, billing = bq_test_project())
bq_table_download <-
  function(x,
           n_max = Inf,
@@ -64,20 +80,55 @@ bq_table_download <-
           max_connections = 6L,
           quiet = NA,
           bigint = c("integer", "integer64", "numeric", "character"),
+          api = c("json", "arrow"),
+          billing = x$project,
           max_results = deprecated()) {
  x <- as_bq_table(x)
  check_number_whole(n_max, min = 0, allow_infinite = TRUE)
  check_number_whole(start_index, min = 0)
  check_number_whole(max_connections, min = 1)
  quiet <- check_quiet(quiet)
  bigint <- arg_match(bigint)
+ api <- check_api(api)
+
  if (lifecycle::is_present(max_results)) {
    lifecycle::deprecate_warn(
      "1.4.0", "bq_table_download(max_results)", "bq_table_download(n_max)"
    )
    n_max <- max_results
  }

+ if (api == "arrow") {
+   check_installed("bigrquerystorage", "required to download using arrow API")
+   if (!missing(page_size)) {
+     cli::cli_warn(
+       '{.arg page_size} is ignored when {.code api == "arrow"}',
+       call = environment()
+     )
+   }
+   if (!missing(start_index)) {
+     cli::cli_warn(
+       '{.arg start_index} is ignored when {.code api == "arrow"}',
+       call = environment()
+     )
+   }
+   if (!missing(max_connections)) {
+     cli::cli_warn(
+       '{.arg max_connections} is ignored when {.code api == "arrow"}',
+       call = environment()
+     )
+   }
+
+   return(bigrquerystorage::bqs_table_download(
+     x = toString(x),
+     parent = billing,
+     n_max = n_max,
+     quiet = quiet,
+     bigint = bigint,
+     as_tibble = TRUE
+   ))
+ }
+
  params <- set_row_params(
    nrow = bq_table_nrow(x),
    n_max = n_max,
@@ -202,6 +253,14 @@ bq_table_download <-
  parse_postprocess(table_data, bigint = bigint)
}

+ check_api <- function(api = c("json", "arrow"), error_call = caller_env()) {
+   if (identical(api, c("json", "arrow"))) {
+     if (has_bigrquerystorage()) "arrow" else "json"
+   } else {
+     arg_match(api, error_call = error_call)
+   }
+ }
+
# This function is a modified version of
# https://github.com/r-dbi/RPostgres/blob/master/R/PqResult.R
parse_postprocess <- function(df, bigint) {
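For reference, a minimal usage sketch of the two download paths introduced by this change (illustrative only: it assumes bigrquerystorage is installed and that `bq_test_project()` returns a project you can bill):

library(bigrquery)

tb <- bq_table("publicdata", "samples", "natality")

# Default behaviour: the arrow API is selected automatically because
# bigrquerystorage is installed; `billing` is needed here because this
# is a public dataset.
df_arrow <- bq_table_download(tb, n_max = 1000, billing = bq_test_project())

# Force the JSON API; `page_size`, `start_index`, and `max_connections`
# only apply on this path.
df_json <- bq_table_download(tb, n_max = 1000, api = "json")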