-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsparklyr.R
More file actions
58 lines (39 loc) · 1.33 KB
/
sparklyr.R
File metadata and controls
58 lines (39 loc) · 1.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
## Spark Connection Test
## Lewis Quayle, Ph.D. (drlquayle@gmail.com)
## 2025-09-18
# setup -------------------------------------------------------------------
# load dependencies
# install.packages("sparklyr")
# install.packages("pysparklyr")
# pysparklyr::install_pyspark(version = "3.4.4")
library(dplyr)
library(sparklyr)
# configuration -----------------------------------------------------------
# start from sparklyr's default settings; uncomment a line below to tune
# resources for a specific run (values are passed through to Spark as-is)
config <- spark_config()
# config$spark.executor.memory <- "4G"   # per-executor heap
# config$spark.driver.memory <- "4G"     # driver-side heap
# config$spark.executor.cores <- 1       # cores per executor
# connection --------------------------------------------------------------
# connect to spark using sparklyr over the Spark Connect protocol
# (requires a spark connect server listening on localhost:15002)
sc <-
  spark_connect(
    master = "sc://localhost:15002",
    method = "spark_connect",
    version = "3.4.4",
    config = config
  )
# guarantee the session is released even if any step below errors;
# without this, a failure mid-script would leak the Spark connection
tryCatch(
  {
    # minimal test ---------------------------------------------------------
    # quick sanity check: enumerate available tables
    # explicit print() so output appears even when this file is source()d
    # (top-level autoprint is suppressed under source())
    print(DBI::dbListTables(sc))
    # upload the built-in iris dataset and compute a trivial aggregation;
    # sparklyr renames dotted columns, hence Sepal_Length not Sepal.Length
    df <- copy_to(sc, iris, overwrite = TRUE)
    # collect() forces the lazy Spark query and pulls the result locally
    df %>%
      group_by(Species) %>%
      summarise(
        mean_sepal_length = mean(Sepal_Length, na.rm = TRUE)
      ) %>%
      collect() %>%
      print()
    # confirm table presence after write
    print(DBI::dbListTables(sc))
  },
  finally = {
    # disconnect -----------------------------------------------------------
    spark_disconnect(sc)
  }
)