-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathknox_config.R.template
More file actions
72 lines (56 loc) · 2.85 KB
/
knox_config.R.template
File metadata and controls
72 lines (56 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# ==============================================================================
# KNOX CONFIGURATION - DO NOT COMMIT THIS FILE!
# ==============================================================================
# Copy this file to knox_config.R and fill in your information.
# Add knox_config.R to .gitignore to protect your credentials.
#
# Credentials may also be supplied through environment variables of the same
# name (e.g. set in ~/.Renviron); the literal placeholders below are used only
# as fallbacks when the variables are unset, so secrets never have to be
# written into this file.
# ==============================================================================

# ==============================================================================
# KNOX CREDENTIALS ----
# ==============================================================================
# Sys.getenv() returns the environment variable when it is set and the
# placeholder otherwise, so editing this file remains fully supported.
KNOX_USERNAME <- Sys.getenv("KNOX_USERNAME", unset = "your_username")
KNOX_PASSWORD <- Sys.getenv("KNOX_PASSWORD", unset = "your_password")

# ==============================================================================
# KNOX URLS (Cloudera CDP 7.1.9) ----
# ==============================================================================
# Livy URL for the Spark connection.
# Adapt knox_hostname and knox_port according to your cluster.
KNOX_MASTER_URL <- "https://knox_hostname:knox_port/gateway/cdp-proxy-api/livy_for_spark3"

# WebHDFS URL for JAR upload (uses the same credentials as Livy).
KNOX_WEBHDFS_URL <- "https://knox_hostname:knox_port/gateway/cdp-proxy-api/webhdfs/v1"

# ==============================================================================
# SPARK SESSION CONFIGURATION ----
# ==============================================================================
# Driver memory (e.g., "4G", "8G", "16G")
SPARK_DRIVER_MEMORY <- "4G"

# Memory per executor (e.g., "4G", "8G")
SPARK_EXECUTOR_MEMORY <- "4G"

# Number of executors (e.g., 2, 4, 8)
SPARK_NUM_EXECUTORS <- 2

# Cores per executor (e.g., 2, 4)
SPARK_EXECUTOR_CORES <- 2

# YARN queue (e.g., "default", "production", "data-science")
SPARK_QUEUE <- "default"

# Connection timeout in seconds (300 = 5 minutes)
SPARK_CONNECT_TIMEOUT <- 300

# ==============================================================================
# SPARKLYR JAR PATH ----
# ==============================================================================
# This path is configured automatically by setup_jar.R.
# DO NOT MODIFY MANUALLY unless you uploaded the JAR yourself.
# Examples:
#   SPARKLYR_JAR_PATH <- "hdfs:///user/your_username/sparklyr/sparklyr-3.0-2.12.jar"
#   SPARKLYR_JAR_PATH <- "hdfs:///user/your_username/sparklyr/sparklyr-3.5-2.12.jar"
SPARKLYR_JAR_PATH <- ""

# ==============================================================================
# INSTRUCTIONS ----
# ==============================================================================
#
# 1. Copy this file: cp knox_config.R.template knox_config.R
# 2. Edit knox_config.R with your actual information (or export the
#    KNOX_USERNAME / KNOX_PASSWORD environment variables instead)
# 3. Run JAR installation: source("setup_jar.R")
# 4. Connect to Spark: source("sparklyr_connection.R")
# 5. Test with examples: source("examples/basic_examples.R")
#
# !! Don't forget to add knox_config.R to your .gitignore!
#
# Alternative: if you found an existing JAR on the cluster
#   SPARKLYR_JAR_PATH <- "file:///opt/cloudera/parcels/CDH/jars/sparklyr-3.0-2.12.jar"