-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathredshift.config.reference.hocon
209 lines (190 loc) · 7.57 KB
/
redshift.config.reference.hocon
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
{
# Data Lake (S3) region
# This field is optional if it can be resolved with AWS region provider chain.
# It checks places like env variables, system properties, AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "us-east-1",
# SQS topic name used by Transformer and Loader to communicate
"messageQueue": "test-queue",
# Optional. S3 path that holds JSONPaths
"jsonpaths": "s3://bucket/jsonpaths/",
# Warehouse connection details
"storage" : {
# The destination DB
"type": "redshift",
# Redshift hostname
"host": "redshift.amazonaws.com",
# Database name
"database": "snowplow",
# Database port. Optional, default value 5439
"port": 5439,
# AWS Role ARN allowing Redshift to load data from S3
"roleArn": "arn:aws:iam::123456789876:role/RedshiftLoadRole",
# DB schema name
"schema": "atomic",
# DB user with permissions to load data
"username": "admin",
# DB password
"password": "Supersecret1",
# Custom JDBC configuration. Optional, default value { "ssl": true }
"jdbc": { "ssl": true },
# MAXERROR, amount of acceptable loading errors. Optional, default value 10
"maxError": 10,
# unlock experimental features
"experimental": {
"enableWideRow": false
}
},
"schedules": {
# Periodic schedules to stop loading, e.g. for Redshift maintenance window
# Any number of schedules is supported, but it is recommended not to overlap them
# The schedule works with machine's local timezone (and UTC is recommended)
"noOperation": [
{
# Human-readable name of the no-op window
"name": "Maintenance window",
# Cron expression with second granularity
"when": "0 0 12 * * ?",
# For how long the loader should be paused
"duration": "1 hour"
}
]
}
# Observability and reporting options
"monitoring": {
# Snowplow tracking (optional)
"snowplow": {
"appId": "redshift-loader",
"collector": "snplow.acme.com",
},
# An endpoint for alerts and informational events
# Everything sent to snowplow collector (as properly formed self-describing events)
# will also be sent to the webhook as POST payloads with self-describing JSONs
"webhook": {
# An actual HTTP endpoint
"endpoint": "https://webhook.acme.com",
# Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
},
# Optional, for tracking runtime exceptions
"sentry": {
"dsn": "http://sentry.acme.com"
},
# Optional, configure how metrics are reported
"metrics": {
# Optional, send metrics to StatsD server
"statsd": {
"hostname": "localhost",
"port": 8125,
# Any key-value pairs to be tagged on every StatsD metric
"tags": {
"app": "rdb-loader"
}
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
},
# Optional, print metrics on stdout (with slf4j)
"stdout": {
# Optional, override the default metric prefix
# "prefix": "snowplow.rdbloader."
}
# Optional, period for metrics emitted periodically
# Default value 5 minutes
# There is only one periodic metric at the moment.
# This metric is minimum_age_of_loaded_data.
# It specifies how old the latest event in the warehouse is.
"period": "5 minutes"
},
# Optional, configuration for periodic unloaded/corrupted folders checks
"folders": {
# Path where Loader could store auxiliary logs
# Loader should be able to write here, Redshift should be able to load from here
"staging": "s3://acme-snowplow/loader/logs/",
# How often to check
"period": "1 hour"
# Specifies since when folder monitoring will check
"since": "14 days"
# Specifies until when folder monitoring will check
"until": "7 days"
# Path to transformer archive (must be same as Transformer's `output.path`)
"transformerOutput": "s3://acme-snowplow/loader/transformed/"
# How many times the check can fail before generating an alarm instead of warning
"failBeforeAlarm": 3
},
# Periodic DB health-check, raising a warning if DB hasn't responded to `SELECT 1`
"healthCheck": {
# How often to query the DB
"frequency": "20 minutes",
# How long to wait for a response
"timeout": "15 seconds"
}
},
# Immediate retries configuration
# Unlike retryQueue these retries happen immediately, without proceeding to another message
"retries": {
# Starting backoff period
"backoff": "30 seconds"
# A strategy to use when deciding on next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into retry queue
# If missing - the loader will be retrying until cumulative bound
"attempts": 3,
# When backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the loader will retry infinitely
"cumulativeBound": "1 hour"
},
# Check the target destination to make sure it is ready.
# Retry the check until the target is ready, blocking the application in the meantime
"readyCheck": {
# Starting backoff period
"backoff": "15 seconds"
# A strategy to use when deciding on next backoff
"strategy": "CONSTANT"
},
# Retries configuration for the initialization block
# It will retry on all exceptions from there
"initRetries": {
# Starting backoff period
"backoff": "30 seconds"
# A strategy to use when deciding on next backoff
"strategy": "EXPONENTIAL"
# How many attempts to make before sending the message into retry queue
# If missing - the loader will be retrying until cumulative bound
"attempts": 3,
# When backoff reaches this delay the Loader will stop retrying
# If both cumulativeBound and attempts are missing, the loader will retry infinitely
"cumulativeBound": "1 hour"
},
# Additional backlog of recently failed folders that could be automatically retried
# Retry Queue saves a failed folder and then re-reads the info from shredding_complete S3 file
"retryQueue": {
# How often batch of failed folders should be pulled into a discovery queue
"period": "30 minutes",
# How many failures should be kept in memory
# After the limit is reached new failures are dropped
"size": 64,
# How many attempts to make for each folder
# After the limit is reached new failures are dropped
"maxAttempts": 3,
# Artificial pause after each failed folder being added to the queue
"interval": "5 seconds"
},
"timeouts": {
# How long loading (actual COPY statements) can take before considering Redshift unhealthy
# Without any progress (i.e. different subfolder) within this period, loader
# will abort the transaction
"loading": "1 hour",
# How long non-loading steps (such as ALTER TABLE or metadata queries) can take
# before considering Redshift unhealthy
"nonLoading": "10 minutes"
# SQS visibility timeout is the time window in which a message must be
# deleted (acknowledged). Otherwise it is considered abandoned.
# If a message has been pulled, but hasn't been deleted, the next time
# it will re-appear in another consumer is equal to the visibility timeout
# Another consequence is that if the Loader has failed while processing a message,
# its next attempt to receive this (or any other) message from the queue is delayed by the visibility timeout
"sqsVisibility": "5 minutes"
}
}