-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathconfig.rs
More file actions
231 lines (205 loc) · 8.85 KB
/
Copy pathconfig.rs
File metadata and controls
231 lines (205 loc) · 8.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
use std::path::PathBuf;
use std::time::Duration;
use regelrecht_corpus::CorpusConfig;
use crate::error::{PipelineError, Result};
/// Resolves the database connection URL from the environment.
///
/// `DATABASE_URL` takes precedence; `DATABASE_SERVER_FULL` is consulted only
/// when the former is missing, not valid Unicode, or blank.
///
/// A blank (empty or whitespace-only) value is treated the same as an unset
/// variable. Deployment tooling frequently exports a variable with an empty
/// value, and returning that empty string would both mask a usable fallback
/// and defer the failure to a much less obvious connection error.
///
/// The selected value is returned unmodified (it is not trimmed).
///
/// # Errors
///
/// Returns [`PipelineError::Config`] when neither variable yields a
/// non-blank value.
fn resolve_database_url() -> Result<String> {
    ["DATABASE_URL", "DATABASE_SERVER_FULL"]
        .iter()
        // Lazy: the fallback variable is only read if the primary is rejected.
        .filter_map(|key| std::env::var(key).ok())
        .find(|candidate| !candidate.trim().is_empty())
        .ok_or_else(|| {
            PipelineError::Config("DATABASE_URL or DATABASE_SERVER_FULL not set".into())
        })
}
/// Resolves the maximum number of database connections from
/// `DATABASE_MAX_CONNECTIONS`.
///
/// * Absent or unparseable values fall back to the default of 5.
/// * Surrounding whitespace is ignored, so `" 8 "` reads as 8 instead of
///   silently falling back to the default.
/// * The result is clamped to at least 1: a limit of 0 would describe a pool
///   that can never hand out a connection.
fn resolve_max_connections() -> u32 {
    const DEFAULT_MAX_CONNECTIONS: u32 = 5;

    std::env::var("DATABASE_MAX_CONNECTIONS")
        .ok()
        .and_then(|raw| raw.trim().parse::<u32>().ok())
        .unwrap_or(DEFAULT_MAX_CONNECTIONS)
        .max(1)
}
/// Database settings for the pipeline.
///
/// `Debug` is implemented by hand rather than derived so that the connection
/// URL is never written to logs.
#[derive(Clone)]
pub struct PipelineConfig {
    /// Database connection URL. Replaced by a placeholder in `Debug` output.
    pub database_url: String,
    /// Configured maximum number of database connections.
    pub max_connections: u32,
}

impl std::fmt::Debug for PipelineConfig {
    /// Formats the configuration, printing `<redacted>` in place of the
    /// database URL.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Stand-in shown instead of the real connection URL.
        const REDACTED: &str = "<redacted>";

        let mut out = f.debug_struct("PipelineConfig");
        out.field("database_url", &REDACTED)
            .field("max_connections", &self.max_connections);
        out.finish()
    }
}
impl PipelineConfig {
    /// Builds the configuration from environment variables.
    ///
    /// The URL comes from `resolve_database_url()` and the connection limit
    /// from `resolve_max_connections()`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `resolve_database_url()` when no database
    /// URL can be resolved.
    pub fn from_env() -> Result<Self> {
        let database_url = resolve_database_url()?;
        let max_connections = resolve_max_connections();
        Ok(Self {
            database_url,
            max_connections,
        })
    }

    /// Creates a configuration for `database_url` with a connection limit of 5.
    pub fn new(database_url: impl Into<String>) -> Self {
        let database_url = database_url.into();
        Self {
            database_url,
            max_connections: 5,
        }
    }

    /// Returns the configuration with its connection limit replaced by
    /// `max_connections` (builder style; consumes `self`).
    pub fn with_max_connections(self, max_connections: u32) -> Self {
        Self {
            max_connections,
            ..self
        }
    }
}
/// Runtime configuration for a worker process.
///
/// `WorkerConfig::from_env` populates every field from environment variables;
/// the variable name and default for each field are noted on the field itself.
/// The hand-written `Debug` impl for this type prints `<redacted>` in place of
/// `database_url`.
#[derive(Clone)]
pub struct WorkerConfig {
/// Database connection URL. `from_env` resolves it from `DATABASE_URL`,
/// falling back to `DATABASE_SERVER_FULL`.
pub database_url: String,
/// Database connection limit. `from_env` reads `DATABASE_MAX_CONNECTIONS`
/// (default: 5). NOTE(review): presumably the pool size limit — confirm
/// against the pool setup.
pub max_connections: u32,
/// Path taken from `REGULATION_REPO_PATH`. Default: `./regulation-repo`.
pub output_dir: PathBuf,
/// Value of `REGULATION_OUTPUT_BASE`. Default: `regulation/nl`.
/// NOTE(review): presumably a path relative to `output_dir` — confirm.
pub regulation_output_base: String,
/// Polling interval. Default: 5 seconds. Configurable via
/// `WORKER_POLL_INTERVAL_SECS`.
pub poll_interval: Duration,
/// Upper polling interval. Default: 60 seconds. Configurable via
/// `WORKER_MAX_POLL_INTERVAL_SECS`. NOTE(review): presumably the ceiling for
/// poll back-off — confirm against the worker loop.
pub max_poll_interval: Duration,
/// Result of `CorpusConfig::from_env_optional()`. NOTE(review): `None`
/// presumably means no corpus is configured — confirm.
pub corpus_config: Option<CorpusConfig>,
/// Maximum time a single job may run before being aborted by the worker.
/// Default: 20 minutes. Configurable via `WORKER_JOB_TIMEOUT_SECS`.
pub job_timeout: Duration,
/// Jobs stuck in 'processing' longer than this are reaped (reset or failed).
/// Default: 30 minutes. Configurable via `WORKER_ORPHAN_TIMEOUT_SECS`.
pub orphan_timeout: Duration,
/// Number of consecutive failures before a law is marked as exhausted.
/// Default: 10. Configurable via `EXHAUSTED_THRESHOLD`.
pub exhausted_threshold: i32,
/// Number of consecutive *resource-exhaustion* failures (fork()/EAGAIN/OOM)
/// before the worker exits so the orchestrator restarts it with a clean
/// process table. These faults are environmental and only clear on restart,
/// so retrying in-process just burns job retry budget in a tight loop.
/// Default: 5. Configurable via `WORKER_MAX_CONSECUTIVE_RESOURCE_FAILURES`.
pub max_consecutive_resource_failures: u32,
/// Maximum number of enrichment jobs (for this worker's provider) that may
/// run per UTC calendar day. Configurable via `ENRICH_DAILY_LIMIT`.
///
/// **Fail-closed**: absent (or unparseable) reads as `0`, and `0` pauses
/// enrichment entirely — a worker must be given an explicit positive limit
/// to run. This protects a personal Claude subscription token from being
/// spent by accident (a forgotten env var runs nothing rather than the whole
/// corpus).
///
/// Enforced by the enrich worker against the durable `jobs` table, so the
/// cap survives restarts. The cap keys on this worker's configured provider
/// (`LLM_PROVIDER`), and once reached it pauses the whole worker until the
/// UTC day rolls over — so it is meant for a provider-dedicated worker (e.g.
/// a claude-only enrichworker).
pub enrich_daily_limit: u32,
/// When true, a completed harvest auto-enqueues enrich jobs for that law.
/// Off by default; enrichment is otherwise requested explicitly via the admin
/// API. Configurable via `ENRICH_AUTO_ENQUEUE`.
pub auto_enrich_enqueue: bool,
/// Maximum recursion depth for related-legislation follow-up harvests.
/// A depth-0 enrichment may enqueue harvests at depth 1, whose enrichments
/// may enqueue at depth 2, etc., up to this cap. Default: 2. Configurable
/// via `RELATED_HARVEST_MAX_DEPTH`.
pub related_harvest_max_depth: u32,
}
impl std::fmt::Debug for WorkerConfig {
    /// Formats every field of the configuration, printing `<redacted>` in
    /// place of the database URL.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Stand-in shown instead of the real connection URL.
        const REDACTED: &str = "<redacted>";

        // Exhaustive destructuring: adding a field to the struct without
        // deciding how to print it here becomes a compile error.
        let Self {
            database_url: _,
            max_connections,
            output_dir,
            regulation_output_base,
            poll_interval,
            max_poll_interval,
            corpus_config,
            job_timeout,
            orphan_timeout,
            exhausted_threshold,
            max_consecutive_resource_failures,
            enrich_daily_limit,
            auto_enrich_enqueue,
            related_harvest_max_depth,
        } = self;

        let mut out = f.debug_struct("WorkerConfig");
        out.field("database_url", &REDACTED)
            .field("max_connections", max_connections)
            .field("output_dir", output_dir)
            .field("regulation_output_base", regulation_output_base)
            .field("poll_interval", poll_interval)
            .field("max_poll_interval", max_poll_interval)
            .field("corpus_config", corpus_config)
            .field("job_timeout", job_timeout)
            .field("orphan_timeout", orphan_timeout)
            .field("exhausted_threshold", exhausted_threshold)
            .field(
                "max_consecutive_resource_failures",
                max_consecutive_resource_failures,
            )
            .field("enrich_daily_limit", enrich_daily_limit)
            .field("auto_enrich_enqueue", auto_enrich_enqueue)
            .field("related_harvest_max_depth", related_harvest_max_depth);
        out.finish()
    }
}
impl WorkerConfig {
pub fn from_env() -> Result<Self> {
let database_url = resolve_database_url()?;
let max_connections = resolve_max_connections();
let output_dir = std::env::var("REGULATION_REPO_PATH")
.unwrap_or_else(|_| "./regulation-repo".into())
.into();
let regulation_output_base =
std::env::var("REGULATION_OUTPUT_BASE").unwrap_or_else(|_| "regulation/nl".into());
let poll_interval_secs: u64 = std::env::var("WORKER_POLL_INTERVAL_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(5);
let max_poll_interval_secs: u64 = std::env::var("WORKER_MAX_POLL_INTERVAL_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(60);
let corpus_config = CorpusConfig::from_env_optional();
let job_timeout_secs: u64 = std::env::var("WORKER_JOB_TIMEOUT_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(20 * 60); // 20 minutes
let orphan_timeout_secs: u64 = std::env::var("WORKER_ORPHAN_TIMEOUT_SECS")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(30 * 60); // 30 minutes
let exhausted_threshold: i32 = std::env::var("EXHAUSTED_THRESHOLD")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(10)
.max(1);
let max_consecutive_resource_failures: u32 =
std::env::var("WORKER_MAX_CONSECUTIVE_RESOURCE_FAILURES")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(5)
.max(1);
// Per-provider daily run cap. Fail-closed: absent reads as 0 (paused),
// and a present-but-unparseable value warns and also reads as 0 rather
// than silently disabling the cap (this guards spend on a personal token).
let enrich_daily_limit: u32 = match std::env::var("ENRICH_DAILY_LIMIT") {
Ok(raw) => raw.parse::<u32>().unwrap_or_else(|_| {
tracing::warn!(
value = %raw,
"ENRICH_DAILY_LIMIT is not a valid non-negative integer; treating as 0 (enrichment paused)"
);
0
}),
Err(_) => 0,
};
// Auto-enrich after harvest is opt-in; unset/unrecognized reads as false.
let auto_enrich_enqueue = std::env::var("ENRICH_AUTO_ENQUEUE")
.map(|v| {
matches!(
v.trim().to_ascii_lowercase().as_str(),
"1" | "true" | "yes" | "on"
)
})
.unwrap_or(false);
let related_harvest_max_depth: u32 = std::env::var("RELATED_HARVEST_MAX_DEPTH")
.ok()
.and_then(|v| v.parse().ok())
.unwrap_or(2);
Ok(Self {
database_url,
max_connections,
output_dir,
regulation_output_base,
poll_interval: Duration::from_secs(poll_interval_secs),
max_poll_interval: Duration::from_secs(max_poll_interval_secs),
corpus_config,
job_timeout: Duration::from_secs(job_timeout_secs),
orphan_timeout: Duration::from_secs(orphan_timeout_secs),
exhausted_threshold,
max_consecutive_resource_failures,
enrich_daily_limit,
auto_enrich_enqueue,
related_harvest_max_depth,
})
}
pub fn pipeline_config(&self) -> PipelineConfig {
PipelineConfig {
database_url: self.database_url.clone(),
max_connections: self.max_connections,
}
}
}