Skip to content

Commit ea3c931

Browse files
Merge pull request #39 from SaridakisStamatisChristos/codex/update-cli-and-configuration-support
Add predictive/telemetry config parsing and tests
2 parents b27ef03 + 17f813d commit ea3c931

File tree

15 files changed

+1393
-37
lines changed

15 files changed

+1393
-37
lines changed

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ set(THERMAL_SIMD_CORE_SOURCES
2121
src/config/runtime_flags.c
2222
src/logging.c
2323
src/config_parser.c
24+
src/third_party/jsmn.c
2425
src/statistics.c
2526
src/runtime_metrics.c
2627
src/health_check.c
@@ -112,6 +113,11 @@ if(BUILD_TESTING)
112113
target_compile_options(test_logging_metrics PRIVATE -Wall -Wextra -pthread)
113114
add_test(NAME logging_metrics COMMAND test_logging_metrics)
114115

116+
add_executable(test_runtime_config_cli tests/config/test_runtime_config_cli.c)
117+
target_link_libraries(test_runtime_config_cli PRIVATE thermal_simd_core_tests m)
118+
target_compile_options(test_runtime_config_cli PRIVATE -Wall -Wextra)
119+
add_test(NAME runtime_config_cli COMMAND test_runtime_config_cli)
120+
115121
add_executable(test_thermal_simd
116122
tests/test_thermal_simd.c
117123
src/thermal_simd.c)

README.md

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,43 @@ cmake --build build --config Release -j
3232
```
3333

3434
## Flags
35-
- `--interval=MS` check interval (default 50)
36-
- `--down-count=N` throttles before downgrade (default 3)
37-
- `--up-count=N` stable intervals before upgrade (default 5)
38-
- `--down-ratio=R` throttle threshold as CPI multiple (default 1.5)
39-
- `--cooldown-down=MS` cooldown after downgrade (default 1000)
40-
- `--cooldown-up=MS` cooldown after upgrade (default 2000)
41-
- `--min-dwell=MS` minimum time per SIMD width (default 200)
42-
- `--no-avx512` disable AVX‑512 usage
43-
- `--duration-sec=S` runtime duration for demo (default 10)
44-
- `--work-iters=N` inner work iterations per tick (default 10,000,000)
45-
- `--degraded-timeout-sec=S` fail closed if hardware counters remain unavailable for S seconds (default 120)
46-
- `--health-check` run diagnostics (perf counters, telemetry, trampolines) and exit with status
47-
- `--log-level=LEVEL` set log verbosity (`error`, `warn`, `info`, `debug`; default `info`)
48-
- `--temp-ceiling=°C` predictive controller ceiling (default 92)
49-
- `--safety-margin=°C` guard band below ceiling for upgrades (default 4)
50-
- `--emergency-margin=°C` triggers scalar emergency fallback (default 10)
51-
- `--telemetry-interval=MS` collector interval (default 50)
52-
- `--telemetry-max-skew=MS` allowable skew between collectors (default 15)
53-
- `--telemetry-ewma` CPI EWMA alpha (default 0.25)
54-
- `--metrics-port=PORT` Prometheus endpoint port (default 9753)
55-
- `--metrics-basic-auth=user:pass` enable basic auth for metrics
56-
- `--metrics-cert/--metrics-key` enable TLS for metrics endpoint
57-
- `--statsd-host/--statsd-port` send metrics to StatsD
35+
- `--config=FILE` load overrides from a JSON file (see [configuration docs](docs/configuration.md)).
36+
- `--interval=MS` check interval (default 50).
37+
- `--down-count=N` throttles before downgrade (default 3).
38+
- `--up-count=N` stable intervals before upgrade (default 5).
39+
- `--down-ratio=R` throttle threshold as CPI multiple (default 1.5).
40+
- `--cooldown-down=MS` cooldown after downgrade (default 1000).
41+
- `--cooldown-up=MS` cooldown after upgrade (default 2000).
42+
- `--min-dwell=MS` minimum time per SIMD width (default 200).
43+
- `--no-avx512` disable AVX‑512 usage.
44+
- `--duration-sec=S` runtime duration for demo (default 10).
45+
- `--work-iters=N` inner work iterations per tick (default 10,000,000).
46+
- `--degraded-timeout-sec=S` fail closed if hardware counters remain unavailable for S seconds (default 120).
47+
- `--log-level=LEVEL` set log verbosity (`error`, `warn`, `info`, `debug`; default `info`).
48+
- `--health-check` run diagnostics (perf counters, telemetry, trampolines) and exit with status.
49+
50+
**Predictive controller**
51+
- `--temp-ceiling=°C` predictive controller ceiling (default 92).
52+
- `--safety-margin=°C` guard band below the ceiling for upgrades (default 4).
53+
- `--emergency-margin=°C` additional buffer that triggers scalar fallback (default 10).
54+
- `--predictive-alpha=A` CPI EWMA alpha in the predictive path (default 0.25).
55+
- `--coeff-path=PATH` ARX coefficient bundle (default `config/controller_coeffs.json`).
56+
57+
**Telemetry fusion**
58+
- `--telemetry-interval=MS` collector interval (default 50).
59+
- `--telemetry-max-skew=MS` allowable skew between collectors (default 150).
60+
- `--telemetry-ewma=A` telemetry CPI EWMA alpha (default 0.25).
61+
- `--telemetry-profile=PATH` optional telemetry profile manifest.
62+
63+
**Metrics & observability**
64+
- `--metrics-port=PORT` Prometheus endpoint port (default 9464, `0` disables).
65+
- `--metrics-bind=ADDR` bind address (default `127.0.0.1`).
66+
- `--metrics-cert=PATH` / `--metrics-key=PATH` enable TLS for the metrics endpoint.
67+
- `--metrics-ca=PATH` optional client CA bundle when using mutual TLS.
68+
- `--metrics-require-client-auth` enforce mutual TLS for `/metrics` and `/healthz`.
69+
- `--metrics-basic-auth=user:pass` enable HTTP basic authentication.
70+
- `--statsd-host=HOST` emit StatsD metrics to the given host (disabled by default).
71+
- `--statsd-port=PORT` StatsD UDP port (default 8125).
5872

5973
Environment override:
6074
- `TSD_LOG_LEVEL` mirrors `--log-level` for non-interactive deployments.

docs/configuration.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Runtime Configuration
2+
3+
The dispatcher exposes a small command-line interface and an optional JSON configuration
4+
file to tailor predictive control, telemetry fusion, and observability. CLI flags take
5+
precedence over values loaded from the JSON file.
6+
7+
## Configuration file
8+
9+
Pass `--config=/path/to/runtime.json` to load overrides. The file supports the following
10+
structure:
11+
12+
```json
13+
{
14+
"predictive": {
15+
"coeff_path": "config/controller_coeffs.json",
16+
"temp_ceiling_c": 92,
17+
"safety_margin_c": 4,
18+
"emergency_margin_c": 10,
19+
"alpha": 0.25
20+
},
21+
"telemetry": {
22+
"profile": "config/telemetry/profile.json",
23+
"interval_ms": 50,
24+
"max_skew_ms": 150,
25+
"ewma": 0.25
26+
},
27+
"metrics": {
28+
"bind_address": "127.0.0.1",
29+
"port": 9464,
30+
"tls": {
31+
"certificate": "config/certs/dispatcher.crt",
32+
"private_key": "config/certs/dispatcher.key",
33+
"client_ca": "config/certs/ca.crt",
34+
"require_client_auth": false
35+
},
36+
"basic_auth": {
37+
"username": "metrics",
38+
"password": "change-me"
39+
},
40+
"statsd": {
41+
"host": "127.0.0.1",
42+
"port": 8125
43+
}
44+
}
45+
}
46+
```
47+
48+
All sections are optional—omitted values fall back to the compiled defaults documented
49+
below. The `predictive.coeff_path` defaults to the bundled
50+
`config/controller_coeffs.json` generated alongside the build.
51+
52+
## Key options
53+
54+
| Area | Flag / JSON key | Description | Default |
55+
| ---- | ---------------- | ----------- | ------- |
56+
| Predictive | `--temp-ceiling` / `predictive.temp_ceiling_c` | Controller temperature ceiling in °C. | 92 |
57+
| Predictive | `--safety-margin` / `predictive.safety_margin_c` | Guard band below the ceiling before upgrades. | 4 |
58+
| Predictive | `--emergency-margin` / `predictive.emergency_margin_c` | Additional buffer that forces scalar fallback. | 10 |
59+
| Predictive | `--predictive-alpha` / `predictive.alpha` | CPI EWMA alpha for the predictive controller. | 0.25 |
60+
| Predictive | `--coeff-path` / `predictive.coeff_path` | ARX coefficient bundle path. | `config/controller_coeffs.json` |
61+
| Telemetry | `--telemetry-interval` / `telemetry.interval_ms` | Telemetry fusion poll interval (ms). | 50 |
62+
| Telemetry | `--telemetry-max-skew` / `telemetry.max_skew_ms` | Maximum allowed skew between collectors (ms). | 150 |
63+
| Telemetry | `--telemetry-ewma` / `telemetry.ewma` | Telemetry CPI EWMA alpha. | 0.25 |
64+
| Telemetry | `--telemetry-profile` / `telemetry.profile` | Optional telemetry profile manifest. | *(unset)* |
65+
| Metrics | `--metrics-port` / `metrics.port` | Prometheus listen port (`0` disables). | 9464 |
66+
| Metrics | `--metrics-bind` / `metrics.bind_address` | Listen address. | `127.0.0.1` |
67+
| Metrics | `--metrics-cert` / `metrics.tls.certificate` | TLS certificate (PEM). | *(unset)* |
68+
| Metrics | `--metrics-key` / `metrics.tls.private_key` | TLS private key (PEM). | *(unset)* |
69+
| Metrics | `--metrics-ca` / `metrics.tls.client_ca` | Optional client CA bundle for mTLS. | *(unset)* |
70+
| Metrics | `--metrics-require-client-auth` / `metrics.tls.require_client_auth` | Enforce client certificates. | `false` |
71+
| Metrics | `--metrics-basic-auth` / `metrics.basic_auth.{username,password}` | HTTP basic auth credentials. | *(unset)* |
72+
| Metrics | `--statsd-host` / `metrics.statsd.host` | StatsD target host. | *(unset)* |
73+
| Metrics | `--statsd-port` / `metrics.statsd.port` | StatsD UDP port. | 8125 |
74+
75+
## Validation rules
76+
77+
- TLS requires both certificate and private key paths. Supplying `--metrics-require-client-auth`
78+
(or setting `metrics.tls.require_client_auth`) also requires a client CA bundle.
79+
- Basic authentication requires both username and password.
80+
- StatsD is enabled only when both host and port are set.
81+
- Telemetry intervals and skews must remain between 10 ms and 60,000 ms.
82+
- Predictive margins must fall between 0 °C and 60 °C.
83+
84+
Invalid combinations terminate the process with a descriptive log entry so that
85+
misconfigurations are caught during startup.

docs/metrics-endpoints.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ The dispatcher exports metrics and health data via a multi-channel strategy tail
66
- **In-process registry:** `metrics/registry.c` tracks counters, gauges, and histograms. All subsystems register metrics during initialization.
77
- **Snapshot API:** `metrics/snapshot.h` exposes `metrics_snapshot_collect()` which produces a read-only view of the current values.
88
- **Exporters:**
9-
- **Prometheus text endpoint** on `localhost:9753/metrics` (optional TLS with `--metrics-cert` / `--metrics-key`).
9+
- **Prometheus text endpoint** on `localhost:9464/metrics` (TLS enabled via `--metrics-cert` / `--metrics-key`).
1010
- **StatsD UDP exporter** (disabled by default) configured via `--statsd-host` and `--statsd-port`.
1111
- **Structured logs** that emit metric deltas under `event=metrics_flush` for environments without scrape support.
1212

@@ -37,9 +37,11 @@ The dispatcher exports metrics and health data via a multi-channel strategy tail
3737
### Configuration Flags
3838
| Flag | Description | Default |
3939
| --- | --- | --- |
40-
| `--metrics-port` | Listen port for HTTP endpoint. | 9753 |
40+
| `--metrics-port` | Listen port for HTTP endpoint. | 9464 |
4141
| `--metrics-bind` | Bind address. | `127.0.0.1` |
4242
| `--metrics-cert` / `--metrics-key` | Enable TLS for Prometheus endpoint. | Disabled |
43+
| `--metrics-ca` | Client CA bundle for mutual TLS. | Disabled |
44+
| `--metrics-require-client-auth` | Enforce mTLS for `/metrics` and `/healthz`. | Disabled |
4345
| `--metrics-basic-auth` | `user:pass` credentials for basic auth. | None |
4446
| `--statsd-host` | StatsD host for UDP export. | Disabled |
4547
| `--statsd-port` | StatsD port. | 8125 |

docs/runbooks/patcher-attestation-alert.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ The security attestation service flagged the dispatcher patcher subsystem due to
1212
1. **Confirm Alert Context**
1313
```bash
1414
kubectl logs <pod> | grep attestation | tail
15-
curl -s http://<pod>:9753/metrics | egrep 'attestation|patch_failures_total'
15+
curl -s http://<pod>:9464/metrics | egrep 'attestation|patch_failures_total'
1616
```
1717
2. **Check Dispatcher State**
1818
```bash

docs/runbooks/sensor-failure.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ A mandatory telemetry sensor (perf counters, MSR temperature, or frequency sourc
1212
1. **Confirm Scope**
1313
```bash
1414
kubectl logs <pod> | grep telemetry_sensor | tail
15-
curl -s http://<pod>:9753/metrics | grep telemetry_degraded_total
15+
curl -s http://<pod>:9464/metrics | grep telemetry_degraded_total
1616
```
1717
2. **Force Health Check**
1818
```bash

docs/sandbox-workflow.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ This workflow describes how to exercise the dispatcher in a non-production sandb
3939
## Workflow Details
4040
- The runner starts the dispatcher container with `--health-check` followed by a steady-state workload phase.
4141
- Telemetry fuzzer attaches over a Unix domain socket exposed by the dispatcher (`/run/tsd/telemetry.sock`).
42-
- Metrics probe scrapes `localhost:9753` and writes results to `artifacts/metrics.ndjson`.
42+
- Metrics probe scrapes `localhost:9464` and writes results to `artifacts/metrics.ndjson`.
4343
- Sandbox artifacts (logs, metrics, telemetry traces) land under `artifacts/YYYYmmdd-HHMMSS/` for upload to CI.
4444

4545
## Scenarios

include/thermal/simd/config_parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
int tsd_parse_int_option(const char *value, long min, long max, int *out);
88
int tsd_parse_ms_option(const char *value, int min_ms, int max_ms, int *out_us);
99
int tsd_parse_ratio_option(const char *value, double min, double max, double *ratio_out, uint64_t *scaled_out);
10+
int tsd_parse_double_option(const char *value, double min, double max, double *out);
1011
int tsd_compute_ticks_from_ms(int interval_us, int ms, int *out_ticks, long long *raw_ticks_out);
1112

1213
#endif

include/thermal/simd/thermal_config.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,23 @@ typedef struct {
3535
int metrics_enabled;
3636
int metrics_port;
3737
char metrics_bind_host[64];
38+
char metrics_tls_cert_path[256];
39+
char metrics_tls_key_path[256];
40+
char metrics_tls_ca_path[256];
41+
int metrics_tls_require_client_auth;
42+
char metrics_basic_auth_user[128];
43+
char metrics_basic_auth_pass[128];
44+
char statsd_host[128];
45+
int statsd_port;
46+
int telemetry_interval_ms;
47+
int telemetry_max_skew_ms;
48+
double telemetry_ewma_alpha;
49+
char telemetry_profile_path[256];
50+
int predictive_temp_ceiling_c;
51+
int predictive_safety_margin_c;
52+
int predictive_emergency_margin_c;
53+
double predictive_alpha;
54+
char predictive_coeff_path[256];
3855
tsd_log_level_t log_level;
3956
tsd_policy_config policy;
4057
} tsd_runtime_config;
@@ -47,10 +64,8 @@ void tsd_runtime_config_enter_degraded_mode(tsd_runtime_config *cfg, const char
4764
void tsd_runtime_config_exit_degraded_mode(tsd_runtime_config *cfg, const char *reason);
4865
int tsd_runtime_config_is_degraded(void);
4966

50-
#ifndef TSD_ENABLE_TESTS
5167
int tsd_runtime_config_parse_cli(tsd_runtime_config *cfg, int argc, char **argv);
5268
void tsd_runtime_config_print_usage(const char *prog);
53-
#endif
5469

5570
#ifdef __cplusplus
5671
}

src/config_parser.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,26 @@ int tsd_parse_ratio_option(const char *value, double min, double max, double *ra
6363
return 0;
6464
}
6565

66+
/*
 * Parse `value` as a double and store it in `*out` when the whole string is a
 * finite number inside the inclusive range [min, max].
 *
 * Returns 0 on success. Returns -1, without writing `*out`, when `value` or
 * `out` is NULL, when strtod() sets errno or does not consume the entire
 * string, or when the result is NaN, infinite, or outside [min, max].
 */
int tsd_parse_double_option(const char *value, double min, double max, double *out) {
67+
if (!value || !out) {
68+
return -1;
69+
}
70+
/* Clear errno so a stale value is not mistaken for a strtod() failure below. */
errno = 0;
71+
char *end = NULL;
72+
double parsed = strtod(value, &end);
73+
/* Reject conversion errors, input where nothing was converted, and trailing characters. */
if (errno != 0 || end == value || *end != '\0') {
74+
return -1;
75+
}
76+
/* strtod() accepts spellings such as "nan" and "inf"; those are never valid option values here. */
if (isnan(parsed) || isinf(parsed)) {
77+
return -1;
78+
}
79+
/* Both bounds are inclusive: min and max themselves are accepted. */
if (parsed < min || parsed > max) {
80+
return -1;
81+
}
82+
/* Only write the output once every check has passed. */
*out = parsed;
83+
return 0;
84+
}
85+
6686
int tsd_compute_ticks_from_ms(int interval_us, int ms, int *out_ticks, long long *raw_ticks_out) {
6787
if (raw_ticks_out) {
6888
*raw_ticks_out = -1;

0 commit comments

Comments
 (0)