Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(THERMAL_SIMD_CORE_SOURCES
src/config/runtime_flags.c
src/logging.c
src/config_parser.c
src/third_party/jsmn.c
src/statistics.c
src/runtime_metrics.c
src/health_check.c
Expand Down Expand Up @@ -112,6 +113,11 @@ if(BUILD_TESTING)
target_compile_options(test_logging_metrics PRIVATE -Wall -Wextra -pthread)
add_test(NAME logging_metrics COMMAND test_logging_metrics)

# Runtime-config CLI test: built from tests/config/test_runtime_config_cli.c,
# linked against thermal_simd_core_tests and libm, and registered with CTest
# under the name "runtime_config_cli".
add_executable(test_runtime_config_cli tests/config/test_runtime_config_cli.c)
target_link_libraries(test_runtime_config_cli PRIVATE thermal_simd_core_tests m)
target_compile_options(test_runtime_config_cli PRIVATE -Wall -Wextra)
add_test(NAME runtime_config_cli COMMAND test_runtime_config_cli)

add_executable(test_thermal_simd
tests/test_thermal_simd.c
src/thermal_simd.c)
Expand Down
60 changes: 37 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,43 @@ cmake --build build --config Release -j
```

## Flags
- `--interval=MS` check interval (default 50)
- `--down-count=N` throttles before downgrade (default 3)
- `--up-count=N` stable intervals before upgrade (default 5)
- `--down-ratio=R` throttle threshold as CPI multiple (default 1.5)
- `--cooldown-down=MS` cooldown after downgrade (default 1000)
- `--cooldown-up=MS` cooldown after upgrade (default 2000)
- `--min-dwell=MS` minimum time per SIMD width (default 200)
- `--no-avx512` disable AVX‑512 usage
- `--duration-sec=S` runtime duration for demo (default 10)
- `--work-iters=N` inner work iterations per tick (default 10,000,000)
- `--degraded-timeout-sec=S` fail closed if hardware counters remain unavailable for S seconds (default 120)
- `--health-check` run diagnostics (perf counters, telemetry, trampolines) and exit with status
- `--log-level=LEVEL` set log verbosity (`error`, `warn`, `info`, `debug`; default `info`)
- `--temp-ceiling=°C` predictive controller ceiling (default 92)
- `--safety-margin=°C` guard band below ceiling for upgrades (default 4)
- `--emergency-margin=°C` triggers scalar emergency fallback (default 10)
- `--telemetry-interval=MS` collector interval (default 50)
- `--telemetry-max-skew=MS` allowable skew between collectors (default 15)
- `--telemetry-ewma` CPI EWMA alpha (default 0.25)
- `--metrics-port=PORT` Prometheus endpoint port (default 9753)
- `--metrics-basic-auth=user:pass` enable basic auth for metrics
- `--metrics-cert/--metrics-key` enable TLS for metrics endpoint
- `--statsd-host/--statsd-port` send metrics to StatsD
- `--config=FILE` load overrides from a JSON file (see [configuration docs](docs/configuration.md)).
- `--interval=MS` check interval (default 50).
- `--down-count=N` throttles before downgrade (default 3).
- `--up-count=N` stable intervals before upgrade (default 5).
- `--down-ratio=R` throttle threshold as CPI multiple (default 1.5).
- `--cooldown-down=MS` cooldown after downgrade (default 1000).
- `--cooldown-up=MS` cooldown after upgrade (default 2000).
- `--min-dwell=MS` minimum time per SIMD width (default 200).
- `--no-avx512` disable AVX‑512 usage.
- `--duration-sec=S` runtime duration for demo (default 10).
- `--work-iters=N` inner work iterations per tick (default 10,000,000).
- `--degraded-timeout-sec=S` fail closed if hardware counters remain unavailable for S seconds (default 120).
- `--log-level=LEVEL` set log verbosity (`error`, `warn`, `info`, `debug`; default `info`).
- `--health-check` run diagnostics (perf counters, telemetry, trampolines) and exit with status.

**Predictive controller**
- `--temp-ceiling=°C` predictive controller ceiling (default 92).
- `--safety-margin=°C` guard band below the ceiling for upgrades (default 4).
- `--emergency-margin=°C` additional buffer that triggers scalar fallback (default 10).
- `--predictive-alpha=A` CPI EWMA alpha in the predictive path (default 0.25).
- `--coeff-path=PATH` ARX coefficient bundle (default `config/controller_coeffs.json`).

**Telemetry fusion**
- `--telemetry-interval=MS` collector interval (default 50).
- `--telemetry-max-skew=MS` allowable skew between collectors (default 150).
- `--telemetry-ewma=A` telemetry CPI EWMA alpha (default 0.25).
- `--telemetry-profile=PATH` optional telemetry profile manifest.

**Metrics & observability**
- `--metrics-port=PORT` Prometheus endpoint port (default 9464, `0` disables).
- `--metrics-bind=ADDR` bind address (default `127.0.0.1`).
- `--metrics-cert=PATH` / `--metrics-key=PATH` enable TLS for the metrics endpoint.
- `--metrics-ca=PATH` optional client CA bundle when using mutual TLS.
- `--metrics-require-client-auth` enforce mutual TLS for `/metrics` and `/healthz`.
- `--metrics-basic-auth=user:pass` enable HTTP basic authentication.
- `--statsd-host=HOST` emit StatsD metrics to the given host (disabled by default).
- `--statsd-port=PORT` StatsD UDP port (default 8125).

Environment override:
- `TSD_LOG_LEVEL` mirrors `--log-level` for non-interactive deployments.
Expand Down
85 changes: 85 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Runtime Configuration

The dispatcher exposes a small command-line interface and an optional JSON configuration
file to tailor predictive control, telemetry fusion, and observability. CLI flags take
precedence over values loaded from the JSON file.

## Configuration file

Pass `--config=/path/to/runtime.json` to load overrides. The file supports the following
structure:

```json
{
"predictive": {
"coeff_path": "config/controller_coeffs.json",
"temp_ceiling_c": 92,
"safety_margin_c": 4,
"emergency_margin_c": 10,
"alpha": 0.25
},
"telemetry": {
"profile": "config/telemetry/profile.json",
"interval_ms": 50,
"max_skew_ms": 150,
"ewma": 0.25
},
"metrics": {
"bind_address": "127.0.0.1",
"port": 9464,
"tls": {
"certificate": "config/certs/dispatcher.crt",
"private_key": "config/certs/dispatcher.key",
"client_ca": "config/certs/ca.crt",
"require_client_auth": false
},
"basic_auth": {
"username": "metrics",
"password": "change-me"
},
"statsd": {
"host": "127.0.0.1",
"port": 8125
}
}
}
```

All sections are optional—omitted values fall back to the compiled defaults documented
below. The `predictive.coeff_path` defaults to the bundled
`config/controller_coeffs.json` generated alongside the build.

## Key options

| Area | Flag / JSON key | Description | Default |
| ---- | ---------------- | ----------- | ------- |
| Predictive | `--temp-ceiling` / `predictive.temp_ceiling_c` | Controller temperature ceiling in °C. | 92 |
| Predictive | `--safety-margin` / `predictive.safety_margin_c` | Guard band below the ceiling before upgrades. | 4 |
| Predictive | `--emergency-margin` / `predictive.emergency_margin_c` | Additional buffer that forces scalar fallback. | 10 |
| Predictive | `--predictive-alpha` / `predictive.alpha` | CPI EWMA alpha for the predictive controller. | 0.25 |
| Predictive | `--coeff-path` / `predictive.coeff_path` | ARX coefficient bundle path. | `config/controller_coeffs.json` |
| Telemetry | `--telemetry-interval` / `telemetry.interval_ms` | Telemetry fusion poll interval (ms). | 50 |
| Telemetry | `--telemetry-max-skew` / `telemetry.max_skew_ms` | Maximum allowed skew between collectors (ms). | 150 |
| Telemetry | `--telemetry-ewma` / `telemetry.ewma` | Telemetry CPI EWMA alpha. | 0.25 |
| Telemetry | `--telemetry-profile` / `telemetry.profile` | Optional telemetry profile manifest. | *(unset)* |
| Metrics | `--metrics-port` / `metrics.port` | Prometheus listen port (`0` disables). | 9464 |
| Metrics | `--metrics-bind` / `metrics.bind_address` | Listen address. | `127.0.0.1` |
| Metrics | `--metrics-cert` / `metrics.tls.certificate` | TLS certificate (PEM). | *(unset)* |
| Metrics | `--metrics-key` / `metrics.tls.private_key` | TLS private key (PEM). | *(unset)* |
| Metrics | `--metrics-ca` / `metrics.tls.client_ca` | Optional client CA bundle for mTLS. | *(unset)* |
| Metrics | `--metrics-require-client-auth` / `metrics.tls.require_client_auth` | Enforce client certificates. | `false` |
| Metrics | `--metrics-basic-auth` / `metrics.basic_auth.{username,password}` | HTTP basic auth credentials. | *(unset)* |
| Metrics | `--statsd-host` / `metrics.statsd.host` | StatsD target host. | *(unset)* |
| Metrics | `--statsd-port` / `metrics.statsd.port` | StatsD UDP port. | 8125 |

## Validation rules

- TLS requires both certificate and private key paths. Supplying `--metrics-require-client-auth`
(or setting `metrics.tls.require_client_auth`) also requires a client CA bundle.
- Basic authentication requires both username and password.
- StatsD is enabled only when both host and port are set.
- Telemetry intervals and skews must remain between 10 ms and 60,000 ms.
- Predictive margins must fall between 0 °C and 60 °C.

Invalid combinations terminate the process with a descriptive log entry so that
misconfigurations are caught during startup.
6 changes: 4 additions & 2 deletions docs/metrics-endpoints.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The dispatcher exports metrics and health data via a multi-channel strategy tail
- **In-process registry:** `metrics/registry.c` tracks counters, gauges, and histograms. All subsystems register metrics during initialization.
- **Snapshot API:** `metrics/snapshot.h` exposes `metrics_snapshot_collect()` which produces a read-only view of the current values.
- **Exporters:**
- **Prometheus text endpoint** on `localhost:9753/metrics` (optional TLS with `--metrics-cert` / `--metrics-key`).
- **Prometheus text endpoint** on `localhost:9464/metrics` (TLS enabled via `--metrics-cert` / `--metrics-key`).
- **StatsD UDP exporter** (disabled by default) configured via `--statsd-host` and `--statsd-port`.
- **Structured logs** that emit metric deltas under `event=metrics_flush` for environments without scrape support.

Expand Down Expand Up @@ -37,9 +37,11 @@ The dispatcher exports metrics and health data via a multi-channel strategy tail
### Configuration Flags
| Flag | Description | Default |
| --- | --- | --- |
| `--metrics-port` | Listen port for HTTP endpoint. | 9753 |
| `--metrics-port` | Listen port for HTTP endpoint. | 9464 |
| `--metrics-bind` | Bind address. | `127.0.0.1` |
| `--metrics-cert` / `--metrics-key` | Enable TLS for Prometheus endpoint. | Disabled |
| `--metrics-ca` | Client CA bundle for mutual TLS. | Disabled |
| `--metrics-require-client-auth` | Enforce mTLS for `/metrics` and `/healthz`. | Disabled |
| `--metrics-basic-auth` | `user:pass` credentials for basic auth. | None |
| `--statsd-host` | StatsD host for UDP export. | Disabled |
| `--statsd-port` | StatsD port. | 8125 |
Expand Down
2 changes: 1 addition & 1 deletion docs/runbooks/patcher-attestation-alert.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The security attestation service flagged the dispatcher patcher subsystem due to
1. **Confirm Alert Context**
```bash
kubectl logs <pod> | grep attestation | tail
curl -s http://<pod>:9753/metrics | egrep 'attestation|patch_failures_total'
curl -s http://<pod>:9464/metrics | egrep 'attestation|patch_failures_total'
```
2. **Check Dispatcher State**
```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/runbooks/sensor-failure.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ A mandatory telemetry sensor (perf counters, MSR temperature, or frequency sourc
1. **Confirm Scope**
```bash
kubectl logs <pod> | grep telemetry_sensor | tail
curl -s http://<pod>:9753/metrics | grep telemetry_degraded_total
curl -s http://<pod>:9464/metrics | grep telemetry_degraded_total
```
2. **Force Health Check**
```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/sandbox-workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ This workflow describes how to exercise the dispatcher in a non-production sandb
## Workflow Details
- The runner starts the dispatcher container with `--health-check` followed by a steady-state workload phase.
- Telemetry fuzzer attaches over a Unix domain socket exposed by the dispatcher (`/run/tsd/telemetry.sock`).
- Metrics probe scrapes `localhost:9753` and writes results to `artifacts/metrics.ndjson`.
- Metrics probe scrapes `localhost:9464` and writes results to `artifacts/metrics.ndjson`.
- Sandbox artifacts (logs, metrics, telemetry traces) land under `artifacts/YYYYmmdd-HHMMSS/` for upload to CI.

## Scenarios
Expand Down
1 change: 1 addition & 0 deletions include/thermal/simd/config_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
int tsd_parse_int_option(const char *value, long min, long max, int *out);
int tsd_parse_ms_option(const char *value, int min_ms, int max_ms, int *out_us);
int tsd_parse_ratio_option(const char *value, double min, double max, double *ratio_out, uint64_t *scaled_out);
/* Parses `value` as a finite double within the inclusive range [min, max] and
 * stores it in `*out`. Returns 0 on success; returns -1 (leaving `*out`
 * untouched) when an argument is NULL, the text is empty, malformed or has
 * trailing characters, the conversion sets errno, or the result is NaN,
 * infinite, or out of range. */
int tsd_parse_double_option(const char *value, double min, double max, double *out);
int tsd_compute_ticks_from_ms(int interval_us, int ms, int *out_ticks, long long *raw_ticks_out);

#endif
19 changes: 17 additions & 2 deletions include/thermal/simd/thermal_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ typedef struct {
int metrics_enabled;
int metrics_port;
char metrics_bind_host[64];
char metrics_tls_cert_path[256];
char metrics_tls_key_path[256];
char metrics_tls_ca_path[256];
int metrics_tls_require_client_auth;
char metrics_basic_auth_user[128];
char metrics_basic_auth_pass[128];
char statsd_host[128];
int statsd_port;
int telemetry_interval_ms;
int telemetry_max_skew_ms;
double telemetry_ewma_alpha;
char telemetry_profile_path[256];
int predictive_temp_ceiling_c;
int predictive_safety_margin_c;
int predictive_emergency_margin_c;
double predictive_alpha;
char predictive_coeff_path[256];
tsd_log_level_t log_level;
tsd_policy_config policy;
} tsd_runtime_config;
Expand All @@ -47,10 +64,8 @@ void tsd_runtime_config_enter_degraded_mode(tsd_runtime_config *cfg, const char
void tsd_runtime_config_exit_degraded_mode(tsd_runtime_config *cfg, const char *reason);
int tsd_runtime_config_is_degraded(void);

#ifndef TSD_ENABLE_TESTS
int tsd_runtime_config_parse_cli(tsd_runtime_config *cfg, int argc, char **argv);
void tsd_runtime_config_print_usage(const char *prog);
#endif

#ifdef __cplusplus
}
Expand Down
20 changes: 20 additions & 0 deletions src/config_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,26 @@ int tsd_parse_ratio_option(const char *value, double min, double max, double *ra
return 0;
}

int tsd_parse_double_option(const char *value, double min, double max, double *out) {
if (!value || !out) {
return -1;
}
errno = 0;
char *end = NULL;
double parsed = strtod(value, &end);
if (errno != 0 || end == value || *end != '\0') {
return -1;
}
if (isnan(parsed) || isinf(parsed)) {
return -1;
}
if (parsed < min || parsed > max) {
return -1;
}
*out = parsed;
return 0;
}

int tsd_compute_ticks_from_ms(int interval_us, int ms, int *out_ticks, long long *raw_ticks_out) {
if (raw_ticks_out) {
*raw_ticks_out = -1;
Expand Down
Loading