|
| 1 | +// Package claimstatus surfaces the feeder's account-claim status to the |
| 2 | +// webconfig UI. The webconfig itself makes no network calls; it shells out |
| 3 | +// to `apl-feed claim status --json`, which probes the backend with the |
| 4 | +// local claim secret and reports whether the feeder is registered and |
| 5 | +// whether a user account has claimed it (owner_present). |
| 6 | +// |
| 7 | +// The result is cached server-side with a single-flight guard so browser |
| 8 | +// reloads and multiple tabs collapse to one backend probe, keeping us well |
| 9 | +// inside the feeder-status endpoint's per-UUID rate limit. The cache is |
| 10 | +// keyed by the feeder identity (UUID + claim-secret fingerprint) so an |
| 11 | +// identity import / claim rotation invalidates a stale "claimed" verdict |
| 12 | +// without any explicit hook. |
| 13 | +package claimstatus |
| 14 | + |
| 15 | +import ( |
| 16 | + "context" |
| 17 | + "encoding/json" |
| 18 | + "fmt" |
| 19 | + "strings" |
| 20 | + "sync" |
| 21 | + "time" |
| 22 | + |
| 23 | + wexec "github.com/airplanes-live/image-webconfig/internal/exec" |
| 24 | +) |
| 25 | + |
| 26 | +// Result tokens. The first group is "definitive": a known local or |
| 27 | +// server-confirmed state worth caching and falling back to. The second |
| 28 | +// group is "transient": the probe could not get a definitive answer this |
| 29 | +// time, so we prefer a previously-cached definitive value if we have one. |
| 30 | +const ( |
| 31 | + ResultClaimed = "claimed" |
| 32 | + ResultUnclaimed = "unclaimed" |
| 33 | + ResultSecretMismatch = "secret_mismatch" |
| 34 | + ResultServerUnregistered = "server_unregistered" |
| 35 | + ResultUnregistered = "unregistered" |
| 36 | + ResultSecretInvalid = "secret_invalid" |
| 37 | + ResultNoIdentity = "no_identity" |
| 38 | + ResultBlocked = "blocked" |
| 39 | + |
| 40 | + ResultRateLimited = "rate_limited" |
| 41 | + ResultUnreachable = "unreachable" |
| 42 | + ResultError = "error" |
| 43 | + // ResultUnavailable is webconfig-local: the apl-feed claim status |
| 44 | + // command could not be run or did not emit a schema-v1 object (old |
| 45 | + // feed without the subcommand, missing binary, crash). Distinct from |
| 46 | + // the feed-emitted transient results above. |
| 47 | + ResultUnavailable = "unavailable" |
| 48 | +) |
| 49 | + |
| 50 | +// definitiveResults are the verdicts safe to cache and serve as a |
| 51 | +// last-known value when a later probe fails. |
| 52 | +var definitiveResults = map[string]bool{ |
| 53 | + ResultClaimed: true, |
| 54 | + ResultUnclaimed: true, |
| 55 | + ResultSecretMismatch: true, |
| 56 | + ResultServerUnregistered: true, |
| 57 | + ResultUnregistered: true, |
| 58 | + ResultSecretInvalid: true, |
| 59 | + ResultNoIdentity: true, |
| 60 | + ResultBlocked: true, |
| 61 | +} |
| 62 | + |
| 63 | +func isDefinitive(result string) bool { return definitiveResults[result] } |
| 64 | + |
| 65 | +// Output is the `apl-feed claim status --json` schema-v1 payload. Pointer |
| 66 | +// fields distinguish "absent" from a zero value so the API response can |
| 67 | +// omit them rather than assert false/0. |
| 68 | +type Output struct { |
| 69 | + SchemaVersion int `json:"schema_version"` |
| 70 | + Result string `json:"result"` |
| 71 | + Registered *bool `json:"registered"` |
| 72 | + OwnerPresent *bool `json:"owner_present"` |
| 73 | + Version *int `json:"version"` |
| 74 | + ResetUntil *string `json:"reset_until"` |
| 75 | + LastSeenAt *string `json:"last_seen_at"` |
| 76 | + LastSeenAgeSeconds *int `json:"last_seen_age_seconds"` |
| 77 | + RetryAfterSeconds *int `json:"retry_after_seconds"` |
| 78 | + Detail *string `json:"detail"` |
| 79 | +} |
| 80 | + |
| 81 | +// Response is the GET /api/claim/status body. It is the probe Output plus |
| 82 | +// freshness metadata the SPA renders ("checked N ago", stale banner). |
| 83 | +type Response struct { |
| 84 | + Result string `json:"result"` |
| 85 | + Registered *bool `json:"registered,omitempty"` |
| 86 | + OwnerPresent *bool `json:"owner_present,omitempty"` |
| 87 | + Version *int `json:"version,omitempty"` |
| 88 | + ResetUntil *string `json:"reset_until,omitempty"` |
| 89 | + LastSeenAt *string `json:"last_seen_at,omitempty"` |
| 90 | + LastSeenAgeSeconds *int `json:"last_seen_age_seconds,omitempty"` |
| 91 | + RetryAfterSeconds *int `json:"retry_after_seconds,omitempty"` |
| 92 | + // CheckedAt is the RFC3339 UTC time of the probe that produced this |
| 93 | + // verdict. For a stale fallback it is the time of the last good probe, |
| 94 | + // not the failed refresh, so "checked N ago" stays truthful. |
| 95 | + CheckedAt string `json:"checked_at"` |
| 96 | + // Stale is true when a refresh failed and we are serving an older |
| 97 | + // definitive verdict. Error then names the failure (unreachable / |
| 98 | + // rate_limited / unavailable / error). |
| 99 | + Stale bool `json:"stale"` |
| 100 | + Error string `json:"error,omitempty"` |
| 101 | +} |
| 102 | + |
| 103 | +func responseFrom(o Output, at time.Time) Response { |
| 104 | + return Response{ |
| 105 | + Result: o.Result, |
| 106 | + Registered: o.Registered, |
| 107 | + OwnerPresent: o.OwnerPresent, |
| 108 | + Version: o.Version, |
| 109 | + ResetUntil: o.ResetUntil, |
| 110 | + LastSeenAt: o.LastSeenAt, |
| 111 | + LastSeenAgeSeconds: o.LastSeenAgeSeconds, |
| 112 | + RetryAfterSeconds: o.RetryAfterSeconds, |
| 113 | + CheckedAt: at.UTC().Format(time.RFC3339), |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +// DefaultArgv is the read-only, unprivileged command the prober runs. No |
| 118 | +// sudo: apl-feed claim status reads the group-readable claim secret and |
| 119 | +// makes a network probe — neither needs root, so this argv is NOT part of |
| 120 | +// the sudoers/PrivilegedArgv parity contract. |
| 121 | +var DefaultArgv = []string{"/usr/local/bin/apl-feed", "claim", "status", "--json"} |
| 122 | + |
| 123 | +// DefaultProbeTimeout bounds a single CLI invocation. apl-feed's curl uses |
| 124 | +// connect-timeout 10 + max-time 30, so the slowest legitimate probe is |
| 125 | +// ~30s; 35s leaves slack without letting a hung child pin the single |
| 126 | +// in-flight slot indefinitely. |
| 127 | +const DefaultProbeTimeout = 35 * time.Second |
| 128 | + |
| 129 | +// Prober runs the apl-feed claim status CLI and parses its output. |
| 130 | +type Prober struct { |
| 131 | + Runner wexec.CommandRunner |
| 132 | + Argv []string |
| 133 | + Timeout time.Duration |
| 134 | +} |
| 135 | + |
| 136 | +// Probe runs the CLI and returns the parsed Output. A non-zero exit is |
| 137 | +// expected for the transient results (apl-feed exits 2 on unreachable / |
| 138 | +// error while still printing JSON), so the exit code is NOT consulted — |
| 139 | +// the contract is "valid schema-v1 JSON on stdout". Anything else (old |
| 140 | +// feed rejecting the subcommand, missing binary, crash, truncated output) |
| 141 | +// yields ResultUnavailable plus a descriptive error for server-side logs. |
| 142 | +func (p Prober) Probe(ctx context.Context) (Output, error) { |
| 143 | + argv := p.Argv |
| 144 | + if len(argv) == 0 { |
| 145 | + argv = DefaultArgv |
| 146 | + } |
| 147 | + timeout := p.Timeout |
| 148 | + if timeout <= 0 { |
| 149 | + timeout = DefaultProbeTimeout |
| 150 | + } |
| 151 | + cctx, cancel := context.WithTimeout(ctx, timeout) |
| 152 | + defer cancel() |
| 153 | + |
| 154 | + res, runErr := p.Runner(cctx, argv) |
| 155 | + out, ok := parseOutput(res.Stdout) |
| 156 | + if !ok { |
| 157 | + return Output{Result: ResultUnavailable}, fmt.Errorf( |
| 158 | + "claim status probe: no schema-v1 JSON (exit=%d err=%v stderr=%q)", |
| 159 | + res.ExitCode, runErr, truncate(res.Stderr, 200)) |
| 160 | + } |
| 161 | + return out, nil |
| 162 | +} |
| 163 | + |
| 164 | +// parseOutput decodes stdout and validates it is the schema-v1 contract. |
| 165 | +// Returns ok=false for empty / non-JSON / wrong-schema / empty-result |
| 166 | +// bodies so the caller maps them to ResultUnavailable. |
| 167 | +func parseOutput(stdout []byte) (Output, bool) { |
| 168 | + trimmed := strings.TrimSpace(string(stdout)) |
| 169 | + if trimmed == "" { |
| 170 | + return Output{}, false |
| 171 | + } |
| 172 | + var o Output |
| 173 | + if err := json.Unmarshal([]byte(trimmed), &o); err != nil { |
| 174 | + return Output{}, false |
| 175 | + } |
| 176 | + if o.SchemaVersion != 1 || o.Result == "" { |
| 177 | + return Output{}, false |
| 178 | + } |
| 179 | + return o, true |
| 180 | +} |
| 181 | + |
| 182 | +func truncate(b []byte, n int) string { |
| 183 | + s := strings.TrimSpace(string(b)) |
| 184 | + if len(s) > n { |
| 185 | + return s[:n] |
| 186 | + } |
| 187 | + return s |
| 188 | +} |
| 189 | + |
| 190 | +// ProbeFunc is the seam the Cache calls; production wires Prober.Probe, |
| 191 | +// tests inject a canned function. |
| 192 | +type ProbeFunc func(context.Context) (Output, error) |
| 193 | + |
| 194 | +// Cache is a single-entry, single-flight cache over a ProbeFunc. It serves |
| 195 | +// the last definitive verdict for the current identity within maxAge, |
| 196 | +// coalesces concurrent refreshes into one probe, enforces a minimum probe |
| 197 | +// interval (rate-limit safety), and falls back to the last good verdict |
| 198 | +// when a refresh fails. |
| 199 | +type Cache struct { |
| 200 | + probe ProbeFunc |
| 201 | + now func() time.Time |
| 202 | + floor time.Duration |
| 203 | + |
| 204 | + mu sync.Mutex |
| 205 | + key string |
| 206 | + good *Response // last definitive verdict for key |
| 207 | + goodAt time.Time |
| 208 | + last *Response // last verdict served for key (definitive or fallback) |
| 209 | + attemptAt time.Time // last probe attempt for key (any outcome) |
| 210 | + inflight chan struct{} |
| 211 | +} |
| 212 | + |
| 213 | +// MinProbeInterval is the floor between probes for one identity. 7s caps |
| 214 | +// the worst case near ~8/min — under the backend's 10/min sustained |
| 215 | +// per-UUID budget even if a client hammers "Check now". |
| 216 | +const MinProbeInterval = 7 * time.Second |
| 217 | + |
| 218 | +// NewCache builds a Cache. now defaults to time.Now; floor defaults to |
| 219 | +// MinProbeInterval. |
| 220 | +func NewCache(probe ProbeFunc, now func() time.Time) *Cache { |
| 221 | + if now == nil { |
| 222 | + now = time.Now |
| 223 | + } |
| 224 | + return &Cache{probe: probe, now: now, floor: MinProbeInterval} |
| 225 | +} |
| 226 | + |
| 227 | +// Get returns the claim-status verdict for the given identity key. maxAge |
| 228 | +// is the caller's freshness tolerance; it is clamped up to the floor so a |
| 229 | +// forced refresh (maxAge 0) still cannot probe more than once per floor |
| 230 | +// for the same identity. A key change (identity import / rotation) resets |
| 231 | +// the slot and bypasses the floor. |
| 232 | +func (c *Cache) Get(ctx context.Context, key string, maxAge time.Duration) Response { |
| 233 | + eff := maxAge |
| 234 | + if eff < c.floor { |
| 235 | + eff = c.floor |
| 236 | + } |
| 237 | + |
| 238 | + c.mu.Lock() |
| 239 | + if c.key != key { |
| 240 | + // New identity: drop the prior slot entirely. |
| 241 | + c.key = key |
| 242 | + c.good = nil |
| 243 | + c.last = nil |
| 244 | + c.goodAt = time.Time{} |
| 245 | + c.attemptAt = time.Time{} |
| 246 | + } |
| 247 | + // Fresh definitive verdict for this identity. |
| 248 | + if c.good != nil && c.now().Sub(c.goodAt) <= eff { |
| 249 | + r := *c.good |
| 250 | + c.mu.Unlock() |
| 251 | + return r |
| 252 | + } |
| 253 | + // Floor guard: probed this identity too recently; serve best-available |
| 254 | + // rather than hit the rate-limited endpoint again. |
| 255 | + if !c.attemptAt.IsZero() && c.now().Sub(c.attemptAt) < c.floor { |
| 256 | + r := c.bestLocked() |
| 257 | + c.mu.Unlock() |
| 258 | + return r |
| 259 | + } |
| 260 | + // Coalesce: a probe is already running for this slot — wait for it, |
| 261 | + // then serve whatever it stored. |
| 262 | + if c.inflight != nil { |
| 263 | + ch := c.inflight |
| 264 | + c.mu.Unlock() |
| 265 | + select { |
| 266 | + case <-ch: |
| 267 | + case <-ctx.Done(): |
| 268 | + c.mu.Lock() |
| 269 | + r := c.bestLocked() |
| 270 | + c.mu.Unlock() |
| 271 | + return r |
| 272 | + } |
| 273 | + c.mu.Lock() |
| 274 | + r := c.bestLocked() |
| 275 | + c.mu.Unlock() |
| 276 | + return r |
| 277 | + } |
| 278 | + ch := make(chan struct{}) |
| 279 | + c.inflight = ch |
| 280 | + c.mu.Unlock() |
| 281 | + |
| 282 | + out, err := c.probe(ctx) |
| 283 | + |
| 284 | + c.mu.Lock() |
| 285 | + defer func() { |
| 286 | + c.inflight = nil |
| 287 | + close(ch) |
| 288 | + c.mu.Unlock() |
| 289 | + }() |
| 290 | + now := c.now() |
| 291 | + // A concurrent Get may have reset the slot to a different identity |
| 292 | + // while we were probing; don't store our (now-stale-identity) result. |
| 293 | + if c.key != key { |
| 294 | + return responseFrom(out, now) |
| 295 | + } |
| 296 | + c.attemptAt = now |
| 297 | + resp := responseFrom(out, now) |
| 298 | + switch { |
| 299 | + case err == nil && isDefinitive(out.Result): |
| 300 | + c.good = &resp |
| 301 | + c.goodAt = now |
| 302 | + c.last = &resp |
| 303 | + return resp |
| 304 | + case c.good != nil: |
| 305 | + // Transient/unavailable refresh, but we have a prior good verdict: |
| 306 | + // keep showing it, flagged stale, with the failure reason and any |
| 307 | + // backoff hint carried over. |
| 308 | + fallback := *c.good |
| 309 | + fallback.Stale = true |
| 310 | + fallback.Error = out.Result |
| 311 | + fallback.RetryAfterSeconds = out.RetryAfterSeconds |
| 312 | + c.last = &fallback |
| 313 | + return fallback |
| 314 | + default: |
| 315 | + // No prior good verdict: surface the transient result as-is. |
| 316 | + c.last = &resp |
| 317 | + return resp |
| 318 | + } |
| 319 | +} |
| 320 | + |
| 321 | +// bestLocked returns the best currently-known verdict without probing. |
| 322 | +// Caller must hold c.mu. |
| 323 | +func (c *Cache) bestLocked() Response { |
| 324 | + if c.last != nil { |
| 325 | + return *c.last |
| 326 | + } |
| 327 | + if c.good != nil { |
| 328 | + return *c.good |
| 329 | + } |
| 330 | + return Response{Result: ResultUnavailable, CheckedAt: c.now().UTC().Format(time.RFC3339)} |
| 331 | +} |
0 commit comments