Skip to content

Commit 0a29338

Browse files
author
xboard
committed
fix(cert): reload PEM on warm restart and rebuild ACME on config change
1 parent 68ca2d2 commit 0a29338

2 files changed

Lines changed: 361 additions & 39 deletions

File tree

internal/cert/cert.go

Lines changed: 125 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,18 @@ import (
55
"crypto/ecdsa"
66
"crypto/elliptic"
77
"crypto/rand"
8+
"crypto/sha256"
89
"crypto/tls"
910
"crypto/x509"
1011
"crypto/x509/pkix"
12+
"encoding/hex"
1113
"encoding/pem"
1214
"fmt"
1315
"math/big"
1416
"net"
1517
"os"
1618
"path/filepath"
19+
"sort"
1720
"strings"
1821
"sync/atomic"
1922
"time"
@@ -41,20 +44,21 @@ import (
4144
//
4245
// Priority Logic (when CertMode is empty):
4346
//
44-
// 1. "http" (if auto_tls is true)
45-
// 2. "content" (if both CertContent and KeyContent are provided)
46-
// 3. "file" (if both CertFile and KeyFile paths are provided)
47-
// 4. "none" (default)
47+
// 1. "http" (if auto_tls is true)
48+
// 2. "content" (if both CertContent and KeyContent are provided)
49+
// 3. "file" (if both CertFile and KeyFile paths are provided)
50+
// 4. "none" (default)
4851
type Manager struct {
4952
cfg config.CertConfig
5053

51-
// Cert material stored atomically so the ACME renewal goroutine can
52-
// swap it without racing with TLSCert() readers.
54+
// Atomic so the ACME renewal goroutine can swap without racing readers.
5355
mat atomic.Pointer[certMaterial]
5456

55-
magic *certmagic.Config
56-
renewed atomic.Bool
57-
acmeStarted bool
57+
magic *certmagic.Config
58+
renewed atomic.Bool
59+
acmeStarted bool
60+
acmeFingerprint string
61+
acmeCancel context.CancelFunc
5862
}
5963

6064
// certMaterial is an immutable snapshot of PEM-encoded cert + key.
@@ -131,14 +135,22 @@ func (m *Manager) loadPersistedPEM() bool {
131135
}
132136

133137
// Reconfigure applies a new cert configuration at runtime (e.g. from panel push).
134-
// For ACME modes that are already running this is a no-op — a process restart
135-
// is required to change ACME settings.
136138
// Returns true when the PEM material changed (caller should restart kernel).
137139
func (m *Manager) Reconfigure(ctx context.Context, newCfg config.CertConfig) (bool, error) {
138140
if newCfg.CertDir == "" {
139141
newCfg.CertDir = m.cfg.CertDir
140142
}
141143

144+
// If ACME is running and the new config materially differs (or switches
145+
// away from ACME), tear down the old certmagic instance first.
146+
if m.acmeStarted {
147+
newFp := acmeFingerprint(newCfg)
148+
newMode := resolveModeFor(newCfg)
149+
if newMode != "http" && newMode != "dns" || newFp != m.acmeFingerprint {
150+
m.tearDownACME()
151+
}
152+
}
153+
142154
oldTLS := m.TLSCert()
143155
m.cfg = newCfg
144156

@@ -150,6 +162,65 @@ func (m *Manager) Reconfigure(ctx context.Context, newCfg config.CertConfig) (bo
150162
return !pemEqual(oldTLS, newTLS), nil
151163
}
152164

165+
// tearDownACME cancels the running certmagic background goroutines and clears
166+
// related state so a fresh ACME init can run.
167+
func (m *Manager) tearDownACME() {
168+
if m.acmeCancel != nil {
169+
m.acmeCancel()
170+
m.acmeCancel = nil
171+
}
172+
m.magic = nil
173+
m.acmeStarted = false
174+
m.acmeFingerprint = ""
175+
m.mat.Store(nil)
176+
}
177+
178+
// acmeFingerprint produces a stable string capturing every config field that
179+
// requires re-running ACME when changed.
180+
func acmeFingerprint(cfg config.CertConfig) string {
181+
keys := make([]string, 0, len(cfg.DNSEnv))
182+
for k := range cfg.DNSEnv {
183+
keys = append(keys, k)
184+
}
185+
sort.Strings(keys)
186+
var b strings.Builder
187+
b.WriteString(strings.ToLower(strings.TrimSpace(cfg.CertMode)))
188+
b.WriteByte('|')
189+
b.WriteString(strings.TrimSpace(cfg.Domain))
190+
b.WriteByte('|')
191+
b.WriteString(strings.TrimSpace(cfg.Email))
192+
b.WriteByte('|')
193+
b.WriteString(strings.TrimSpace(cfg.DNSProvider))
194+
b.WriteByte('|')
195+
for _, k := range keys {
196+
b.WriteString(k)
197+
b.WriteByte('=')
198+
b.WriteString(cfg.DNSEnv[k])
199+
b.WriteByte(';')
200+
}
201+
sum := sha256.Sum256([]byte(b.String()))
202+
return hex.EncodeToString(sum[:])
203+
}
204+
205+
// resolveModeFor mirrors (*Manager).resolveMode() for an arbitrary cfg, used
206+
// by Reconfigure to decide tear-down without mutating manager state.
207+
func resolveModeFor(cfg config.CertConfig) string {
208+
mode := strings.ToLower(strings.TrimSpace(cfg.CertMode))
209+
if mode != "" {
210+
return mode
211+
}
212+
if cfg.AutoTLS {
213+
return "http"
214+
}
215+
if cfg.CertContent != "" && cfg.KeyContent != "" {
216+
return "content"
217+
}
218+
if cfg.CertFile != "" && cfg.KeyFile != "" {
219+
return "file"
220+
}
221+
return "none"
222+
}
223+
153224
// resolveMode returns the effective cert mode, handling backward compat for auto_tls.
154225
func (m *Manager) resolveMode() string {
155226
mode := strings.ToLower(strings.TrimSpace(m.cfg.CertMode))
@@ -320,10 +391,8 @@ func (m *Manager) startContent() error {
320391
// ─── Mode: http / dns (ACME) ──────────────────────────────────────────────
321392

322393
func (m *Manager) startACME(ctx context.Context, dnsSolver *certmagic.DNS01Solver) error {
323-
// Guard: certmagic's background goroutines must only be started once.
324-
// A process restart is required to change ACME settings at runtime.
394+
// Idempotent for unchanged config: Reconfigure already tore down stale ACME state.
325395
if m.acmeStarted {
326-
nlog.Core().Info("cert: ACME already active, skipping re-init (restart to change ACME settings)")
327396
return nil
328397
}
329398

@@ -347,9 +416,23 @@ func (m *Manager) startACME(ctx context.Context, dnsSolver *certmagic.DNS01Solve
347416
magic = certmagic.New(cache, certmagic.Config{
348417
Storage: storage,
349418
OnEvent: func(evtCtx context.Context, event string, data map[string]any) error {
350-
if event == "cert_obtained" {
351-
m.loadCertFromStorage(evtCtx, storage, data)
419+
// Only react to renewals; initial load happens explicitly after ObtainCertSync.
420+
if event != "cert_obtained" {
421+
return nil
422+
}
423+
if renewed, _ := data["renewal"].(bool); !renewed {
424+
return nil
352425
}
426+
issuerKey, _ := data["issuer"].(string)
427+
if issuerKey == "" {
428+
return nil
429+
}
430+
if err := m.loadPEMFromStorage(evtCtx, storage, issuerKey, m.cfg.Domain); err != nil {
431+
nlog.Core().Error("failed to reload cert after renewal", "error", err)
432+
return nil
433+
}
434+
m.renewed.Store(true)
435+
nlog.Core().Info("TLS certificate reloaded after renewal", "domain", m.cfg.Domain)
353436
return nil
354437
},
355438
})
@@ -379,19 +462,27 @@ func (m *Manager) startACME(ctx context.Context, dnsSolver *certmagic.DNS01Solve
379462
}
380463
m.magic = magic
381464

382-
if err := magic.ObtainCertSync(ctx, m.cfg.Domain); err != nil {
465+
// Derived ctx so Reconfigure can cancel ACME background goroutines.
466+
acmeCtx, acmeCancel := context.WithCancel(ctx)
467+
468+
if err := magic.ObtainCertSync(acmeCtx, m.cfg.Domain); err != nil {
469+
acmeCancel()
383470
return fmt.Errorf("obtain certificate: %w", err)
384471
}
385472

386-
// The cert_obtained event callback should have loaded PEM. Verify.
387-
if !m.HasCert() {
388-
return fmt.Errorf("ACME obtained certificate but PEM was not loaded; check storage")
473+
issuerKey := magic.Issuers[0].IssuerKey()
474+
if err := m.loadPEMFromStorage(acmeCtx, storage, issuerKey, m.cfg.Domain); err != nil {
475+
acmeCancel()
476+
return fmt.Errorf("load cert from storage: %w", err)
389477
}
390478

391-
if err := magic.ManageAsync(ctx, []string{m.cfg.Domain}); err != nil {
479+
if err := magic.ManageAsync(acmeCtx, []string{m.cfg.Domain}); err != nil {
480+
acmeCancel()
392481
return fmt.Errorf("start cert manager: %w", err)
393482
}
394483

484+
m.acmeCancel = acmeCancel
485+
m.acmeFingerprint = acmeFingerprint(m.cfg)
395486
m.acmeStarted = true
396487
return nil
397488
}
@@ -433,31 +524,26 @@ func (m *Manager) newDNSProvider() (certmagic.DNSProvider, error) {
433524

434525
// ─── Helpers ───────────────────────────────────────────────────────────────
435526

436-
// loadCertFromStorage reads ACME cert/key from certmagic's storage into memory.
437-
func (m *Manager) loadCertFromStorage(ctx context.Context, storage certmagic.Storage, data map[string]any) {
438-
certKey, _ := data["certificate_path"].(string)
439-
keyKey, _ := data["private_key_path"].(string)
440-
if certKey == "" || keyKey == "" {
441-
nlog.Core().Warn("cert_obtained event missing storage paths", "data", data)
442-
return
527+
// loadPEMFromStorage reads cert + key for domain from certmagic storage into
528+
// the in-memory cache. Single entry point for both initial load and renewal refresh.
529+
func (m *Manager) loadPEMFromStorage(ctx context.Context, storage certmagic.Storage, issuerKey, domain string) error {
530+
if issuerKey == "" || domain == "" {
531+
return fmt.Errorf("loadPEMFromStorage: issuerKey and domain are required")
443532
}
533+
certKey := certmagic.StorageKeys.SiteCert(issuerKey, domain)
534+
keyKey := certmagic.StorageKeys.SitePrivateKey(issuerKey, domain)
444535

445536
certPEM, err := storage.Load(ctx, certKey)
446537
if err != nil {
447-
nlog.Core().Error("failed to load cert from ACME storage", "error", err)
448-
return
538+
return fmt.Errorf("load cert %q: %w", certKey, err)
449539
}
450540
keyPEM, err := storage.Load(ctx, keyKey)
451541
if err != nil {
452-
nlog.Core().Error("failed to load key from ACME storage", "error", err)
453-
return
542+
return fmt.Errorf("load key %q: %w", keyKey, err)
454543
}
455-
456-
m.storePEM(certPEM, keyPEM)
457-
458-
renewal, _ := data["renewal"].(bool)
459-
nlog.Core().Info("TLS certificate loaded from ACME storage", "domain", m.cfg.Domain, "renewal", renewal)
460-
if renewal {
461-
m.renewed.Store(true)
544+
if err := validateKeyPair(certPEM, keyPEM); err != nil {
545+
return fmt.Errorf("invalid cert/key pair on disk: %w", err)
462546
}
547+
m.storePEM(certPEM, keyPEM)
548+
return nil
463549
}

0 commit comments

Comments
 (0)