Skip to content

Commit 0e88b3e

Browse files
authored
Initial implementation of ARI (#286)
* Initial implementation of ARI * Enhance redundancy, robustness, and logging * Improve ARI updating; integrate on-demand TLS; detect changed window
1 parent fa7161a commit 0e88b3e

11 files changed

+528
-100
lines changed

acmeclient.go

+45-32
Original file line numberDiff line numberDiff line change
@@ -137,44 +137,21 @@ func (iss *ACMEIssuer) newACMEClientWithAccount(ctx context.Context, useTestCA,
137137
// independent of any particular ACME account. If useTestCA is true, am.TestCA
138138
// will be used if it is set; otherwise, the primary CA will be used.
139139
func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
140-
// ensure defaults are filled in
141-
var caURL string
142-
if useTestCA {
143-
caURL = iss.TestCA
144-
}
145-
if caURL == "" {
146-
caURL = iss.CA
140+
client, err := iss.newBasicACMEClient()
141+
if err != nil {
142+
return nil, err
147143
}
148-
if caURL == "" {
149-
caURL = DefaultACME.CA
144+
145+
// fill in a little more beyond a basic client
146+
if useTestCA && iss.TestCA != "" {
147+
client.Client.Directory = iss.TestCA
150148
}
151149
certObtainTimeout := iss.CertObtainTimeout
152150
if certObtainTimeout == 0 {
153151
certObtainTimeout = DefaultACME.CertObtainTimeout
154152
}
155-
156-
// ensure endpoint is secure (assume HTTPS if scheme is missing)
157-
if !strings.Contains(caURL, "://") {
158-
caURL = "https://" + caURL
159-
}
160-
u, err := url.Parse(caURL)
161-
if err != nil {
162-
return nil, err
163-
}
164-
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
165-
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
166-
}
167-
168-
client := &acmez.Client{
169-
Client: &acme.Client{
170-
Directory: caURL,
171-
PollTimeout: certObtainTimeout,
172-
UserAgent: buildUAString(),
173-
HTTPClient: iss.httpClient,
174-
},
175-
ChallengeSolvers: make(map[string]acmez.Solver),
176-
}
177-
client.Logger = iss.Logger.Named("acme_client")
153+
client.Client.PollTimeout = certObtainTimeout
154+
client.ChallengeSolvers = make(map[string]acmez.Solver)
178155

179156
// configure challenges (most of the time, DNS challenge is
180157
// exclusive of other ones because it is usually only used
@@ -230,6 +207,42 @@ func (iss *ACMEIssuer) newACMEClient(useTestCA bool) (*acmez.Client, error) {
230207
return client, nil
231208
}
232209

210+
// newBasicACMEClient sets up a basically-functional ACME client that is not capable
211+
// of solving challenges but can provide basic interactions with the server.
212+
func (iss *ACMEIssuer) newBasicACMEClient() (*acmez.Client, error) {
213+
caURL := iss.CA
214+
if caURL == "" {
215+
caURL = DefaultACME.CA
216+
}
217+
// ensure endpoint is secure (assume HTTPS if scheme is missing)
218+
if !strings.Contains(caURL, "://") {
219+
caURL = "https://" + caURL
220+
}
221+
u, err := url.Parse(caURL)
222+
if err != nil {
223+
return nil, err
224+
}
225+
if u.Scheme != "https" && !SubjectIsInternal(u.Host) {
226+
return nil, fmt.Errorf("%s: insecure CA URL (HTTPS required for non-internal CA)", caURL)
227+
}
228+
return &acmez.Client{
229+
Client: &acme.Client{
230+
Directory: caURL,
231+
UserAgent: buildUAString(),
232+
HTTPClient: iss.httpClient,
233+
Logger: iss.Logger.Named("acme_client"),
234+
},
235+
}, nil
236+
}
237+
238+
func (iss *ACMEIssuer) getRenewalInfo(ctx context.Context, cert Certificate) (acme.RenewalInfo, error) {
239+
acmeClient, err := iss.newBasicACMEClient()
240+
if err != nil {
241+
return acme.RenewalInfo{}, err
242+
}
243+
return acmeClient.GetRenewalInfo(ctx, cert.Certificate.Leaf)
244+
}
245+
233246
func (iss *ACMEIssuer) getHTTPPort() int {
234247
useHTTPPort := HTTPChallengePort
235248
if HTTPPort > 0 && HTTPPort != HTTPChallengePort {

acmeissuer.go

+28-6
Original file line numberDiff line numberDiff line change
@@ -362,12 +362,13 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
362362
panic("missing config pointer (must use NewACMEIssuer)")
363363
}
364364

365-
var isRetry bool
366-
if attempts, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
367-
isRetry = *attempts > 0
365+
var attempts int
366+
if attemptsPtr, ok := ctx.Value(AttemptsCtxKey).(*int); ok {
367+
attempts = *attemptsPtr
368368
}
369+
isRetry := attempts > 0
369370

370-
cert, usedTestCA, err := am.doIssue(ctx, csr, isRetry)
371+
cert, usedTestCA, err := am.doIssue(ctx, csr, attempts)
371372
if err != nil {
372373
return nil, err
373374
}
@@ -395,7 +396,7 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
395396
// other endpoint. This is more likely to happen if a user is testing with
396397
// the staging CA as the main CA, then changes their configuration once they
397398
// think they are ready for the production endpoint.
398-
cert, _, err = am.doIssue(ctx, csr, false)
399+
cert, _, err = am.doIssue(ctx, csr, 0)
399400
if err != nil {
400401
// succeeded with test CA but failed just now with the production CA;
401402
// either we are observing differing internal states of each CA that will
@@ -423,7 +424,8 @@ func (am *ACMEIssuer) Issue(ctx context.Context, csr *x509.CertificateRequest) (
423424
return cert, err
424425
}
425426

426-
func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, useTestCA bool) (*IssuedCertificate, bool, error) {
427+
func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest, attempts int) (*IssuedCertificate, bool, error) {
428+
useTestCA := attempts > 0
427429
client, err := am.newACMEClientWithAccount(ctx, useTestCA, false)
428430
if err != nil {
429431
return nil, false, err
@@ -449,6 +451,22 @@ func (am *ACMEIssuer) doIssue(ctx context.Context, csr *x509.CertificateRequest,
449451
params.NotAfter = time.Now().Add(am.NotAfter)
450452
}
451453

454+
// Notify the ACME server we are replacing a certificate (if the caller says we are),
455+
// only if the following conditions are met:
456+
// - The caller has set a Replaces value in the context, indicating this is a renewal.
457+
// - Not using test CA. This should be obvious, but a test CA should be in a separate
458+
// environment from production, and thus not have knowledge of the cert being replaced.
459+
// - Not a certain attempt number. We skip setting Replaces once early on in the retries
460+
// in case the reason the order is failing is only because there is a state inconsistency
461+
// between client and server or some sort of bookkeeping error with regards to the certID
462+
// and the server is rejecting the ARI certID. In any case, an invalid certID may cause
463+
// orders to fail. So try once without setting it.
464+
if !usingTestCA && attempts != 2 {
465+
if replacing, ok := ctx.Value(ctxKeyARIReplaces).(*x509.Certificate); ok {
466+
params.Replaces = replacing
467+
}
468+
}
469+
452470
// do this in a loop because there's an error case that may necessitate a retry, but not more than once
453471
var certChains []acme.Certificate
454472
for i := 0; i < 2; i++ {
@@ -631,6 +649,10 @@ const (
631649
// prefixACME is the storage key prefix used for ACME-specific assets.
632650
const prefixACME = "acme"
633651

652+
type ctxKey string
653+
654+
const ctxKeyARIReplaces = ctxKey("ari_replaces")
655+
634656
// Interface guards
635657
var (
636658
_ PreChecker = (*ACMEIssuer)(nil)

certificates.go

+146-13
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,15 @@ import (
1818
"context"
1919
"crypto/tls"
2020
"crypto/x509"
21+
"encoding/json"
2122
"fmt"
23+
"math/rand"
2224
"net"
2325
"os"
2426
"strings"
2527
"time"
2628

29+
"github.com/mholt/acmez/v2/acme"
2730
"go.uber.org/zap"
2831
"golang.org/x/crypto/ocsp"
2932
)
@@ -56,6 +59,9 @@ type Certificate struct {
5659

5760
// The unique string identifying the issuer of this certificate.
5861
issuerKey string
62+
63+
// ACME Renewal Information, if available
64+
ari acme.RenewalInfo
5965
}
6066

6167
// Empty returns true if the certificate struct is not filled out; at
@@ -67,10 +73,106 @@ func (cert Certificate) Empty() bool {
6773
// Hash returns a checksum of the certificate chain's DER-encoded bytes.
6874
func (cert Certificate) Hash() string { return cert.hash }
6975

70-
// NeedsRenewal returns true if the certificate is
71-
// expiring soon (according to cfg) or has expired.
76+
// NeedsRenewal returns true if the certificate is expiring
77+
// soon (according to ARI and/or cfg) or has expired.
7278
func (cert Certificate) NeedsRenewal(cfg *Config) bool {
73-
return currentlyInRenewalWindow(cert.Leaf.NotBefore, expiresAt(cert.Leaf), cfg.RenewalWindowRatio)
79+
return cfg.certNeedsRenewal(cert.Leaf, cert.ari, true)
80+
}
81+
82+
// certNeedsRenewal consults ACME Renewal Info (ARI) and certificate expiration to determine
83+
// whether the leaf certificate needs to be renewed yet. If true is returned, the certificate
84+
// should be renewed as soon as possible. The reasoning for a true return value is logged
85+
// unless emitLogs is false; this can be useful to suppress noisy logs in the case where you
86+
// first call this to determine if a cert in memory needs renewal, and then right after you
87+
// call it again to see if the cert in storage still needs renewal -- you probably don't want
88+
// to log the second time for checking the cert in storage which is mainly for synchronization.
89+
func (cfg *Config) certNeedsRenewal(leaf *x509.Certificate, ari acme.RenewalInfo, emitLogs bool) bool {
90+
expiration := expiresAt(leaf)
91+
92+
var logger *zap.Logger
93+
if emitLogs {
94+
logger = cfg.Logger.With(
95+
zap.Strings("subjects", leaf.DNSNames),
96+
zap.Time("expiration", expiration),
97+
zap.String("ari_cert_id", ari.UniqueIdentifier),
98+
zap.Timep("next_ari_update", ari.RetryAfter),
99+
zap.Duration("renew_check_interval", cfg.certCache.options.RenewCheckInterval),
100+
zap.Time("window_start", ari.SuggestedWindow.Start),
101+
zap.Time("window_end", ari.SuggestedWindow.End))
102+
} else {
103+
logger = zap.NewNop()
104+
}
105+
106+
// first check ARI: if it says it's time to renew, it's time to renew
107+
// (notice that we don't strictly require an ARI window to also exist; we presume
108+
// that if a time has been selected, a window does or did exist, even if it didn't
109+
// get stored/encoded for some reason - but also: this allows administrators to
110+
// manually or explicitly schedule a renewal time indepedently of ARI which could
111+
// be useful)
112+
selectedTime := ari.SelectedTime
113+
114+
// if, for some reason a random time in the window hasn't been selected yet, but an ARI
115+
// window does exist, we can always improvise one... even if this is called repeatedly,
116+
// a random time is a random time, whether you generate it once or more :D
117+
// (code borrowed from our acme package)
118+
if selectedTime.IsZero() &&
119+
(!ari.SuggestedWindow.Start.IsZero() && !ari.SuggestedWindow.End.IsZero()) {
120+
start, end := ari.SuggestedWindow.Start.Unix()+1, ari.SuggestedWindow.End.Unix()
121+
selectedTime = time.Unix(rand.Int63n(end-start)+start, 0).UTC()
122+
logger.Warn("no renewal time had been selected with ARI; chose an ephemeral one for now",
123+
zap.Time("ephemeral_selected_time", selectedTime))
124+
}
125+
126+
// if a renewal time has been selected, start with that
127+
if !selectedTime.IsZero() {
128+
// ARI spec recommends an algorithm that renews after the randomly-selected
129+
// time OR just before it if the next waking time would be after it; this
130+
// cutoff can actually be before the start of the renewal window, but the spec
131+
// author says that's OK: https://github.com/aarongable/draft-acme-ari/issues/71
132+
cutoff := ari.SelectedTime.Add(-cfg.certCache.options.RenewCheckInterval)
133+
if time.Now().After(cutoff) {
134+
logger.Info("certificate needs renewal based on ARI window",
135+
zap.Time("selected_time", selectedTime),
136+
zap.Time("renewal_cutoff", cutoff))
137+
return true
138+
}
139+
140+
// according to ARI, we are not ready to renew; however, we do not rely solely on
141+
// ARI calculations... what if there is a bug in our implementation, or in the
142+
// server's, or the stored metadata? for redundancy, give credence to the expiration
143+
// date; ignore ARI if we are past a "dangerously close" limit, to avoid any
144+
// possibility of a bug in ARI compromising a site's uptime: we should always always
145+
// always give heed to actual validity period
146+
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/20.0) {
147+
logger.Warn("certificate is in emergency renewal window; superceding ARI",
148+
zap.Duration("remaining", time.Until(expiration)),
149+
zap.Time("renewal_cutoff", cutoff))
150+
return true
151+
}
152+
153+
}
154+
155+
// the normal check, in the absence of ARI, is to determine if we're near enough (or past)
156+
// the expiration date based on the configured remaining:lifetime ratio
157+
if currentlyInRenewalWindow(leaf.NotBefore, expiration, cfg.RenewalWindowRatio) {
158+
logger.Info("certificate is in configured renewal window based on expiration date",
159+
zap.Duration("remaining", time.Until(expiration)))
160+
return true
161+
}
162+
163+
// finally, if the certificate is expiring imminently, always attempt a renewal;
164+
// we check both a (very low) lifetime ratio and also a strict difference between
165+
// the time until expiration and the interval at which we run the standard maintenance
166+
// routine to check for renewals, to accommodate both exceptionally long and short
167+
// cert lifetimes
168+
if currentlyInRenewalWindow(leaf.NotBefore, expiration, 1.0/50.0) ||
169+
time.Until(expiration) < cfg.certCache.options.RenewCheckInterval*5 {
170+
logger.Warn("certificate is in emergency renewal window; expiration imminent",
171+
zap.Duration("remaining", time.Until(expiration)))
172+
return true
173+
}
174+
175+
return false
74176
}
75177

76178
// Expired returns true if the certificate has expired.
@@ -85,10 +187,12 @@ func (cert Certificate) Expired() bool {
85187
return time.Now().After(expiresAt(cert.Leaf))
86188
}
87189

88-
// currentlyInRenewalWindow returns true if the current time is
89-
// within the renewal window, according to the given start/end
190+
// currentlyInRenewalWindow returns true if the current time is within
191+
// (or after) the renewal window, according to the given start/end
90192
// dates and the ratio of the renewal window. If true is returned,
91-
// the certificate being considered is due for renewal.
193+
// the certificate being considered is due for renewal. The ratio
194+
// is remaining:total time, i.e. 1/3 = 1/3 of lifetime remaining,
195+
// or 9/10 = 9/10 of lifetime remaining.
92196
func currentlyInRenewalWindow(notBefore, notAfter time.Time, renewalWindowRatio float64) bool {
93197
if notAfter.IsZero() {
94198
return false
@@ -154,9 +258,37 @@ func (cfg *Config) loadManagedCertificate(ctx context.Context, domain string) (C
154258
}
155259
cert.managed = true
156260
cert.issuerKey = certRes.issuerKey
261+
if ari, err := certRes.getARI(); err == nil && ari != nil {
262+
cert.ari = *ari
263+
}
157264
return cert, nil
158265
}
159266

267+
// getARI unpacks ACME Renewal Information from the issuer data, if available.
268+
// It is only an error if there is invalid JSON.
269+
func (certRes CertificateResource) getARI() (*acme.RenewalInfo, error) {
270+
acmeData, err := certRes.getACMEData()
271+
if err != nil {
272+
return nil, err
273+
}
274+
return acmeData.RenewalInfo, nil
275+
}
276+
277+
// getACMEData returns the ACME certificate metadata from the IssuerData, but
278+
// note that a non-ACME-issued certificate may return an empty value and nil
279+
// since the JSON may still decode successfully but just not match any or all
280+
// of the fields. Remember that the IssuerKey is used to store and access the
281+
// cert files in the first place (it is part of the path) so in theory if you
282+
// load a CertificateResource from an ACME issuer it should work as expected.
283+
func (certRes CertificateResource) getACMEData() (acme.Certificate, error) {
284+
if len(certRes.IssuerData) == 0 {
285+
return acme.Certificate{}, nil
286+
}
287+
var acmeCert acme.Certificate
288+
err := json.Unmarshal(certRes.IssuerData, &acmeCert)
289+
return acmeCert, err
290+
}
291+
160292
// CacheUnmanagedCertificatePEMFile loads a certificate for host using certFile
161293
// and keyFile, which must be in PEM format. It stores the certificate in
162294
// the in-memory cache and returns the hash, useful for removing from the cache.
@@ -329,21 +461,22 @@ func fillCertFromLeaf(cert *Certificate, tlsCert tls.Certificate) error {
329461
return nil
330462
}
331463

332-
// managedCertInStorageExpiresSoon returns true if cert (being a
333-
// managed certificate) is expiring within RenewDurationBefore.
334-
// It returns false if there was an error checking the expiration
335-
// of the certificate as found in storage, or if the certificate
336-
// in storage is NOT expiring soon. A certificate that is expiring
464+
// managedCertInStorageNeedsRenewal returns true if cert (being a
465+
// managed certificate) is expiring soon (according to cfg) or if
466+
// ACME Renewal Information (ARI) is available and says that it is
467+
// time to renew (it uses existing ARI; it does not update it).
468+
// It returns false if there was an error, or if the cert is not expiring
469+
// soon and the ARI window is still in the future. A certificate that is expiring
337470
// soon in our cache but is not expiring soon in storage probably
338471
// means that another instance renewed the certificate in the
339472
// meantime, and it would be a good idea to simply load the cert
340473
// into our cache rather than repeating the renewal process again.
341-
func (cfg *Config) managedCertInStorageExpiresSoon(ctx context.Context, cert Certificate) (bool, error) {
474+
func (cfg *Config) managedCertInStorageNeedsRenewal(ctx context.Context, cert Certificate) (bool, error) {
342475
certRes, err := cfg.loadCertResourceAnyIssuer(ctx, cert.Names[0])
343476
if err != nil {
344477
return false, err
345478
}
346-
_, needsRenew := cfg.managedCertNeedsRenewal(certRes)
479+
_, _, needsRenew := cfg.managedCertNeedsRenewal(certRes, false)
347480
return needsRenew, nil
348481
}
349482

0 commit comments

Comments
 (0)