@@ -46,8 +46,7 @@ import (
46
46
// GetCertificate uses the context carried by clientHello (clientHello.Context()); use
47
47
// GetCertificateWithContext to provide a different context.
48
48
func (cfg * Config ) GetCertificate (clientHello * tls.ClientHelloInfo ) (* tls.Certificate , error ) {
49
- ctx := context .TODO () // TODO: get a proper context? from somewhere...
50
- return cfg .GetCertificateWithContext (ctx , clientHello )
49
+ return cfg .GetCertificateWithContext (clientHello .Context (), clientHello )
51
50
}
52
51
53
52
func (cfg * Config ) GetCertificateWithContext (ctx context.Context , clientHello * tls.ClientHelloInfo ) (* tls.Certificate , error ) {
@@ -276,15 +275,15 @@ func (cfg *Config) getCertDuringHandshake(ctx context.Context, hello *tls.Client
276
275
name := cfg .getNameFromClientHello (hello )
277
276
278
277
// By this point, we need to load or obtain a certificate. If a swarm of requests comes in for the same
279
- // domain, avoid pounding manager or storage thousands of times simultaneously. We do a similar sync
278
+ // domain, avoid pounding manager or storage thousands of times simultaneously. We use a similar sync
280
279
// strategy for obtaining a certificate during the handshake.
281
280
certLoadWaitChansMu .Lock ()
282
281
wait , ok := certLoadWaitChans [name ]
283
282
if ok {
284
283
// another goroutine is already loading the cert; just wait and we'll get it from the in-memory cache
285
284
certLoadWaitChansMu .Unlock ()
286
285
287
- timeout := time .NewTimer (2 * time .Minute ) // TODO: have Caddy use the context param to establish a timeout
286
+ timeout := time .NewTimer (2 * time .Minute )
288
287
select {
289
288
case <- timeout .C :
290
289
return Certificate {}, fmt .Errorf ("timed out waiting to load certificate for %s" , name )
@@ -480,6 +479,9 @@ func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.Cli
480
479
// wait for it to finish obtaining the cert and then we'll use it.
481
480
obtainCertWaitChansMu .Unlock ()
482
481
482
+ log .Debug ("new certificate is needed, but is already being obtained; waiting for that issuance to complete" ,
483
+ zap .String ("subject" , name ))
484
+
483
485
// TODO: see if we can get a proper context in here, for true cancellation
484
486
timeout := time .NewTimer (2 * time .Minute )
485
487
select {
@@ -489,7 +491,9 @@ func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.Cli
489
491
timeout .Stop ()
490
492
}
491
493
492
- return cfg .loadCertFromStorage (ctx , log , hello )
494
+ // it should now be loaded in the cache, ready to go; if not,
495
+ // the goroutine in charge of that probably had an error
496
+ return cfg .getCertDuringHandshake (ctx , hello , false )
493
497
}
494
498
495
499
// looks like it's up to us to do all the work and obtain the cert.
@@ -507,28 +511,28 @@ func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.Cli
507
511
508
512
log .Info ("obtaining new certificate" , zap .String ("server_name" , name ))
509
513
510
- // TODO: we are only adding a timeout because we don't know if the context passed in is actually cancelable...
514
+ // set a timeout so we don't inadvertently hold a client handshake open too long
511
515
// (timeout duration is based on https://caddy.community/t/zerossl-dns-challenge-failing-often-route53-plugin/13822/24?u=matt)
512
516
var cancel context.CancelFunc
513
517
ctx , cancel = context .WithTimeout (ctx , 180 * time .Second )
514
518
defer cancel ()
515
519
516
- // Obtain the certificate
520
+ // obtain the certificate (this puts it in storage) and if successful,
521
+ // load it from storage so we and any other waiting goroutine can use it
522
+ var cert Certificate
517
523
err := cfg .ObtainCertAsync (ctx , name )
524
+ if err == nil {
525
+ // load from storage while others wait to make the op as atomic as possible
526
+ cert , err = cfg .loadCertFromStorage (ctx , log , hello )
527
+ if err != nil {
528
+ log .Error ("loading newly-obtained certificate from storage" , zap .String ("server_name" , name ), zap .Error (err ))
529
+ }
530
+ }
518
531
519
- // immediately unblock anyone waiting for it; doing this in
520
- // a defer would risk deadlock because of the recursive call
521
- // to getCertDuringHandshake below when we return!
532
+ // immediately unblock anyone waiting for it
522
533
unblockWaiters ()
523
534
524
- if err != nil {
525
- // shucks; failed to solve challenge on-demand
526
- return Certificate {}, err
527
- }
528
-
529
- // success; certificate was just placed on disk, so
530
- // we need only restart serving the certificate
531
- return cfg .loadCertFromStorage (ctx , log , hello )
535
+ return cert , err
532
536
}
533
537
534
538
// handshakeMaintenance performs a check on cert for expiration and OCSP validity.
@@ -611,7 +615,7 @@ func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHe
611
615
//
612
616
// This function is safe for use by multiple concurrent goroutines.
613
617
func (cfg * Config ) renewDynamicCertificate (ctx context.Context , hello * tls.ClientHelloInfo , currentCert Certificate ) (Certificate , error ) {
614
- log := cfg .Logger .Named ("on_demand" )
618
+ log := logWithRemote ( cfg .Logger .Named ("on_demand" ), hello )
615
619
616
620
name := cfg .getNameFromClientHello (hello )
617
621
timeLeft := time .Until (expiresAt (currentCert .Leaf ))
@@ -651,7 +655,9 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
651
655
timeout .Stop ()
652
656
}
653
657
654
- return cfg .loadCertFromStorage (ctx , log , hello )
658
+ // it should now be loaded in the cache, ready to go; if not,
659
+ // the goroutine in charge of that probably had an error
660
+ return cfg .getCertDuringHandshake (ctx , hello , false )
655
661
}
656
662
657
663
// looks like it's up to us to do all the work and renew the cert
@@ -703,16 +709,8 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
703
709
} else {
704
710
err = cfg .RenewCertAsync (ctx , name , false )
705
711
if err == nil {
706
- // even though the recursive nature of the dynamic cert loading
707
- // would just call this function anyway, we do it here to
708
- // make the replacement as atomic as possible.
709
- newCert , err = cfg .CacheManagedCertificate (ctx , name )
710
- if err != nil {
711
- log .Error ("loading renewed certificate" , zap .String ("server_name" , name ), zap .Error (err ))
712
- } else {
713
- // replace the old certificate with the new one
714
- cfg .certCache .replaceCertificate (currentCert , newCert )
715
- }
712
+ // load from storage while in lock to make the replacement as atomic as possible
713
+ newCert , err = cfg .reloadManagedCertificate (ctx , currentCert )
716
714
}
717
715
}
718
716
@@ -722,11 +720,10 @@ func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.Clien
722
720
unblockWaiters ()
723
721
724
722
if err != nil {
725
- log .Error ("renewing and reloading certificate" , zap .Error (err ))
726
- return newCert , err
723
+ log .Error ("renewing and reloading certificate" , zap .String ("server_name" , name ), zap .Error (err ))
727
724
}
728
725
729
- return cfg . loadCertFromStorage ( ctx , log , hello )
726
+ return newCert , err
730
727
}
731
728
732
729
// if the certificate hasn't expired, we can serve what we have and renew in the background
@@ -872,6 +869,8 @@ var (
872
869
obtainCertWaitChans = make (map [string ]chan struct {})
873
870
obtainCertWaitChansMu sync.Mutex
874
871
)
872
+
873
+ // TODO: this lockset should probably be per-cache
875
874
var (
876
875
certLoadWaitChans = make (map [string ]chan struct {})
877
876
certLoadWaitChansMu sync.Mutex
0 commit comments