Skip to content

Commit 3131826

Browse files
committed
fix: provide NTS sync with bad initial clock state
In the initial bootstrap, if the clock (RTC) is badly out of sync, NTS can't verify server certificate timestamp properly, so provide a fallback path that is only activated in the initial boot sequence to ignore certificate timestamp until the time is in sync at least once. Also add a test with `--bad-rtc` option we had previously to verify this flow. The controller will log a warning if certificate timestamp was ignored in the initial bootstrap sync. Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
1 parent 89e307e commit 3131826

9 files changed

Lines changed: 329 additions & 12 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# THIS FILE WAS AUTOMATICALLY GENERATED BY KRES, PLEASE DO NOT EDIT.
22
#
3-
# Generated on 2026-05-29T09:06:16Z by kres e1a258d.
3+
# Generated on 2026-06-03T12:12:42Z by kres d4e4c90.
44

55
concurrency:
66
group: ${{ github.head_ref || github.run_id }}
@@ -3537,6 +3537,9 @@ jobs:
35373537
- test: e2e-k8s-user-namespace
35383538
variant: default
35393539
withConfigPatch: '@hack/test/patches/usernamespace.yaml'
3540+
- test: e2e-bad-rtc
3541+
variant: default
3542+
withBadRtc: "true"
35403543
- extraTestArgs: -talos.enforcing
35413544
tagSuffixIn: -enforcing
35423545
test: e2e-siderolink
@@ -3564,6 +3567,12 @@ jobs:
35643567
variant: enforcing
35653568
withConfigPatch: '@hack/test/patches/usernamespace.yaml'
35663569
withEnforcing: "true"
3570+
- extraTestArgs: -talos.enforcing
3571+
tagSuffixIn: -enforcing
3572+
test: e2e-bad-rtc
3573+
variant: enforcing
3574+
withBadRtc: "true"
3575+
withEnforcing: "true"
35673576
fail-fast: false
35683577
max-parallel: 2
35693578
needs:
@@ -3627,6 +3636,7 @@ jobs:
36273636
TAG_SUFFIX_IN: ${{ matrix.tagSuffixIn }}
36283637
VIA_MAINTENANCE_MODE: ${{ matrix.viaMaintenanceMode }}
36293638
WITH_APPARMOR_LSM_ENABLED: ${{ matrix.withAppArmorLsmEnabled }}
3639+
WITH_BAD_RTC: ${{ matrix.withBadRtc }}
36303640
WITH_CONFIG_PATCH: ${{ matrix.withConfigPatch }}
36313641
WITH_ENFORCING: ${{ matrix.withEnforcing }}
36323642
WITH_SIDEROLINK_AGENT: ${{ matrix.withSiderolinkAgent }}

.github/workflows/integration-misc-4-triggered.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# THIS FILE WAS AUTOMATICALLY GENERATED BY KRES, PLEASE DO NOT EDIT.
22
#
3-
# Generated on 2026-05-29T09:06:16Z by kres e1a258d.
3+
# Generated on 2026-06-03T12:12:42Z by kres d4e4c90.
44

55
concurrency:
66
group: ${{ github.head_ref || github.run_id }}
@@ -41,6 +41,9 @@ jobs:
4141
- test: e2e-k8s-user-namespace
4242
variant: default
4343
withConfigPatch: '@hack/test/patches/usernamespace.yaml'
44+
- test: e2e-bad-rtc
45+
variant: default
46+
withBadRtc: "true"
4447
- extraTestArgs: -talos.enforcing
4548
tagSuffixIn: -enforcing
4649
test: e2e-siderolink
@@ -68,6 +71,12 @@ jobs:
6871
variant: enforcing
6972
withConfigPatch: '@hack/test/patches/usernamespace.yaml'
7073
withEnforcing: "true"
74+
- extraTestArgs: -talos.enforcing
75+
tagSuffixIn: -enforcing
76+
test: e2e-bad-rtc
77+
variant: enforcing
78+
withBadRtc: "true"
79+
withEnforcing: "true"
7180
fail-fast: false
7281
max-parallel: 2
7382
steps:
@@ -130,6 +139,7 @@ jobs:
130139
TAG_SUFFIX_IN: ${{ matrix.tagSuffixIn }}
131140
VIA_MAINTENANCE_MODE: ${{ matrix.viaMaintenanceMode }}
132141
WITH_APPARMOR_LSM_ENABLED: ${{ matrix.withAppArmorLsmEnabled }}
142+
WITH_BAD_RTC: ${{ matrix.withBadRtc }}
133143
WITH_CONFIG_PATCH: ${{ matrix.withConfigPatch }}
134144
WITH_ENFORCING: ${{ matrix.withEnforcing }}
135145
WITH_SIDEROLINK_AGENT: ${{ matrix.withSiderolinkAgent }}

.kres.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1250,6 +1250,9 @@ spec:
12501250
- variant: default
12511251
test: e2e-k8s-user-namespace
12521252
withConfigPatch: "@hack/test/patches/usernamespace.yaml"
1253+
- variant: default
1254+
test: e2e-bad-rtc
1255+
withBadRtc: "true"
12531256
- variant: enforcing
12541257
test: e2e-siderolink
12551258
triggerLabels: [integration/misc-4-enforcing]
@@ -1281,6 +1284,13 @@ spec:
12811284
withEnforcing: "true"
12821285
extraTestArgs: -talos.enforcing
12831286
withConfigPatch: "@hack/test/patches/usernamespace.yaml"
1287+
- variant: enforcing
1288+
test: e2e-bad-rtc
1289+
triggerLabels: [integration/misc-4-enforcing]
1290+
tagSuffixIn: -enforcing
1291+
withEnforcing: "true"
1292+
extraTestArgs: -talos.enforcing
1293+
withBadRtc: "true"
12841294
steps:
12851295
- name: download-artifacts
12861296
artifactStep:
@@ -1299,6 +1309,7 @@ spec:
12991309
WITH_APPARMOR_LSM_ENABLED: ${{ matrix.withAppArmorLsmEnabled }}
13001310
WITH_CONFIG_PATCH: ${{ matrix.withConfigPatch }}
13011311
TAG_SUFFIX_IN: ${{ matrix.tagSuffixIn }}
1312+
WITH_BAD_RTC: ${{ matrix.withBadRtc }}
13021313
EXTRA_TEST_ARGS: ${{ matrix.extraTestArgs }}
13031314
WITH_ENFORCING: ${{ matrix.withEnforcing }}
13041315
IMAGE_REGISTRY: registry.dev.siderolabs.io

hack/test/e2e-qemu.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ case "${WITH_UEFI:-none}" in
3535
;;
3636
esac
3737

38+
case "${WITH_BAD_RTC:-none}" in
39+
true)
40+
QEMU_FLAGS+=("--bad-rtc")
41+
;;
42+
esac
43+
3844
case "${WITH_VIRTUAL_IP:-false}" in
3945
true)
4046
QEMU_FLAGS+=("--use-vip")

internal/pkg/ntp/consts.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,14 @@ const (
2222
EpochLimit = 15 * time.Minute
2323
// ExpectedAccuracy is the expected time sync accuracy, used to adjust poll interval.
2424
ExpectedAccuracy = 200 * time.Millisecond
25+
// NTSBootstrapAttempts is the number of initial NTS session establishment attempts
26+
// during which a TLS certificate validity-period (time) failure is tolerated by
27+
// falling back to a verifier which validates the chain and hostname but ignores
28+
// the certificate notBefore/notAfter.
29+
//
30+
// This works around the boot-time chicken-and-egg problem: NTS key exchange runs
31+
// over TLS, but the system clock used to validate the certificate may not be set
32+
// yet. After this many attempts (or once time has been synced), certificate
33+
// validation is always strict.
34+
NTSBootstrapAttempts = 5
2535
)

internal/pkg/ntp/interfaces.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,8 @@ type NTSSession interface {
3333

3434
// NTSNewSessionFunc creates an NTS session for a given server address.
3535
// Defaults to nts.NewSession wrapper; injectable for testing.
36-
type NTSNewSessionFunc func(address string) (NTSSession, error)
36+
//
37+
// When skipCertTimeCheck is true, the implementation should validate the TLS
38+
// certificate chain and hostname but ignore the certificate validity period.
39+
// This is used to bootstrap NTS before the system clock has been set.
40+
type NTSNewSessionFunc func(address string, skipCertTimeCheck bool) (NTSSession, error)

internal/pkg/ntp/ntp.go

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ package ntp
88
import (
99
"bytes"
1010
"context"
11+
"crypto/x509"
12+
"errors"
1113
"fmt"
1214
"math/bits"
1315
"net"
@@ -58,6 +60,13 @@ type Syncer struct {
5860
// NTS (Network Time Security) support
5961
UseNTS bool
6062
NTSNewSession NTSNewSessionFunc
63+
64+
// NTSBootstrapAttempts is the number of initial NTS session establishment
65+
// attempts during which a TLS certificate validity-period failure is tolerated
66+
// by retrying with a verifier that ignores certificate time constraints.
67+
NTSBootstrapAttempts int
68+
69+
ntsBootstrapUsed int
6170
}
6271

6372
// Measurement is a struct containing correction data based on a time request.
@@ -92,6 +101,8 @@ func NewSyncer(logger *zap.Logger, timeServers []string, useNTS bool) *Syncer {
92101

93102
UseNTS: useNTS,
94103
NTSNewSession: DefaultNTSNewSession,
104+
105+
NTSBootstrapAttempts: NTSBootstrapAttempts,
95106
}
96107

97108
return syncer
@@ -521,7 +532,7 @@ func (syncer *Syncer) getNTSSession(server string) (NTSSession, error) {
521532
return nil, fmt.Errorf("NTS session factory not configured")
522533
}
523534

524-
session, err := syncer.NTSNewSession(server)
535+
session, err := syncer.newNTSSession(server)
525536
if err != nil {
526537
return nil, err
527538
}
@@ -536,6 +547,51 @@ func (syncer *Syncer) getNTSSession(server string) (NTSSession, error) {
536547
return session, nil
537548
}
538549

550+
// newNTSSession establishes an NTS session, applying the boot-time bootstrap
551+
// workaround for the TLS certificate time check.
552+
//
553+
// It first attempts strict validation. If that fails with a certificate
554+
// validity-period error and the bootstrap budget is not yet exhausted (and time
555+
// has not been synced yet), it retries with the certificate time check disabled
556+
// while still validating the chain and hostname. After NTSBootstrapAttempts such
557+
// retries, or once time is synced, validation is always strict.
558+
func (syncer *Syncer) newNTSSession(server string) (NTSSession, error) {
559+
session, err := syncer.NTSNewSession(server, false)
560+
if err == nil {
561+
return session, nil
562+
}
563+
564+
if syncer.timeSyncNotified || syncer.ntsBootstrapUsed >= syncer.NTSBootstrapAttempts || !isCertTimeError(err) {
565+
return nil, err
566+
}
567+
568+
syncer.ntsBootstrapUsed++
569+
570+
syncer.logger.Warn(
571+
"NTS certificate time validation failed, retrying while ignoring certificate time constraints (system clock may not be set yet)",
572+
zap.String("server", server),
573+
zap.Int("bootstrap_attempts_used", syncer.ntsBootstrapUsed),
574+
zap.Int("bootstrap_attempts_limit", syncer.NTSBootstrapAttempts),
575+
zap.Error(err),
576+
)
577+
578+
return syncer.NTSNewSession(server, true)
579+
}
580+
581+
// isCertTimeError reports whether err is a TLS certificate validation failure
582+
// caused by the certificate validity period (expired or not yet valid).
583+
//
584+
// Other certificate failures (unknown authority, hostname mismatch, etc.) are
585+
// deliberately excluded: those are genuine trust failures which must not be
586+
// bypassed even during bootstrap.
587+
func isCertTimeError(err error) bool {
588+
if certErr, ok := errors.AsType[x509.CertificateInvalidError](err); ok {
589+
return certErr.Reason == x509.Expired
590+
}
591+
592+
return false
593+
}
594+
539595
// log2i returns 0 for v == 0 and v == 1.
540596
func log2i(v uint64) int {
541597
if v == 0 {

0 commit comments

Comments
 (0)