Skip to content

Commit a7ee20f

Browse files
committed
Improve resilience of datastore bootstrap reconcile from etcd
* Add store tests with fixtures * Try connecting to local etcd first, if it is available * Handle panics from etcd backend code * Don't try to read WAL and restore v3 snapshots as they almost never exist Signed-off-by: Brad Davidson <brad.davidson@rancher.com> (cherry picked from commit d300004) Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
1 parent cfce7ce commit a7ee20f

7 files changed

Lines changed: 405 additions & 57 deletions

File tree

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ require (
123123
github.com/opencontainers/cgroups v0.0.4
124124
github.com/opencontainers/go-digest v1.0.0
125125
github.com/opencontainers/image-spec v1.1.1
126-
github.com/opencontainers/selinux v1.13.0
127-
github.com/otiai10/copy v1.7.0
126+
github.com/opencontainers/selinux v1.13.1
127+
github.com/otiai10/copy v1.14.1
128128
github.com/pkg/errors v0.9.1
129129
github.com/prometheus/client_golang v1.23.2
130130
github.com/prometheus/common v0.66.1
@@ -138,6 +138,7 @@ require (
138138
github.com/rootless-containers/rootlesskit v1.1.1
139139
github.com/sirupsen/logrus v1.9.4
140140
github.com/spegel-org/spegel v0.6.0
141+
github.com/spf13/afero v1.15.0
141142
github.com/spf13/pflag v1.0.10
142143
github.com/stretchr/testify v1.11.1
143144
github.com/urfave/cli/v2 v2.27.7
@@ -405,6 +406,7 @@ require (
405406
github.com/nats-io/nuid v1.0.1 // indirect
406407
github.com/opencontainers/runtime-spec v1.2.1 // indirect
407408
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
409+
github.com/otiai10/mint v1.6.3 // indirect
408410
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
409411
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
410412
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect

go.sum

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,13 +1115,10 @@ github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 h1:
11151115
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626/go.mod h1:BRHJJd0E+cx42OybVYSgUvZmU0B8P9gZuRXlZUP7TKI=
11161116
github.com/opencontainers/selinux v1.13.0 h1:Zza88GWezyT7RLql12URvoxsbLfjFx988+LGaWfbL84=
11171117
github.com/opencontainers/selinux v1.13.0/go.mod h1:XxWTed+A/s5NNq4GmYScVy+9jzXhGBVEOAyucdRUY8s=
1118-
github.com/otiai10/copy v1.7.0 h1:hVoPiN+t+7d2nzzwMiDHPSOogsWAStewq3TwU05+clE=
1119-
github.com/otiai10/copy v1.7.0/go.mod h1:rmRl6QPdJj6EiUqXQ/4Nn2lLXoNQjFCQbbNrxgc/t3U=
1120-
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
1121-
github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs=
1122-
github.com/otiai10/mint v1.3.0/go.mod h1:F5AjcsTsWUqX+Na9fpHb52P8pcRX2CI6A3ctIT91xUo=
1123-
github.com/otiai10/mint v1.3.3 h1:7JgpsBaN0uMkyju4tbYHu0mnM55hNKVYLsXmwr15NQI=
1124-
github.com/otiai10/mint v1.3.3/go.mod h1:/yxELlJQ0ufhjUwhshSj+wFjZ78CnZ48/1wtmBH1OTc=
1118+
github.com/otiai10/copy v1.14.1 h1:5/7E6qsUMBaH5AnQ0sSLzzTg1oTECmcCmT6lvF45Na8=
1119+
github.com/otiai10/copy v1.14.1/go.mod h1:oQwrEDDOci3IM8dJF0d8+jnbfPDllW6vUjNc3DoZm9I=
1120+
github.com/otiai10/mint v1.6.3 h1:87qsV/aw1F5as1eH1zS/yqHY85ANKVMgkDrf9rcxbQs=
1121+
github.com/otiai10/mint v1.6.3/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
11251122
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
11261123
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
11271124
github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
@@ -1271,6 +1268,8 @@ github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY52
12711268
github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I=
12721269
github.com/spf13/afero v1.9.2/go.mod h1:iUV7ddyEEZPO5gA3zD4fJt6iStLlL+Lg4m2cihcDf8Y=
12731270
github.com/spf13/afero v1.10.0/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ=
1271+
github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I=
1272+
github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg=
12741273
github.com/spf13/cobra v1.10.0 h1:a5/WeUlSDCvV5a45ljW2ZFtV0bTDpkfSAj3uqB6Sc+0=
12751274
github.com/spf13/cobra v1.10.0/go.mod h1:9dhySC7dnTtEiqzmqfkLj47BslqLCUPMXjG2lj/NgoE=
12761275
github.com/spf13/pflag v1.0.8/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=

pkg/cluster/bootstrap.go

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@ import (
2020
"github.com/k3s-io/k3s/pkg/bootstrap"
2121
"github.com/k3s-io/k3s/pkg/clientaccess"
2222
"github.com/k3s-io/k3s/pkg/daemons/config"
23+
"github.com/k3s-io/k3s/pkg/daemons/executor"
2324
"github.com/k3s-io/k3s/pkg/etcd/store"
2425
"github.com/k3s-io/k3s/pkg/util"
2526
"github.com/k3s-io/k3s/pkg/version"
27+
"github.com/k3s-io/kine/pkg/endpoint"
28+
"github.com/k3s-io/kine/pkg/tls"
2629
"github.com/otiai10/copy"
2730
pkgerrors "github.com/pkg/errors"
2831
"github.com/sirupsen/logrus"
@@ -284,11 +287,36 @@ func (c *Cluster) ReconcileBootstrapData(ctx context.Context, buf io.ReadSeeker,
284287
}
285288

286289
var kv *mvccpb.KeyValue
290+
var storageClient store.ReadCloser
291+
292+
if executor.IsSelfHosted() {
293+
// etcd will never be running at this point when using embedded executor,
294+
// but other executors may opt to leave it running when the supervisor
295+
// process is down. In this case, try to connect to local etcd first; if
296+
// that fails fall back to reading direct from etcd store on disk.
297+
etcdConfig := endpoint.ETCDConfig{
298+
Endpoints: []string{fmt.Sprintf("https://%s:2379", c.config.Loopback(true))},
299+
TLSConfig: tls.Config{
300+
TrustedCAFile: c.config.Runtime.ETCDServerCA,
301+
CAFile: c.config.Runtime.ETCDServerCA,
302+
CertFile: c.config.Runtime.ClientETCDCert,
303+
KeyFile: c.config.Runtime.ClientETCDKey,
304+
},
305+
}
306+
storageClient, err = store.NewRemoteStore(etcdConfig)
307+
if err != nil {
308+
logrus.Infof("Unable to connect to etcd: %v; trying direct datastore access", err)
309+
storageClient = nil
310+
}
311+
}
287312

288-
storageClient, err := store.NewTemporaryStore(filepath.Join(c.config.DataDir, "db", "etcd"))
289-
if err != nil {
290-
return pkgerrors.WithMessage(err, "failed to create temporary datastore client")
313+
if storageClient == nil {
314+
storageClient, err = store.NewTemporaryStore(filepath.Join(c.config.DataDir, "db", "etcd"))
315+
if err != nil {
316+
return pkgerrors.WithMessage(err, "failed to create temporary datastore client")
317+
}
291318
}
319+
292320
defer storageClient.Close()
293321

294322
kv, c.saveBootstrap, err = getBootstrapKeyFromStorage(ctx, storageClient, normalizedToken, token)

pkg/etcd/store/store.go

Lines changed: 80 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"os"
8+
"runtime/debug"
89
"time"
910

1011
"github.com/k3s-io/kine/pkg/endpoint"
@@ -16,16 +17,14 @@ import (
1617
"go.etcd.io/etcd/client/pkg/v3/logutil"
1718
clientv3 "go.etcd.io/etcd/client/v3"
1819
"go.etcd.io/etcd/server/v3/config"
19-
"go.etcd.io/etcd/server/v3/etcdserver/api/snap"
20-
"go.etcd.io/etcd/server/v3/etcdserver/cindex"
2120
etcderrors "go.etcd.io/etcd/server/v3/etcdserver/errors"
2221
"go.etcd.io/etcd/server/v3/lease"
23-
"go.etcd.io/etcd/server/v3/storage"
2422
"go.etcd.io/etcd/server/v3/storage/backend"
2523
"go.etcd.io/etcd/server/v3/storage/mvcc"
2624
"go.etcd.io/etcd/server/v3/storage/schema"
27-
"go.etcd.io/etcd/server/v3/storage/wal"
25+
"go.uber.org/zap"
2826
"go.uber.org/zap/zapcore"
27+
"google.golang.org/grpc"
2928
)
3029

3130
// ReadCloser is a generic wrapper around a MVCC store that provides only read/close functions
@@ -62,9 +61,14 @@ func NewRemoteStore(config endpoint.ETCDConfig) (*RemoteStore, error) {
6261
if err != nil {
6362
return nil, err
6463
}
64+
logger = logger.Named("k3s.remotestore")
65+
66+
logrus.Infof("Opening etcd client connection with endpoints %v", config.Endpoints)
67+
6568
c, err := clientv3.New(clientv3.Config{
6669
Endpoints: config.Endpoints,
6770
DialTimeout: 5 * time.Second,
71+
DialOptions: []grpc.DialOption{grpc.WithBlock(), grpc.FailOnNonTempDialError(true)},
6872
Logger: logger,
6973
TLS: tlsConfig,
7074
})
@@ -163,7 +167,24 @@ func NewTemporaryStore(dataDir string) (*TemporaryStore, error) {
163167
return nil, err
164168
}
165169

166-
if err := copy.Copy(dataDir, tempDir, copy.Options{PreserveOwner: true}); err != nil {
170+
// only copy the bbolt backend database; we don't need the WAL, legacy v2
171+
// store snapshots, config file, or anything else.
172+
// ref: https://etcd.io/docs/v3.6/learning/persistent-storage-files/#long-leaving-files
173+
copyOpts := copy.Options{
174+
PreserveOwner: true,
175+
PreserveTimes: true,
176+
NumOfWorkers: 0,
177+
Sync: true,
178+
Skip: func(srcinfo os.FileInfo, src, dest string) (bool, error) {
179+
switch srcinfo.Name() {
180+
case "member", "snap", "db":
181+
return false, nil
182+
default:
183+
return true, nil
184+
}
185+
},
186+
}
187+
if err := copy.Copy(dataDir, tempDir, copyOpts); err != nil {
167188
return nil, err
168189
}
169190

@@ -198,69 +219,83 @@ type Store struct {
198219
be backend.Backend
199220
}
200221

201-
func NewStore(dataDir string) (*Store, error) {
202-
var currentIndex, latestIndex uint64
222+
func NewStore(dataDir string) (store *Store, rerr error) {
223+
s := &Store{}
224+
203225
logger, err := logutil.CreateDefaultZapLogger(zapcore.InfoLevel)
204226
if err != nil {
205227
return nil, err
206228
}
207229

230+
// etcd relies on panic/fatal errors to trigger process exit; we need to
231+
// handle it properly by recovering and returning an error.
232+
logger = logger.Named("k3s.store").WithOptions(
233+
zap.WithPanicHook(zapcore.WriteThenPanic),
234+
zap.WithFatalHook(zapcore.WriteThenPanic),
235+
)
236+
237+
// recover from zap panics and ensure kv and backened are closed on error
238+
defer func() {
239+
if err := recover(); err != nil {
240+
msg := fmt.Sprintf("panic: %v", err)
241+
if logrus.IsLevelEnabled(logrus.DebugLevel) {
242+
msg += " at " + string(debug.Stack())
243+
}
244+
rerr = errors.New(msg)
245+
}
246+
if rerr != nil && s != nil {
247+
go s.Close()
248+
}
249+
}()
250+
208251
cfg := config.ServerConfig{Logger: logger, DataDir: dataDir}
209252
path := cfg.BackendPath()
210253

211-
// need to check for backend path ourselves, as backend.New just logs a panic
212-
// via zap if it doesn't exist, which isn't fatal.
254+
// need to check for backend path ourselves, as backend.New just creates
255+
// a new empty database if the file does not exist or is empty.
213256
if _, err := os.Stat(path); err != nil {
214257
return nil, pkgerrors.WithMessage(err, "failed to stat MVCC KV store backend path")
215258
}
216259

217-
logrus.Infof("Opening etcd MVCC KV store at %s", path)
260+
logrus.Infof("Opening etcd MVCC KV backend database at %s", path)
218261

219262
// open backend database
220263
bcfg := backend.DefaultBackendConfig(logger)
221264
bcfg.Path = path
222265
bcfg.UnsafeNoFsync = true
223-
bcfg.BatchInterval = 0
224-
bcfg.BatchLimit = 0
225-
be := backend.New(bcfg)
226-
227-
// get current index from backend
228-
currentIndex, _ = schema.ReadConsistentIndex(be.ReadTx())
229-
230-
// list snapshots from WAL dir
231-
walSnaps, err := wal.ValidSnapshotEntries(cfg.Logger, cfg.WALDir())
232-
if err != nil {
233-
return nil, err
266+
bcfg.BatchInterval = time.Hour
267+
bcfg.BatchLimit = 100000
268+
269+
// try to open the bbolt database; this may unrecoverably panic from inside
270+
// the bbolt freelist goroutine if the database is in an inconsistent state.
271+
s.be = backend.New(bcfg)
272+
if s.be == nil {
273+
return nil, errors.New("failed to open database")
234274
}
235275

236-
// find latest available snapshot index
237-
ss := snap.New(logger, cfg.SnapDir())
238-
snapshot, err := ss.LoadNewestAvailable(walSnaps)
239-
if err != nil && !errors.Is(err, snap.ErrNoSnapshot) {
240-
return nil, err
241-
}
242-
if snapshot != nil {
243-
latestIndex = snapshot.Metadata.Index
276+
// try to get current index from backend; this may fail if the bbolt database
277+
// was opened successfully but is in an inconsistent state.
278+
if currentIndex, _ := schema.ReadConsistentIndex(s.be.ReadTx()); currentIndex == 0 {
279+
return nil, errors.New("failed to read consistent index")
244280
}
245281

246-
// restore from snapshot if available
247-
if latestIndex > currentIndex {
248-
logrus.Warnf("MVCC database index %d is less than latest snapshot index %d", currentIndex, latestIndex)
249-
path, err := ss.DBFilePath(latestIndex)
250-
if err != nil {
251-
logrus.Warnf("MVCC database for snapshot index %d not available; data may be stale", latestIndex)
252-
} else {
253-
logrus.Infof("MVCC database restoring snapshot index %d from %s", latestIndex, path)
254-
be, err = storage.RecoverSnapshotBackend(cfg, be, *snapshot, true, storage.NewBackendHooks(cfg.Logger, cindex.NewConsistentIndex(nil)))
255-
if err != nil {
256-
be.Close()
257-
return nil, err
258-
}
259-
}
260-
}
282+
// We do not bother checking the latest snapshot index from the WAL or attempting to
283+
// restore from a snapshot, as v3 store snapshots are only created when replicas are
284+
// lagging and the leader sends them a fresh copy of the bbolt database - and are
285+
// therefore highly unlikely to exist. The .snap files in the snap dir are for the
286+
// legacy v2 store, and are of no use.
287+
//
288+
// ref: https://etcd.io/docs/v3.6/learning/persistent-storage-files/#long-leaving-files
289+
// > Note: Periodic snapshots generated on each replica are only emitted in the form of
290+
// > *.snap file (not snap.db file). So there is no guarantee the most recent snapshot (in
291+
// > WAL log) has the *.snap.db file. But in such a case the backend (snap/db) is expected
292+
// > to be newer than the snapshot.
293+
294+
s.kv = mvcc.NewStore(logger, s.be, &lease.FakeLessor{}, mvcc.StoreConfig{})
295+
logrus.Info("Opened etcd MVCC KV store")
261296

262297
// nb: closing the kv store does not implicitly close its backend; the backend must be closed separately
263-
return &Store{kv: mvcc.NewStore(cfg.Logger, be, &lease.FakeLessor{}, mvcc.StoreConfig{}), be: be}, nil
298+
return s, nil
264299
}
265300

266301
func (s *Store) Close() error {

0 commit comments

Comments
 (0)