@@ -19,20 +19,18 @@ package state
1919import (
2020 "fmt"
2121 "path"
22- "path/filepath"
2322 "reflect"
2423 "sync"
2524 "time"
2625
2726 "k8s.io/klog/v2"
28- "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
2927 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
3028
3129 "github.com/kubewharf/katalyst-core/pkg/config/agent/qrm/statedirectory"
3230 "github.com/kubewharf/katalyst-core/pkg/metrics"
33- "github.com/kubewharf/katalyst-core/pkg/util/file"
3431 "github.com/kubewharf/katalyst-core/pkg/util/general"
3532 "github.com/kubewharf/katalyst-core/pkg/util/machine"
33+ "github.com/kubewharf/katalyst-core/pkg/util/qrmcheckpointmanager"
3634)
3735
3836const (
@@ -44,15 +42,16 @@ const (
4442// go to in-memory State, and then go to disk State, i.e. in write-back mode
4543type stateCheckpoint struct {
// Embedded lock: restoreState and StoreState take the write lock for their whole duration.
4644 sync.RWMutex
47- cache * cpuPluginState
48- policyName string
49- checkpointManager checkpointmanager. CheckpointManager
50- checkpointName string
// cache is the in-memory plugin state; NewCheckpointState initialises it with NewCPUPluginState(topology).
45+ cache * cpuPluginState
46+ policyName string
// qrmCheckpointManager is built by NewQRMCheckpointManager from the current and other state directories.
// It is used to read the current checkpoint, read the previous checkpoint during migration,
// create checkpoints, and validate the checkpoint files after a migration.
47+ qrmCheckpointManager * qrmcheckpointmanager. QRMCheckpointManager
// checkpointName is the checkpoint name passed to every qrmCheckpointManager call.
48+ checkpointName string
5149 // when we add new properties to checkpoint,
5250 // it will cause checkpoint corruption, and we should skip it
5351 skipStateCorruption bool
5452 GenerateMachineStateFromPodEntries GenerateMachineStateFromPodEntriesFunc
5553 emitter metrics.MetricEmitter
// hasPreStop is set in NewCheckpointState to true when otherStateDir is non-empty.
// When it is false, tryMigrateState skips migration and stores a new checkpoint instead.
54+ hasPreStop bool
5655}
5756
5857var _ State = & stateCheckpoint {}
@@ -64,23 +63,24 @@ func NewCheckpointState(
6463 emitter metrics.MetricEmitter ,
6564) (State , error ) {
6665 currentStateDir , otherStateDir := stateDirectoryConfig .GetCurrentAndOtherStateFileDirectory ()
67- hasPreStop := stateDirectoryConfig .HasPreStop
68- checkpointManager , err := checkpointmanager .NewCheckpointManager (currentStateDir )
66+ // If there is an empty otherStateDir, this means that there is no pre-stop script in place
67+ hasPreStop := otherStateDir != ""
68+ qrmCheckpointManager , err := qrmcheckpointmanager .NewQRMCheckpointManager (currentStateDir , otherStateDir , checkpointName , "cpu_plugin" )
6969 if err != nil {
7070 return nil , fmt .Errorf ("failed to initialize checkpoint manager: %v" , err )
7171 }
72-
7372 sc := & stateCheckpoint {
7473 cache : NewCPUPluginState (topology ),
7574 policyName : policyName ,
76- checkpointManager : checkpointManager ,
75+ qrmCheckpointManager : qrmCheckpointManager ,
7776 checkpointName : checkpointName ,
7877 skipStateCorruption : skipStateCorruption ,
7978 GenerateMachineStateFromPodEntries : generateMachineStateFunc ,
8079 emitter : emitter ,
80+ hasPreStop : hasPreStop ,
8181 }
8282
83- if err := sc .restoreState (currentStateDir , otherStateDir , hasPreStop , topology ); err != nil {
83+ if err := sc .restoreState (topology ); err != nil {
8484 return nil , fmt .Errorf ("could not restore state from checkpoint: %v, please drain this node and delete " +
8585 "the cpu plugin checkpoint file %q before restarting Kubelet" , err , path .Join (currentStateDir , checkpointName ))
8686 }
@@ -90,19 +90,17 @@ func NewCheckpointState(
9090
9191// restoreState first looks for the state file in the current directory.
9292// If it is not found there, it looks in the other directory and tries to migrate that state file over to the current directory.
93- func (sc * stateCheckpoint ) restoreState (
94- currentStateDir , otherStateDir string , hasPreStop bool , topology * machine.CPUTopology ,
95- ) error {
93+ func (sc * stateCheckpoint ) restoreState (topology * machine.CPUTopology ) error {
9694 sc .Lock ()
9795 defer sc .Unlock ()
9896 var err error
9997 var foundAndSkippedStateCorruption bool
10098
10199 checkpoint := NewCPUPluginCheckpoint ()
102- if err = sc .checkpointManager . GetCheckpoint (sc .checkpointName , checkpoint ); err != nil {
100+ if err = sc .qrmCheckpointManager . GetCurrentCheckpoint (sc .checkpointName , checkpoint , true ); err != nil {
103101 if err == errors .ErrCheckpointNotFound {
104102 // We cannot find checkpoint, so it is possible that previous checkpoint was stored in either disk or memory
105- return sc .tryMigrateState (topology , currentStateDir , otherStateDir , hasPreStop , checkpoint )
103+ return sc .tryMigrateState (topology , checkpoint )
106104 } else if err == errors .ErrCorruptCheckpoint {
107105 if ! sc .skipStateCorruption {
108106 return err
@@ -155,27 +153,20 @@ func (sc *stateCheckpoint) populateCacheAndState(
155153// tryMigrateState tries to migrate the state file from the other directory to current directory.
156154// If the other directory does not have a state file, then we build a new checkpoint.
157155func (sc * stateCheckpoint ) tryMigrateState (
158- topology * machine.CPUTopology , currentStateDir , otherStateDir string , hasPreStop bool ,
159- checkpoint * CPUPluginCheckpoint ,
156+ topology * machine.CPUTopology , checkpoint * CPUPluginCheckpoint ,
160157) error {
161158 var foundAndSkippedStateCorruption bool
162159 klog .Infof ("[cpu_plugin] trying to migrate state" )
163160
164161 // Do not migrate and build new checkpoint if there is no pre-stop script
165- if ! hasPreStop {
162+ if ! sc . hasPreStop {
166163 return sc .storeState ()
167164 }
168165
169- // Get the old checkpoint using the provided file directory
170- oldCheckpointManager , err := checkpointmanager .NewCheckpointManager (otherStateDir )
171- if err != nil {
172- return fmt .Errorf ("[cpu_plugin] failed to initialize old checkpoint manager for migration: %v" , err )
173- }
174-
175- if err = oldCheckpointManager .GetCheckpoint (sc .checkpointName , checkpoint ); err != nil {
166+ if err := sc .qrmCheckpointManager .GetPreviousCheckpoint (sc .checkpointName , checkpoint ); err != nil {
176167 if err == errors .ErrCheckpointNotFound {
177168 // Old checkpoint file is not found, so we just store state in new checkpoint
178- general .Infof ("[cpu_plugin] checkpoint %v doesn't exist in dir %v , create it" , sc .checkpointName , otherStateDir )
169+ general .Infof ("[cpu_plugin] checkpoint %v doesn't exist, create it" , sc .checkpointName )
179170 return sc .storeState ()
180171 } else if err == errors .ErrCorruptCheckpoint {
181172 if ! sc .skipStateCorruption {
@@ -188,41 +179,23 @@ func (sc *stateCheckpoint) tryMigrateState(
188179 }
189180 }
190181
191- if err = sc .populateCacheAndState (topology , checkpoint , foundAndSkippedStateCorruption ); err != nil {
182+ if err : = sc .populateCacheAndState (topology , checkpoint , foundAndSkippedStateCorruption ); err != nil {
192183 return fmt .Errorf ("[cpu_plugin] failed to populate checkpoint state during state migration: %v" , err )
193184 }
194185
195186 // always store state after migrating to new checkpoint
196- if err = sc .storeState (); err != nil {
187+ if err : = sc .storeState (); err != nil {
197188 return fmt .Errorf ("[cpu_plugin] failed to store checkpoint state during end of migration: %v" , err )
198189 }
199190
200- // validate that the two files are equal
201- equal , err := sc .checkpointFilesEqual (currentStateDir , otherStateDir )
202- if err != nil {
203- return fmt .Errorf ("[cpu_plugin] failed to compare checkpoint files: %v" , err )
204- }
205- if ! equal {
206- klog .Infof ("[cpu_plugin] checkpoint files are not equal, migration failed, fall back to old checkpoint" )
207- sc .checkpointManager = oldCheckpointManager
208- return nil
209- }
210-
211- // remove old checkpoint file
212- if err = oldCheckpointManager .RemoveCheckpoint (sc .checkpointName ); err != nil {
213- return fmt .Errorf ("[cpu_plugin] failed to remove old checkpoint: %v" , err )
191+ if err := sc .qrmCheckpointManager .ValidateCheckpointFilesMigration (); err != nil {
192+ return fmt .Errorf ("[cpu_plugin] ValidateCheckpointFilesMigration failed with error: %v" , err )
214193 }
215194
216195 klog .Infof ("[cpu_plugin] migrate checkpoint succeeded" )
217196 return nil
218197}
219198
220- func (sc * stateCheckpoint ) checkpointFilesEqual (currentStateDir , otherStateDir string ) (bool , error ) {
221- currentFilePath := filepath .Join (currentStateDir , sc .checkpointName )
222- otherFilePath := filepath .Join (otherStateDir , sc .checkpointName )
223- return file .FilesEqual (currentFilePath , otherFilePath )
224- }
225-
226199func (sc * stateCheckpoint ) StoreState () error {
227200 sc .Lock ()
228201 defer sc .Unlock ()
@@ -244,7 +217,7 @@ func (sc *stateCheckpoint) storeState() error {
244217 checkpoint .PodEntries = sc .cache .GetPodEntries ()
245218 checkpoint .AllowSharedCoresOverlapReclaimedCores = sc .cache .GetAllowSharedCoresOverlapReclaimedCores ()
246219
247- err := sc .checkpointManager .CreateCheckpoint (sc .checkpointName , checkpoint )
220+ err := sc .qrmCheckpointManager .CreateCheckpoint (sc .checkpointName , checkpoint )
248221 if err != nil {
249222 klog .ErrorS (err , "Could not save checkpoint" )
250223 return err
0 commit comments