66 "fmt"
77 "os"
88 "os/exec"
9+ "regexp"
910 "strings"
1011
1112 "cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/deviceinfo"
@@ -18,19 +19,29 @@ import (
1819 "github.com/opencontainers/runtime-spec/specs-go"
1920)
2021
22+ // CCMode enums
2123const (
22- installerContainerID = "tee-gpu-driver-installer-container"
23- installerSnapshotID = "tee-gpu-driver-installer-snapshot"
24+ CCModeON CCMode = "ON"
25+ CCModeOFF CCMode = "OFF"
26+ installerContainerID = "tee-gpu-driver-installer-container"
27+ installerSnapshotID = "tee-gpu-driver-installer-snapshot"
2428)
2529
26- var supportedGpuTypes = []deviceinfo.GPUType {
27- deviceinfo .L4 ,
28- deviceinfo .T4 ,
29- deviceinfo .A100_40GB ,
30- deviceinfo .A100_80GB ,
30+ var supportedCGPUTypes = []deviceinfo.GPUType {
3131 deviceinfo .H100 ,
3232}
3333
34+ // CCMode represents the status confidential computing mode of the GPU.
35+ type CCMode string
36+
37+ func (ccm CCMode ) isValid () error {
38+ switch ccm {
39+ case CCModeOFF , CCModeON :
40+ return nil
41+ }
42+ return fmt .Errorf ("invalid gpu cc mode: %s" , ccm )
43+ }
44+
3445// DriverInstaller contains information about the GPU driver installer settings
3546type DriverInstaller struct {
3647 cdClient * containerd.Client
@@ -63,7 +74,7 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
6374 }
6475
6576 if ! gpuType .OpenSupported () {
66- return fmt .Errorf ("unsupported GPU type %s, please retry with one of the supported GPU types: %v" , gpuType .String (), supportedGpuTypes )
77+ return fmt .Errorf ("unsupported GPU type %s, please retry with one of the supported confidential GPU types: %v" , gpuType .String (), supportedCGPUTypes )
6778 }
6879
6980 ctx = namespaces .WithNamespace (ctx , namespaces .Default )
@@ -79,6 +90,16 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
7990 return fmt .Errorf ("failed to pull installer image: %v" , err )
8091 }
8192
93+ installerDigest := image .Target ().Digest .String ()
94+ expectedInstallerDigest , err := os .ReadFile (InstallerImageDigestFile )
95+ if err != nil {
96+ return fmt .Errorf ("failed to read reference image digest from file %s : %v" , InstallerImageDigestFile , err )
97+ }
98+
99+ if installerDigest != string (expectedInstallerDigest ) {
100+ return fmt .Errorf ("cos_gpu_installer image digest verification failed - expected : %s, actual : %s" , expectedInstallerDigest , installerDigest )
101+ }
102+
82103 mounts := []specs.Mount {
83104 {
84105 Type : "volume" ,
@@ -153,7 +174,7 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
153174 return fmt .Errorf ("failed to verify GPU driver installation: %v" , err )
154175 }
155176
156- ccEnabled , err := isGPUCCModeEnabled (di . logger , gpuType )
177+ ccEnabled , err := isGPUCCModeEnabled ()
157178 if err != nil {
158179 return fmt .Errorf ("failed to check confidential compute mode status: %v" , err )
159180 }
@@ -169,11 +190,11 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
169190}
170191
171192func getInstallerImageReference () (string , error ) {
172- installerImageRefBytes , err := exec . Command ( "cos-extensions" , "list" , "--" , "--gpu-installer" ). Output ( )
193+ imageRefBytes , err := os . ReadFile ( InstallerImageRefFile )
173194 if err != nil {
174195 return "" , fmt .Errorf ("failed to get the cos-gpu-installer version: %v" , err )
175196 }
176- installerImageRef := strings .TrimSpace (string (installerImageRefBytes ))
197+ installerImageRef := strings .TrimSpace (string (imageRefBytes ))
177198 return installerImageRef , nil
178199}
179200
@@ -208,20 +229,42 @@ func setGPUStateToReady() error {
208229 return nil
209230}
210231
211- func isGPUCCModeEnabled (logger logging.Logger , gpuType deviceinfo.GPUType ) (bool , error ) {
212- // Run nvidia-smi conf-compute command to check if confidential compute mode is ON.
232+ func isGPUCCModeEnabled () (bool , error ) {
233+ ccMode , err := GetGPUCCMode ()
234+ if err != nil {
235+ return false , err
236+ }
237+ return ccMode == CCModeON , nil
238+ }
239+
240+ // GetGPUCCMode executes nvidia-smi to determine the current Confidential Computing (CC) mode status of the GPU.
241+ // It returns the CC mode ("ON" or "OFF") and an error if the command fails or if the output cannot be parsed.
242+ func GetGPUCCMode () (CCMode , error ) {
243+ // Run nvidia-smi conf-compute command to get the confidential computing mode status.
213244 nvidiaSmiCmd := fmt .Sprintf ("%s/bin/nvidia-smi" , InstallationHostDir )
214245 ccModeOutput , err := exec .Command (nvidiaSmiCmd , "conf-compute" , "-f" ).Output ()
215- // The nvidia-smi conf-compute command fails for GPU which doesn't support confidential computing.
216- // This check would bypass nvidia-smi conf-compute command for GPU not having confidential compute support.
217- if strings .Contains (string (ccModeOutput ), "No CC capable devices found" ) {
218- logger .Info (fmt .Sprintf ("Confidential Computing is not supported for GPU type : %s" , gpuType .String ()))
219- return false , nil
246+ if err != nil {
247+ return "" , err
220248 }
249+ ccMode , err := parseCCStatus (string (ccModeOutput ))
221250 if err != nil {
222- return false , err
251+ return "" , err
252+ }
253+ return CCMode (ccMode ), nil
254+ }
255+
256+ func parseCCStatus (output string ) (CCMode , error ) {
257+ re := regexp .MustCompile (`CC status:\s*(ON|OFF)` )
258+ match := re .FindStringSubmatch (output )
259+
260+ if len (match ) < 2 {
261+ return "" , fmt .Errorf ("CC status not found in output: %s" , output )
262+ }
263+ ccMode := CCMode (match [1 ])
264+ if err := ccMode .isValid (); err != nil {
265+ return "" , err
223266 }
224- return strings . Contains ( string ( ccModeOutput ), "CC status: ON" ) , nil
267+ return ccMode , nil
225268}
226269
227270func launchNvidiaPersistencedProcess (logger logging.Logger ) error {
0 commit comments