Skip to content

Commit 75e8f5b

Browse files
committed
address review comments
1 parent 377f9cb commit 75e8f5b

File tree

4 files changed

+181
-16
lines changed

4 files changed

+181
-16
lines changed

launcher/image/preload.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,23 @@ set_gpu_driver_ref_values() {
132132

133133
mkdir ${GPU_REF_VALUES_PATH}
134134
cos_gpu_installer_image_ref=$(cos-extensions list -- --gpu-installer)
135+
if [ -z "${cos_gpu_installer_image_ref}" ]; then
136+
echo "Error: cos-extensions list returned an empty image reference." >&2
137+
return 1
138+
fi
139+
135140
cos_gpu_installer_image_digest=$(get_cos_gpu_installer_image_digest ${cos_gpu_installer_image_ref})
141+
# Abort when the digest lookup produced nothing. This must test the digest
# variable: the image reference was already verified to be non-empty above,
# so testing it again here could never fail.
if [ -z "${cos_gpu_installer_image_digest}" ]; then
  echo "Error: get_cos_gpu_installer_image_digest returned an empty or invalid digest for: ${cos_gpu_installer_image_ref}." >&2
  return 1
fi
145+
146+
image_digest_hex_part=$(echo "${cos_gpu_installer_image_digest}" | sed 's/^sha256://' | tr -d '\n')
147+
# Check for the expected length of the SHA256 digest (64 hex characters)
148+
if [ ${#image_digest_hex_part} -ne 64 ]; then
149+
echo "Error: cos_gpu_installer image digest has an unexpected length: ${#image_digest_hex_part}, Expected 64." >&2
150+
return 1
151+
fi
136152

137153
echo ${cos_gpu_installer_image_ref} >> ${COS_GPU_INSTALLER_IMAGE_REF}
138154
echo ${cos_gpu_installer_image_digest} >> ${COS_GPU_INSTALLER_IMAGE_DIGEST}

launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,34 @@ substitutions:
66
'_ZONE': 'us-central1-f'
77
'_WORKLOAD_IMAGE': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/gpu/cuda-vector-add:latest'
88
steps:
9+
- name: 'gcr.io/cloud-builders/gcloud'
10+
id: CreateShieldedVMWithSingleGPU
11+
entrypoint: 'bash'
12+
env:
13+
- 'BUILD_ID=$BUILD_ID'
14+
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
15+
'-p', '${_IMAGE_PROJECT}',
16+
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
17+
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}',
18+
'-z', '${_ZONE}',
19+
'-v', 'n1-standard-4',
20+
'-g', 'nvidia-tesla-t4',
21+
'-c', '1'
22+
]
23+
- name: 'gcr.io/cloud-builders/gcloud'
24+
id: CreateShieldedVMWithMultipleGPU
25+
entrypoint: 'bash'
26+
env:
27+
- 'BUILD_ID=$BUILD_ID'
28+
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
29+
'-p', '${_IMAGE_PROJECT}',
30+
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true,tee-install-gpu-driver=true',
31+
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul',
32+
'-z', '${_ZONE}',
33+
'-v', 'n1-standard-4',
34+
'-g', 'nvidia-tesla-t4',
35+
'-c', '2'
36+
]
937
- name: 'gcr.io/cloud-builders/gcloud'
1038
id: CreateTDXCVMWithUnsupportedGPU
1139
entrypoint: 'bash'
@@ -46,6 +74,14 @@ steps:
4674
'-c', '1',
4775
'-x', 'TDX',
4876
]
77+
- name: 'gcr.io/cloud-builders/gcloud'
78+
id: SingleGpuWorkloadTest
79+
entrypoint: 'bash'
80+
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
81+
- name: 'gcr.io/cloud-builders/gcloud'
82+
id: MultipleGpuWorkloadTest
83+
entrypoint: 'bash'
84+
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
4985
- name: 'gcr.io/cloud-builders/gcloud'
5086
id: UnsupportedGpuWorkloadTest
5187
entrypoint: 'bash'
@@ -58,6 +94,18 @@ steps:
5894
id: ConfidentialGpuWorkloadTest
5995
entrypoint: 'bash'
6096
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-tdxcvm-cgpu', 'us-east5-a']
97+
- name: 'gcr.io/cloud-builders/gcloud'
98+
id: SingleGpuCleanUp
99+
entrypoint: 'bash'
100+
env:
101+
- 'CLEANUP=$_CLEANUP'
102+
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
103+
- name: 'gcr.io/cloud-builders/gcloud'
104+
id: MultipleGpuCleanUp
105+
entrypoint: 'bash'
106+
env:
107+
- 'CLEANUP=$_CLEANUP'
108+
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-mul', '${_ZONE}']
61109
- name: 'gcr.io/cloud-builders/gcloud'
62110
id: UnsupportedGpuVmCleanUp
63111
entrypoint: 'bash'

launcher/internal/gpu/driverinstaller.go

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,16 @@ import (
2323
const (
2424
CCModeON CCMode = "ON"
2525
CCModeOFF CCMode = "OFF"
26+
CCModeDevTools CCMode = "DEVTOOLS"
2627
installerContainerID = "tee-gpu-driver-installer-container"
2728
installerSnapshotID = "tee-gpu-driver-installer-snapshot"
2829
)
2930

3031
var supportedCGPUTypes = []deviceinfo.GPUType{
32+
deviceinfo.L4,
33+
deviceinfo.T4,
34+
deviceinfo.A100_40GB,
35+
deviceinfo.A100_80GB,
3136
deviceinfo.H100,
3237
}
3338

@@ -36,7 +41,7 @@ type CCMode string
3641

3742
func (ccm CCMode) isValid() error {
3843
switch ccm {
39-
case CCModeOFF, CCModeON:
44+
case CCModeOFF, CCModeON, CCModeDevTools:
4045
return nil
4146
}
4247
return fmt.Errorf("invalid gpu cc mode: %s", ccm)
@@ -74,11 +79,11 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
7479
}
7580

7681
if !gpuType.OpenSupported() {
77-
return fmt.Errorf("unsupported GPU type %s, please retry with one of the supported confidential GPU types: %v", gpuType.String(), supportedCGPUTypes)
82+
return fmt.Errorf("unsupported open sourced kernel modules for GPU type %s, please retry with one of the supported GPU types: %v", gpuType.String(), supportedCGPUTypes)
7883
}
7984

8085
ctx = namespaces.WithNamespace(ctx, namespaces.Default)
81-
installerImageRef, err := getInstallerImageReference()
86+
installerImageRef, err := getInstallerImageReference(InstallerImageRefFile)
8287
if err != nil {
8388
di.logger.Error(fmt.Sprintf("failed to get the installer container image reference: %v", err))
8489
return err
@@ -90,8 +95,7 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
9095
return fmt.Errorf("failed to pull installer image: %v", err)
9196
}
9297

93-
installerDigest := image.Target().Digest.String()
94-
if err := verifyInstallerImageDigest(installerDigest); err != nil {
98+
if err := verifyInstallerImageDigest(image, InstallerImageDigestFile); err != nil {
9599
return err
96100
}
97101

@@ -178,25 +182,27 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
178182
if err = setGPUStateToReady(); err != nil {
179183
return fmt.Errorf("failed to set the GPU state to ready: %v", err)
180184
}
181-
} else {
182-
return fmt.Errorf("confidential compute is not enabled for the gpu type %s", gpuType)
183185
}
184186

185187
di.logger.Info("GPU driver installation completed successfully")
186188
return nil
187189
}
188190

189-
func getInstallerImageReference() (string, error) {
190-
imageRefBytes, err := os.ReadFile(InstallerImageRefFile)
191+
func getInstallerImageReference(installerImageRefFile string) (string, error) {
192+
imageRefBytes, err := os.ReadFile(installerImageRefFile)
191193
if err != nil {
192194
return "", fmt.Errorf("failed to get the cos-gpu-installer version: %v", err)
193195
}
194196
installerImageRef := strings.TrimSpace(string(imageRefBytes))
197+
if len(installerImageRef) < 1 {
198+
return "", fmt.Errorf("empty value of cos-gpu-installer image reference")
199+
}
195200
return installerImageRef, nil
196201
}
197202

198-
func verifyInstallerImageDigest(installerDigest string) error {
199-
imageDigestBytes, err := os.ReadFile(InstallerImageDigestFile)
203+
func verifyInstallerImageDigest(image containerd.Image, referenceDigestFile string) error {
204+
installerDigest := image.Target().Digest.String()
205+
imageDigestBytes, err := os.ReadFile(referenceDigestFile)
200206
if err != nil {
201207
return fmt.Errorf("failed to get the cos-gpu-installer image digest: %v", err)
202208
}
@@ -239,16 +245,16 @@ func setGPUStateToReady() error {
239245
}
240246

241247
func isGPUCCModeEnabled() (bool, error) {
242-
ccMode, err := GetGPUCCMode()
248+
ccMode, err := QueryCCMode()
243249
if err != nil {
244250
return false, err
245251
}
246252
return ccMode == CCModeON, nil
247253
}
248254

249-
// GetGPUCCMode executes nvidia-smi to determine the current Confidential Computing (CC) mode status of the GPU.
250-
// It returns the CC mode ("ON" or "OFF") and an error if the command fails or if the output cannot be parsed.
251-
func GetGPUCCMode() (CCMode, error) {
255+
// QueryCCMode executes nvidia-smi to determine the current Confidential Computing (CC) mode status of the GPU.
256+
// If DEVTOOLS mode is enabled, the returned mode is overridden to DEVTOOLS. DEVTOOLS mode can be enabled only when CC mode is ON.
257+
func QueryCCMode() (CCMode, error) {
252258
// Run nvidia-smi conf-compute command to get the confidential computing mode status.
253259
nvidiaSmiCmd := fmt.Sprintf("%s/bin/nvidia-smi", InstallationHostDir)
254260
ccModeOutput, err := exec.Command(nvidiaSmiCmd, "conf-compute", "-f").Output()
@@ -259,7 +265,17 @@ func GetGPUCCMode() (CCMode, error) {
259265
if err != nil {
260266
return "", err
261267
}
262-
return CCMode(ccMode), nil
268+
269+
devToolsEnabled, err := isDevToolsModeEnabled()
270+
if err != nil {
271+
return "", err
272+
}
273+
274+
if devToolsEnabled {
275+
ccMode = CCModeDevTools
276+
}
277+
278+
return ccMode, nil
263279
}
264280

265281
func parseCCStatus(output string) (CCMode, error) {
@@ -276,6 +292,21 @@ func parseCCStatus(output string) (CCMode, error) {
276292
return ccMode, nil
277293
}
278294

295+
func isDevToolsModeEnabled() (bool, error) {
296+
nvidiaSmiCmd := fmt.Sprintf("%s/bin/nvidia-smi", InstallationHostDir)
297+
output, err := exec.Command(nvidiaSmiCmd, "conf-compute", "-d").Output()
298+
if err != nil {
299+
return false, err
300+
}
301+
re := regexp.MustCompile(`DevTools Mode:\s*(ON|OFF)`)
302+
match := re.FindStringSubmatch(string(output))
303+
304+
if len(match) < 2 {
305+
return false, fmt.Errorf("DevTools mode not found in output: %s", output)
306+
}
307+
return match[1] == "ON", nil
308+
}
309+
279310
func launchNvidiaPersistencedProcess(logger logging.Logger) error {
280311
nvidiaPersistencedCmd := fmt.Sprintf("%s/bin/nvidia-persistenced", InstallationHostDir)
281312
logger.Info("Starting nvidia-persistenced process")

launcher/internal/gpu/driverinstaller_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,79 @@
11
package gpu
22

33
import (
4+
"os"
5+
"strings"
46
"testing"
57
)
68

9+
func TestGetInstallerImageReference(t *testing.T) {
10+
tests := []struct {
11+
name string
12+
fileContent string
13+
filePath string
14+
wantRef string
15+
wantErr bool
16+
errSubstr string
17+
}{
18+
{
19+
name: "Successful read",
20+
fileContent: "gcr.io/google-containers/cos-gpu-installer:v1.2.3",
21+
filePath: "/tmp/test_installer_ref_success.txt",
22+
wantRef: "gcr.io/google-containers/cos-gpu-installer:v1.2.3",
23+
wantErr: false,
24+
},
25+
{
26+
name: "Successful read with whitespace",
27+
fileContent: " gcr.io/google-containers/cos-gpu-installer:v1.2.4 \n",
28+
filePath: "/tmp/test_installer_ref_whitespace.txt",
29+
wantRef: "gcr.io/google-containers/cos-gpu-installer:v1.2.4",
30+
wantErr: false,
31+
},
32+
{
33+
name: "File does not exist",
34+
fileContent: "",
35+
filePath: "/tmp/non_existent_file.txt",
36+
wantRef: "",
37+
wantErr: true,
38+
errSubstr: "no such file or directory",
39+
},
40+
{
41+
name: "Empty file",
42+
fileContent: "",
43+
filePath: "/tmp/test_installer_ref_empty.txt",
44+
wantRef: "",
45+
wantErr: true,
46+
errSubstr: "empty value",
47+
},
48+
}
49+
50+
for _, tt := range tests {
51+
t.Run(tt.name, func(t *testing.T) {
52+
if tt.filePath != "/tmp/non_existent_file.txt" {
53+
err := os.WriteFile(tt.filePath, []byte(tt.fileContent), 0644)
54+
if err != nil {
55+
t.Errorf("failed to write to the testfile %s", tt.filePath)
56+
}
57+
defer os.Remove(tt.filePath)
58+
}
59+
60+
ref, err := getInstallerImageReference(tt.filePath)
61+
62+
if (err != nil) != tt.wantErr {
63+
t.Errorf("getInstallerImageReference() error = %v, wantErr %v", err, tt.wantErr)
64+
return
65+
}
66+
if tt.wantErr && !strings.Contains(err.Error(), tt.errSubstr) {
67+
t.Errorf("getInstallerImageReference() err message %s is expected to contain %s", err.Error(), tt.errSubstr)
68+
return
69+
}
70+
if ref != tt.wantRef {
71+
t.Errorf("getInstallerImageReference() got = %v, want %v", ref, tt.wantRef)
72+
}
73+
})
74+
}
75+
}
76+
777
func TestParseCCStatus(t *testing.T) {
878
tests := []struct {
979
name string

0 commit comments

Comments
 (0)