Skip to content

Commit 2100b87

Browse files
committed
measure gpu cc mode
1 parent 263a935 commit 2100b87

File tree

7 files changed

+178
-30
lines changed

7 files changed

+178
-30
lines changed

launcher/container_runner.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
191191
}
192192

193193
for _, deviceFile := range gpuDeviceFiles {
194-
logger.Info("device file : %s", deviceFile)
194+
logger.Info(fmt.Sprintf("GPU device file : %s", deviceFile))
195195
specOpts = append(specOpts, oci.WithDevices(deviceFile, deviceFile, "crw-rw-rw-"))
196196
}
197197
}
@@ -341,6 +341,14 @@ func (r *ContainerRunner) measureCELEvents(ctx context.Context) error {
341341
return fmt.Errorf("failed to measure memory monitoring state: %v", err)
342342
}
343343

344+
if r.launchSpec.Experiments.EnableConfidentialGPUSupport && r.launchSpec.InstallGpuDriver {
345+
ccModeCmd := gpu.NvidiaSmiOutputFunc("conf-compute", "-f")
346+
devToolsCmd := gpu.NvidiaSmiOutputFunc("conf-compute", "-d")
347+
if err := r.measureGPUCCMode(ccModeCmd, devToolsCmd); err != nil {
348+
return fmt.Errorf("failed to measure GPU CC mode status: %v", err)
349+
}
350+
}
351+
344352
separator := cel.CosTlv{
345353
EventType: cel.LaunchSeparatorType,
346354
EventContent: nil, // Success
@@ -418,6 +426,18 @@ func (r *ContainerRunner) measureMemoryMonitor() error {
418426
return nil
419427
}
420428

429+
func (r *ContainerRunner) measureGPUCCMode(ccModeCmd, devToolsCmd gpu.NvidiaSmiCmdOutput) error {
430+
ccMode, err := gpu.QueryCCMode(ccModeCmd, devToolsCmd)
431+
if err != nil {
432+
return err
433+
}
434+
if err := r.attestAgent.MeasureEvent(cel.CosTlv{EventType: cel.GpuCCModeType, EventContent: []byte(ccMode.String())}); err != nil {
435+
return err
436+
}
437+
r.logger.Info("Successfully measured GPU CC mode status")
438+
return nil
439+
}
440+
421441
// Retrieves the default OIDC token from the attestation service, and returns how long
422442
// to wait before attempting to refresh it.
423443
// The token file will be written to a tmp file and then renamed.

launcher/container_runner_test.go

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/google/go-cmp/cmp"
2424
"github.com/google/go-tpm-tools/cel"
2525
"github.com/google/go-tpm-tools/launcher/agent"
26+
"github.com/google/go-tpm-tools/launcher/internal/gpu"
2627
"github.com/google/go-tpm-tools/launcher/internal/logging"
2728
"github.com/google/go-tpm-tools/launcher/launcherfile"
2829
"github.com/google/go-tpm-tools/launcher/spec"
@@ -638,6 +639,77 @@ func TestMeasureCELEvents(t *testing.T) {
638639
}
639640
}
640641

642+
func TestMeasureGPUCCMode(t *testing.T) {
643+
fakeContainer := &fakeContainer{
644+
image: &fakeImage{
645+
name: "fake image name",
646+
digest: "fake digest",
647+
id: "fake id",
648+
},
649+
args: []string{"fake args"},
650+
env: []string{"fake env"},
651+
}
652+
launchSpec := spec.LaunchSpec{
653+
InstallGpuDriver: true,
654+
}
655+
656+
tests := []struct {
657+
name string
658+
mockCCModeCmd gpu.NvidiaSmiCmdOutput
659+
mockDevToolsCmd gpu.NvidiaSmiCmdOutput
660+
wantErr bool
661+
wantEvents []cel.CosType
662+
}{
663+
{
664+
name: "Successful GPU CC mode measurement",
665+
mockCCModeCmd: func() ([]byte, error) { return []byte("CC status: ON"), nil },
666+
mockDevToolsCmd: func() ([]byte, error) { return []byte("DevTools Mode: OFF"), nil },
667+
wantErr: false,
668+
wantEvents: []cel.CosType{
669+
cel.GpuCCModeType,
670+
},
671+
},
672+
{
673+
name: "Failed GPU CC mode measurement",
674+
mockCCModeCmd: func() ([]byte, error) { return []byte("CC status: ON"), nil },
675+
mockDevToolsCmd: func() ([]byte, error) { return nil, fmt.Errorf("nvidia-smi DevTools mode error") },
676+
wantErr: true,
677+
wantEvents: []cel.CosType{},
678+
},
679+
}
680+
681+
for _, tc := range tests {
682+
t.Run(tc.name, func(t *testing.T) {
683+
gotEvents := []cel.CosType{}
684+
685+
fakeAgent := &fakeAttestationAgent{
686+
measureEventFunc: func(content cel.Content) error {
687+
got, _ := content.GetTLV()
688+
tlv := &cel.TLV{}
689+
tlv.UnmarshalBinary(got.Value)
690+
gotEvents = append(gotEvents, cel.CosType(tlv.Type))
691+
return nil
692+
},
693+
}
694+
695+
r := ContainerRunner{
696+
attestAgent: fakeAgent,
697+
container: fakeContainer,
698+
launchSpec: launchSpec,
699+
logger: logging.SimpleLogger(),
700+
}
701+
702+
err := r.measureGPUCCMode(tc.mockCCModeCmd, tc.mockDevToolsCmd)
703+
if (err != nil) != tc.wantErr {
704+
t.Errorf("measureGPUCCMode() error = %v, wantErr %v", err, tc.wantErr)
705+
}
706+
if !cmp.Equal(gotEvents, tc.wantEvents) {
707+
t.Errorf("failed to measure GPU CC mode event, got %v, but want %v", gotEvents, tc.wantEvents)
708+
}
709+
})
710+
}
711+
}
712+
641713
func TestPullImageWithRetries(t *testing.T) {
642714
testCases := []struct {
643715
name string
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# Verifies that a GPU VM launched without the tee-install-gpu-driver metadata
# flag is rejected, by searching the VM's serial output for the expected
# launcher error message.
#
# Arguments are forwarded to read_serial (defined in util/read_serial.sh):
#   $1 - VM name
#   $2 - zone
#
# On failure, writes 'TEST FAILED.' to /workspace/status.txt and prints the
# captured serial output to aid debugging.
set -euo pipefail
source util/read_serial.sh

# Quote the arguments so values are passed through without word splitting.
SERIAL_OUTPUT=$(read_serial "$1" "$2")
print_serial=false

# Use a here-string rather than `echo | grep -q`: with pipefail enabled, grep -q
# exiting early on a match can kill echo with SIGPIPE and fail the pipeline.
if grep -q 'tee-install-gpu-driver is expected to set to true' <<< "$SERIAL_OUTPUT"
then
    echo "- GPU driver installation without metadata flag is verified"
else
    echo "FAILED: Driver installation metadata flag is not set"
    echo 'TEST FAILED.' > /workspace/status.txt
    print_serial=true
fi

# print_serial was previously set but never read; emit the serial log on failure.
if $print_serial
then
    echo "$SERIAL_OUTPUT"
fi

launcher/image/test/test_gpu_driver_installation_cloudbuild.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,21 @@ steps:
4646
'-c', '1',
4747
'-x', 'TDX',
4848
]
49+
- name: 'gcr.io/cloud-builders/gcloud'
50+
id: CreateTDXCVMWithCGPUNoMetadataFlag
51+
entrypoint: 'bash'
52+
env:
53+
- 'BUILD_ID=$BUILD_ID'
54+
args: ['create_gpu_vm.sh','-i', '${_IMAGE_NAME}',
55+
'-p', '${_IMAGE_PROJECT}',
56+
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-container-log-redirect=true',
57+
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}-noflag',
58+
'-z', 'us-east5-a', # H100 GPUs with Confidential Computing enabled are available only in the us-central1 and us-east5 regions.
59+
'-v', 'a3-highgpu-1g',
60+
'-g', 'nvidia-h100-80gb',
61+
'-c', '1',
62+
'-x', 'TDX',
63+
]
4964
- name: 'gcr.io/cloud-builders/gcloud'
5065
id: UnsupportedGpuWorkloadTest
5166
entrypoint: 'bash'
@@ -58,6 +73,10 @@ steps:
5873
id: ConfidentialGpuWorkloadTest
5974
entrypoint: 'bash'
6075
args: ['scripts/gpu/test_gpu_workload.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-tdxcvm-cgpu', 'us-east5-a']
76+
- name: 'gcr.io/cloud-builders/gcloud'
77+
id: NoMetadataFlagWorkloadTest
78+
entrypoint: 'bash'
79+
args: ['scripts/gpu/test_gpu_nometadata.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-noflag', 'us-east5-a']
6180
- name: 'gcr.io/cloud-builders/gcloud'
6281
id: UnsupportedGpuVmCleanUp
6382
entrypoint: 'bash'
@@ -76,6 +95,12 @@ steps:
7695
env:
7796
- 'CLEANUP=$_CLEANUP'
7897
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-tdxcvm-cgpu', 'us-east5-a']
98+
- name: 'gcr.io/cloud-builders/gcloud'
99+
id: NoMetadataVmCleanUp
100+
entrypoint: 'bash'
101+
env:
102+
- 'CLEANUP=$_CLEANUP'
103+
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}-noflag', 'us-east5-a']
79104
# Must come after cleanup.
80105
- name: 'gcr.io/cloud-builders/gcloud'
81106
id: NoGpuVmCheckFailure

launcher/internal/gpu/driverinstaller.go

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,13 @@ var supportedCGPUTypes = []deviceinfo.GPUType{
3030
deviceinfo.H100,
3131
}
3232

33-
type nvidiaSmiCmdOutput func() ([]byte, error)
34-
type nvidiaSmiCmdRun func() error
33+
// NvidiaSmiCmdOutput defines a function type for executing an NVIDIA SMI command
34+
// and returning the raw byte output along with any error.
35+
type NvidiaSmiCmdOutput func() ([]byte, error)
36+
37+
// NvidiaSmiCmdRun defines a function type for executing an NVIDIA SMI command
38+
// and returning only an error, if any.
39+
type NvidiaSmiCmdRun func() error
3540

3641
// DriverInstaller contains information about the GPU driver installer settings
3742
type DriverInstaller struct {
@@ -165,21 +170,21 @@ func (di *DriverInstaller) InstallGPUDrivers(ctx context.Context) error {
165170
return fmt.Errorf("failed to start nvidia-persistenced process: %v", err)
166171
}
167172

168-
nvidiaSmiVerifyCmd := nvidiaSmiRunFunc()
173+
nvidiaSmiVerifyCmd := NvidiaSmiRunFunc()
169174
if err = verifyDriverInstallation(nvidiaSmiVerifyCmd); err != nil {
170175
return fmt.Errorf("failed to verify GPU driver installation: %v", err)
171176
}
172177

173-
ccModeCmd := nvidiaSmiOutputFunc("conf-compute", "-f")
174-
devToolsCmd := nvidiaSmiOutputFunc("conf-compute", "-d")
178+
ccModeCmd := NvidiaSmiOutputFunc("conf-compute", "-f")
179+
devToolsCmd := NvidiaSmiOutputFunc("conf-compute", "-d")
175180

176181
ccEnabled, err := QueryCCMode(ccModeCmd, devToolsCmd)
177182
if err != nil {
178183
return fmt.Errorf("failed to check confidential compute mode status: %v", err)
179184
}
180185
// Explicitly need to set the GPU state to READY for GPUs with confidential compute mode ON.
181186
if ccEnabled == attest.GPUDeviceCCMode_ON {
182-
setGPUStateCmd := nvidiaSmiRunFunc("conf-compute", "-srs", "1")
187+
setGPUStateCmd := NvidiaSmiRunFunc("conf-compute", "-srs", "1")
183188
if err = setGPUStateToReady(setGPUStateCmd); err != nil {
184189
return fmt.Errorf("failed to set the GPU state to ready: %v", err)
185190
}
@@ -238,14 +243,14 @@ func remountAsExecutable(dir string) error {
238243
return nil
239244
}
240245

241-
func verifyDriverInstallation(nvidiaSmiVerifyCmd nvidiaSmiCmdRun) error {
246+
// verifyDriverInstallation invokes the supplied nvidia-smi command function and
// reports whether it succeeded. A nil result means the command returned no
// error; otherwise the command's error is embedded in the returned error.
func verifyDriverInstallation(nvidiaSmiVerifyCmd NvidiaSmiCmdRun) error {
	cmdErr := nvidiaSmiVerifyCmd()
	if cmdErr == nil {
		return nil
	}
	return fmt.Errorf("failed to verify GPU driver installation : %v", cmdErr)
}
247252

248-
func setGPUStateToReady(nvidiaSmiSetGPUStateCmd nvidiaSmiCmdRun) error {
253+
func setGPUStateToReady(nvidiaSmiSetGPUStateCmd NvidiaSmiCmdRun) error {
249254
if err := nvidiaSmiSetGPUStateCmd(); err != nil {
250255
return fmt.Errorf("failed to set the GPU state to ready: %v", err)
251256
}
@@ -254,7 +259,7 @@ func setGPUStateToReady(nvidiaSmiSetGPUStateCmd nvidiaSmiCmdRun) error {
254259

255260
// QueryCCMode executes nvidia-smi to determine the current Confidential Computing (CC) mode status of the GPU.
256261
// If DEVTOOLS mode is enabled, it would override CC mode as DEVTOOLS. DEVTOOLS mode would be enabled only when CC mode is ON.
257-
func QueryCCMode(ccModeCmd, devToolsCmd nvidiaSmiCmdOutput) (attest.GPUDeviceCCMode, error) {
262+
func QueryCCMode(ccModeCmd, devToolsCmd NvidiaSmiCmdOutput) (attest.GPUDeviceCCMode, error) {
258263
ccMode := attest.GPUDeviceCCMode_UNSET
259264
ccModeOutput, err := ccModeCmd()
260265
if err != nil {
@@ -301,12 +306,16 @@ func isConfidentialComputeSupported(gpuType deviceinfo.GPUType, supportedCGPUTyp
301306
return fmt.Errorf("unsupported confidential GPU type %s, please retry with one of the supported confidential GPU types: %v", gpuType.String(), supportedCGPUTypes)
302307
}
303308

304-
func nvidiaSmiOutputFunc(args ...string) nvidiaSmiCmdOutput {
309+
// NvidiaSmiOutputFunc returns a function which executes the nvidia-smi command with the given arguments
310+
// and returns the raw byte output and any error.
311+
func NvidiaSmiOutputFunc(args ...string) NvidiaSmiCmdOutput {
305312
cmd := fmt.Sprintf("%s/bin/nvidia-smi", InstallationHostDir)
306313
return func() ([]byte, error) { return exec.Command(cmd, args...).Output() }
307314
}
308315

309-
func nvidiaSmiRunFunc(args ...string) nvidiaSmiCmdRun {
316+
// NvidiaSmiRunFunc returns a function which executes the nvidia-smi command with the given arguments
317+
// and returns an error, if any.
318+
func NvidiaSmiRunFunc(args ...string) NvidiaSmiCmdRun {
310319
cmd := fmt.Sprintf("%s/bin/nvidia-smi", InstallationHostDir)
311320
return func() error { return exec.Command(cmd, args...).Run() }
312321
}

launcher/internal/gpu/driverinstaller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func TestGetInstallerImageReference(t *testing.T) {
7777
func TestVerifyDriverInstallation(t *testing.T) {
7878
tests := []struct {
7979
name string
80-
mockVerifyCmd nvidiaSmiCmdRun
80+
mockVerifyCmd NvidiaSmiCmdRun
8181
wantErr bool
8282
errSubstr string
8383
}{
@@ -110,7 +110,7 @@ func TestVerifyDriverInstallation(t *testing.T) {
110110
func TestSetGPUStateToReady(t *testing.T) {
111111
tests := []struct {
112112
name string
113-
mockCmd nvidiaSmiCmdRun
113+
mockCmd NvidiaSmiCmdRun
114114
wantErr bool
115115
errSubstr string
116116
}{
@@ -143,8 +143,8 @@ func TestSetGPUStateToReady(t *testing.T) {
143143
func TestQueryCCMode(t *testing.T) {
144144
tests := []struct {
145145
name string
146-
mockCCModeCmd nvidiaSmiCmdOutput
147-
mockDevToolsCmd nvidiaSmiCmdOutput
146+
mockCCModeCmd NvidiaSmiCmdOutput
147+
mockDevToolsCmd NvidiaSmiCmdOutput
148148
expectedCCMode attest.GPUDeviceCCMode
149149
wantErr bool
150150
errSubstr string

launcher/launcher/main.go

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"time"
1515

1616
"cloud.google.com/go/compute/metadata"
17+
"cos.googlesource.com/cos/tools.git/src/cmd/cos_gpu_installer/deviceinfo"
1718
"github.com/containerd/containerd"
1819
"github.com/containerd/containerd/defaults"
1920
"github.com/containerd/containerd/namespaces"
@@ -189,20 +190,6 @@ func startLauncher(launchSpec spec.LaunchSpec, serialConsole *os.File) error {
189190
}
190191
defer containerdClient.Close()
191192

192-
ctx := namespaces.WithNamespace(context.Background(), namespaces.Default)
193-
if launchSpec.InstallGpuDriver {
194-
if launchSpec.Experiments.EnableConfidentialGPUSupport {
195-
installer := gpu.NewDriverInstaller(containerdClient, launchSpec, logger)
196-
err = installer.InstallGPUDrivers(ctx)
197-
if err != nil {
198-
return fmt.Errorf("failed to install gpu drivers: %v", err)
199-
}
200-
} else {
201-
logger.Info("Confidential GPU support experiment flag is not enabled for this project. Ensure that it is enabled when tee-install-gpu-driver is set to true")
202-
return fmt.Errorf("confidential gpu support experiment flag is not enabled")
203-
}
204-
}
205-
206193
tpm, err := tpm2.OpenTPM("/dev/tpmrm0")
207194
if err != nil {
208195
return &launcher.RetryableError{Err: err}
@@ -245,6 +232,26 @@ func startLauncher(launchSpec spec.LaunchSpec, serialConsole *os.File) error {
245232
logger.Info(fmt.Sprintf("failed to retrieve auth token: %v, using empty auth for image pulling\n", err))
246233
}
247234

235+
ctx := namespaces.WithNamespace(context.Background(), namespaces.Default)
236+
if launchSpec.InstallGpuDriver {
237+
if launchSpec.Experiments.EnableConfidentialGPUSupport {
238+
installer := gpu.NewDriverInstaller(containerdClient, launchSpec, logger)
239+
err = installer.InstallGPUDrivers(ctx)
240+
if err != nil {
241+
return fmt.Errorf("failed to install gpu drivers: %v", err)
242+
}
243+
} else {
244+
logger.Error("Confidential GPU support experiment flag is not enabled for this project. Ensure that it is enabled when tee-install-gpu-driver is set to true")
245+
return fmt.Errorf("confidential gpu support experiment flag is not enabled")
246+
}
247+
} else {
248+
deviceInfo, _ := deviceinfo.GetGPUTypeInfo()
249+
if deviceInfo != deviceinfo.NO_GPU {
250+
logger.Error("GPU is attached, tee-install-gpu-driver is not set")
251+
return fmt.Errorf("tee-install-gpu-driver is expected to set to true when GPU is attached")
252+
}
253+
}
254+
248255
logger.Info("Launch started", "duration_sec", time.Since(start).Seconds())
249256

250257
r, err := launcher.NewRunner(ctx, containerdClient, token, launchSpec, mdsClient, tpm, logger, serialConsole)

0 commit comments

Comments
 (0)