Proposal: new CSI integration design #2806

@yanndegat

Description

Hi.

I'm wondering whether the current CSI approach relies too heavily on end-user pod customization.
Moreover, it requires pods to run in privileged mode, which may not be acceptable in many Kubernetes environments.

What would you think of a new approach where the peer pod VM embeds a new systemd agent service responsible for mounting the volumes into the containers, according to the pod's spec?

I've managed to build a proof of concept that works this way:

The cloud-api-adaptor generates a volumes.json spec file that is passed to the VM via cloud-init:

        podVolumes, err := s.volumeService.GetVolumesForPod(ctx, podNamespace, podName)
        if err != nil {
            return fmt.Errorf("failed to get volumes for pod %s/%s: %w", podNamespace, podName, err)
        }

        volumesConfig := volumes.VolumesConfig{
            Volumes: make([]volumes.VolumeConfig, len(podVolumes)),
        }
        for i, vol := range podVolumes {
            volumesConfig.Volumes[i] = volumes.VolumeConfig{
                VolumeID:          vol.VolumeID,
                VolumeName:        vol.VolumeName,
                FSType:            vol.FSType,
                MountPath:         vol.TargetPath,
                StagingTargetPath: vol.StagingTargetPath,
                MountOptions:      vol.MountOptions,
                ReadOnly:          vol.ReadOnly,
                FSGroup:           fsGroup,
            }
        }

        volumesJSON, err := json.MarshalIndent(volumesConfig, "", "    ")
        if err != nil {
            return fmt.Errorf("failed to marshal volumes config: %w", err)
        }
        cloudConfig.WriteFiles = append(cloudConfig.WriteFiles, cloudinit.WriteFile{
            Path:    "/run/peerpod/volumes.json",
            Content: string(volumesJSON),
        })
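
For illustration, the generated spec could look something like this. This is a purely hypothetical example (all values invented, and it assumes VolumeConfig carries no json struct tags, so json.MarshalIndent emits the exported Go field names):

    {
        "Volumes": [
            {
                "VolumeID": "8b2e6c1d-1234-5678-9abc-def012345678",
                "VolumeName": "data",
                "FSType": "ext4",
                "MountPath": "/var/lib/kubelet/pods/<pod-uid>/volumes/kubernetes.io~csi/data/mount",
                "StagingTargetPath": "/var/lib/kubelet/plugins/kubernetes.io/csi/staging/data",
                "MountOptions": ["noatime"],
                "ReadOnly": false,
                "FSGroup": 1000
            }
        ]
    }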

This file is then consumed by the new peerpod-volume-agent:

    src/cloud-api-adaptor/cmd/peerpod-volume-agent/
// SPDX-License-Identifier: Apache-2.0

package main

import (
        "encoding/json"
        "fmt"
        "log"
        "os"
        "os/exec"
        "path/filepath"
        "strings"
        "time"

        "github.com/confidential-containers/cloud-api-adaptor/src/cloud-api-adaptor/pkg/paths"
        "github.com/confidential-containers/cloud-api-adaptor/src/cloud-api-adaptor/pkg/volumes"
)

var logger = log.New(os.Stdout, "[peerpod-volume-agent] ", log.LstdFlags|log.Lmsgprefix)

const (
        // Maximum time to wait for a device to appear
        deviceWaitTimeout = 5 * time.Minute
        // Poll interval when waiting for devices
        devicePollInterval = 2 * time.Second
)

func main() {
        logger.Println("Starting peerpod-volume-agent")

        // Check if volumes config exists
        if _, err := os.Stat(paths.VolumesConfigPath); os.IsNotExist(err) {
                logger.Printf("No volume config found at %s, nothing to do", paths.VolumesConfigPath)
                signalReady()
                return
        }

        // Load volume configuration
        config, err := loadVolumeConfig(paths.VolumesConfigPath)
        if err != nil {
                logger.Fatalf("Failed to load volume config: %v", err)
        }

        if len(config.Volumes) == 0 {
                logger.Println("No volumes configured, nothing to do")
                signalReady()
                return
        }

        logger.Printf("Processing %d volume(s)", len(config.Volumes))

        // Ensure mount directory exists
        if err := os.MkdirAll(paths.VolumesMountDir, 0755); err != nil {
                logger.Fatalf("Failed to create mount directory %s: %v", paths.VolumesMountDir, err)
        }

        // Process each volume
        for i, vol := range config.Volumes {
                logger.Printf("Processing volume %d/%d: %s", i+1, len(config.Volumes), vol.VolumeID)

                if err := processVolume(vol); err != nil {
                        logger.Fatalf("Failed to process volume %s: %v", vol.VolumeID, err)
                }
        }

        logger.Println("All volumes processed successfully")
        signalReady()
}

func loadVolumeConfig(configPath string) (*volumes.VolumesConfig, error) {
        data, err := os.ReadFile(configPath)
        if err != nil {
                return nil, fmt.Errorf("failed to read config file: %w", err)
        }

        var config volumes.VolumesConfig
        if err := json.Unmarshal(data, &config); err != nil {
                return nil, fmt.Errorf("failed to parse config: %w", err)
        }

        return &config, nil
}

func processVolume(vol volumes.VolumeConfig) error {
        // Wait for device to appear
        devicePath, err := waitForDevice(vol.VolumeID, vol.DevicePath)
        if err != nil {
                return fmt.Errorf("failed waiting for device: %w", err)
        }
        logger.Printf("Found device %s for volume %s", devicePath, vol.VolumeID)

        // Ensure filesystem exists
        if err := ensureFilesystem(devicePath, vol.FSType); err != nil {
                return fmt.Errorf("failed to ensure filesystem: %w", err)
        }

        // Mount to staging path
        stagingPath := filepath.Join(paths.VolumesMountDir, vol.VolumeID)
        if err := mountVolume(devicePath, stagingPath, vol.FSType, vol.MountOptions, vol.ReadOnly); err != nil {
                return fmt.Errorf("failed to mount volume: %w", err)
        }
        logger.Printf("Mounted volume %s at %s", vol.VolumeID, stagingPath)

        // Set permissions on the mount point to allow container access
        // This mimics kubelet behavior with fsGroup
        if !vol.ReadOnly {
                if vol.FSGroup != nil {
                        // fsGroup is set - chown to root:<fsGroup> and chmod 0775
                        // This allows processes running as that group to read/write
                        gid := int(*vol.FSGroup)
                        if err := os.Chown(stagingPath, 0, gid); err != nil {
                                logger.Printf("Warning: failed to chown mount point to gid %d: %v", gid, err)
                        } else {
                                logger.Printf("Set ownership root:%d on %s", gid, stagingPath)
                        }
                        if err := os.Chmod(stagingPath, 0775); err != nil {
                                logger.Printf("Warning: failed to chmod mount point: %v", err)
                        } else {
                                logger.Printf("Set permissions 0775 on %s", stagingPath)
                        }
                } else {
                        // No fsGroup - set world-writable as fallback
                        if err := os.Chmod(stagingPath, 0777); err != nil {
                                logger.Printf("Warning: failed to chmod mount point: %v", err)
                        } else {
                                logger.Printf("Set permissions 0777 on %s (no fsGroup)", stagingPath)
                        }
                }
        }

        // Create bind mount at the expected kubelet path (what kata-agent expects)
        if vol.MountPath != "" {
                if err := bindMount(stagingPath, vol.MountPath, vol.ReadOnly); err != nil {
                        return fmt.Errorf("failed to create bind mount at %s: %w", vol.MountPath, err)
                }
                logger.Printf("Created bind mount from %s to %s", stagingPath, vol.MountPath)
        }

        return nil
}

// waitForDevice waits for a block device to appear by volume ID or explicit path
func waitForDevice(volumeID, explicitPath string) (string, error) {
        deadline := time.Now().Add(deviceWaitTimeout)

        for time.Now().Before(deadline) {
                // If explicit path is provided, check it first
                if explicitPath != "" {
                        if _, err := os.Stat(explicitPath); err == nil {
                                return explicitPath, nil
                        }
                }

                // Try to find device by volume ID in /dev/disk/by-id/
                // OpenStack/Cinder uses virtio-<volumeID[:20]> format
                devicePath, err := findDeviceByVolumeID(volumeID)
                if err == nil && devicePath != "" {
                        return devicePath, nil
                }

                logger.Printf("Waiting for device for volume %s...", volumeID)
                time.Sleep(devicePollInterval)
        }

        return "", fmt.Errorf("timeout waiting for device for volume %s", volumeID)
}

// findDeviceByVolumeID searches for a device by volume ID in /dev/disk/by-id/
func findDeviceByVolumeID(volumeID string) (string, error) {
        byIDPath := "/dev/disk/by-id"

        entries, err := os.ReadDir(byIDPath)
        if err != nil {
                if os.IsNotExist(err) {
                        return "", nil // Directory doesn't exist yet
                }
                return "", err
        }

        // OpenStack/Cinder typically uses the first 20 chars of volume ID
        // Format: virtio-<volumeID[:20]>
        shortID := volumeID
        if len(shortID) > 20 {
                shortID = shortID[:20]
        }

        // Also check without dashes (some hypervisors strip them)
        shortIDNoDash := strings.ReplaceAll(shortID, "-", "")

        for _, entry := range entries {
                name := entry.Name()

                // Check for virtio-<volumeID> pattern
                if strings.HasPrefix(name, "virtio-") {
                        suffix := strings.TrimPrefix(name, "virtio-")
                        if strings.HasPrefix(suffix, shortID) || strings.HasPrefix(suffix, shortIDNoDash) {
                                linkPath := filepath.Join(byIDPath, name)
                                // Resolve the symlink to get actual device path
                                realPath, err := filepath.EvalSymlinks(linkPath)
                                if err != nil {
                                        continue
                                }
                                return realPath, nil
                        }
                }

                // Check for scsi- pattern (some cloud providers use this)
                if strings.HasPrefix(name, "scsi-") && strings.Contains(name, shortID) {
                        linkPath := filepath.Join(byIDPath, name)
                        realPath, err := filepath.EvalSymlinks(linkPath)
                        if err != nil {
                                continue
                        }
                        return realPath, nil
                }
        }

        return "", nil
}

// ensureFilesystem checks if a filesystem exists on the device and creates one if needed
func ensureFilesystem(devicePath, fsType string) error {
        if fsType == "" {
                fsType = "ext4"
        }

        // Check if device already has a filesystem using blkid
        cmd := exec.Command("blkid", "-o", "value", "-s", "TYPE", devicePath)
        output, err := cmd.Output()
        if err == nil && len(output) > 0 {
                existingFS := strings.TrimSpace(string(output))
                logger.Printf("Device %s already has filesystem: %s", devicePath, existingFS)
                return nil
        }

        // No filesystem found, create one
        logger.Printf("Creating %s filesystem on %s", fsType, devicePath)

        var mkfsCmd *exec.Cmd
        switch fsType {
        case "ext4":
                mkfsCmd = exec.Command("mkfs.ext4", "-F", devicePath)
        case "xfs":
                mkfsCmd = exec.Command("mkfs.xfs", "-f", devicePath)
        case "ext3":
                mkfsCmd = exec.Command("mkfs.ext3", "-F", devicePath)
        default:
                return fmt.Errorf("unsupported filesystem type: %s", fsType)
        }

        mkfsCmd.Stdout = os.Stdout
        mkfsCmd.Stderr = os.Stderr

        if err := mkfsCmd.Run(); err != nil {
                return fmt.Errorf("mkfs failed: %w", err)
        }

        logger.Printf("Created %s filesystem on %s", fsType, devicePath)
        return nil
}

// mountVolume mounts a device to the specified path
func mountVolume(devicePath, mountPath, fsType string, options []string, readOnly bool) error {
        // Default the filesystem type to match ensureFilesystem, so that
        // "mount -t" is never passed an empty string
        if fsType == "" {
                fsType = "ext4"
        }

        // Create mount point directory
        if err := os.MkdirAll(mountPath, 0755); err != nil {
                return fmt.Errorf("failed to create mount point: %w", err)
        }

        // Check if already mounted
        if isMounted(mountPath) {
                logger.Printf("Path %s is already mounted", mountPath)
                return nil
        }

        // Build mount options
        mountOpts := []string{}
        if readOnly {
                mountOpts = append(mountOpts, "ro")
        }
        mountOpts = append(mountOpts, options...)

        // Build mount command
        args := []string{"-t", fsType}
        if len(mountOpts) > 0 {
                args = append(args, "-o", strings.Join(mountOpts, ","))
        }
        args = append(args, devicePath, mountPath)

        cmd := exec.Command("mount", args...)
        cmd.Stdout = os.Stdout
        cmd.Stderr = os.Stderr

        if err := cmd.Run(); err != nil {
                return fmt.Errorf("mount failed: %w", err)
        }

        return nil
}

// isMounted checks if a path is a mount point
func isMounted(path string) bool {
        cmd := exec.Command("mountpoint", "-q", path)
        return cmd.Run() == nil
}

// bindMount creates a bind mount from source to target
func bindMount(source, target string, readOnly bool) error {
        // Create target directory (and all parent directories)
        if err := os.MkdirAll(target, 0755); err != nil {
                return fmt.Errorf("failed to create target directory: %w", err)
        }

        // Check if already mounted
        if isMounted(target) {
                logger.Printf("Path %s is already mounted", target)
                return nil
        }

        cmd := exec.Command("mount", "--bind", source, target)
        cmd.Stdout = os.Stdout
        cmd.Stderr = os.Stderr

        if err := cmd.Run(); err != nil {
                return fmt.Errorf("bind mount failed: %w", err)
        }

        // A plain bind mount silently ignores "ro", so remount read-only in a
        // second step to make the flag effective
        if readOnly {
                cmd = exec.Command("mount", "-o", "remount,ro,bind", target)
                cmd.Stdout = os.Stdout
                cmd.Stderr = os.Stderr
                if err := cmd.Run(); err != nil {
                        return fmt.Errorf("failed to remount read-only: %w", err)
                }
        }

        return nil
}

// signalReady creates the ready file to signal that all volumes are mounted
func signalReady() {
        // Create parent directory if needed
        if err := os.MkdirAll(filepath.Dir(paths.VolumesReadyPath), 0755); err != nil {
                logger.Printf("Warning: failed to create ready file directory: %v", err)
                return
        }

        if err := os.WriteFile(paths.VolumesReadyPath, []byte("ready\n"), 0644); err != nil {
                logger.Printf("Warning: failed to write ready file: %v", err)
                return
        }

        logger.Printf("Signaled ready at %s", paths.VolumesReadyPath)
}
    src/cloud-api-adaptor/podvm/files/etc/systemd/system/multi-user.target.wants/peerpod-volume-agent.service
    src/cloud-api-adaptor/podvm/files/etc/systemd/system/peerpod-volume-agent.service
[Unit]
Description=Peer Pod Volume Agent
Documentation=https://github.com/confidential-containers/cloud-api-adaptor
After=network.target cloud-init.service process-user-data.service
Before=kata-agent.service
ConditionPathExists=/run/peerpod/volumes.json

[Service]
Type=oneshot
ExecStart=/usr/local/bin/peerpod-volume-agent
RemainAfterExit=yes
TimeoutStartSec=300
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
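
The agent's signalReady() writes a marker file at paths.VolumesReadyPath once all volumes are mounted. The unit's Before=kata-agent.service already handles ordering at boot, but a component that can't rely on systemd unit ordering could poll that marker instead. A minimal sketch, written as if it sat alongside the agent code above (so it reuses the fmt, os, time, and paths imports and the devicePollInterval constant already shown); waitForVolumesReady is a hypothetical helper, not part of the PoC:

    // waitForVolumesReady blocks until the agent's ready file appears or the
    // timeout elapses, for callers that cannot rely on systemd unit ordering.
    func waitForVolumesReady(timeout time.Duration) error {
            deadline := time.Now().Add(timeout)
            for time.Now().Before(deadline) {
                    if _, err := os.Stat(paths.VolumesReadyPath); err == nil {
                            return nil
                    }
                    time.Sleep(devicePollInterval)
            }
            return fmt.Errorf("timed out waiting for ready file %s", paths.VolumesReadyPath)
    }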
