Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cli/cmd/ctl/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/beclab/Olares/cli/cmd/ctl/user"
"github.com/beclab/Olares/cli/version"
"github.com/spf13/cobra"
"github.com/beclab/Olares/cli/cmd/ctl/nfd"
)

func NewDefaultCommand() *cobra.Command {
Expand Down
16 changes: 16 additions & 0 deletions cli/pkg/gpu/amd/embed.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package amd

import _ "embed"

// Source of truth file: put your AMD plugin YAML here and keep it versioned.
//go:generate bash -c "mkdir -p assets && cp -f ../../../infrastructure/gpu/.olares/config/amd/amd-gpu-device-plugin.yaml assets/amd-gpu-device-plugin.yaml"

//go:embed assets/amd-gpu-device-plugin.yaml
var amdManifest []byte

// mustManifestYAML returns the embedded AMD GPU device-plugin manifest.
// It panics when the embedded payload is empty; the panic message tells the
// developer how to regenerate the asset.
func mustManifestYAML() []byte {
	if len(amdManifest) > 0 {
		return amdManifest
	}
	panic("embedded AMD GPU manifest is empty; run `go generate ./cli/pkg/gpu/amd` to populate it")
}
41 changes: 41 additions & 0 deletions cli/pkg/gpu/amd/module.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package amd

import (
"time"

"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
)

// Module wires the tasks that install the AMD GPU device plugin (ROCm).
type Module struct {
	common.KubeModule
}

// Init registers three tasks in execution order: write the manifest, apply
// it, and wait for the rollout. Every task targets the master hosts and is
// gated by the OnlyFirstMaster prepare; only the wait task sets a timeout.
func (m *Module) Init() {
	m.Tasks = []task.Interface{
		&task.KubeTask{
			Name:    "WriteAmdGpuManifest",
			Hosts:   m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{new(common.OnlyFirstMaster)},
			Action:  new(WriteManifest),
		},
		&task.KubeTask{
			Name:    "ApplyAmdGpuManifest",
			Hosts:   m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{new(common.OnlyFirstMaster)},
			Action:  new(ApplyManifest),
		},
		&task.KubeTask{
			Name:    "WaitAmdGpuReady",
			Hosts:   m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{new(common.OnlyFirstMaster)},
			Action:  new(WaitReady),
			Timeout: 5 * time.Minute,
		},
	}
}
83 changes: 83 additions & 0 deletions cli/pkg/gpu/amd/tasks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package amd

import (
	"encoding/base64"
	"fmt"
	"path/filepath"
	"strings"

	"github.com/beclab/Olares/cli/pkg/common"
	"github.com/beclab/Olares/cli/pkg/core/connector"
	"github.com/beclab/Olares/cli/pkg/core/logger"
)

// Identifiers and paths shared by the AMD GPU device-plugin tasks in this file.
const (
nsName = "kube-system" // namespace passed to `kubectl rollout status`; most AMD plugin examples live here, change it if your YAML uses another namespace
dsName = "amd-gpu-device-plugin" // DaemonSet whose rollout WaitReady waits for; must match the DaemonSet metadata.name in your YAML
manifestOnHost = "/etc/kubernetes/addons/amd-gpu.yaml" // path the manifest is written to and applied from (destination on the first control-plane node)
)

// -------- Detect helpers (used by orchestrator) --------

// HasAmdKernelBits reports whether the host exposes /dev/kfd or lists a
// module starting with "amdgpu" in lsmod. Each probe prints "yes" on success;
// command errors are ignored on purpose so that a failed probe simply counts
// as "not found".
func HasAmdKernelBits(r connector.Runtime) bool {
	probes := []string{
		"test -e /dev/kfd && echo yes || true",
		"lsmod | grep -q '^amdgpu' && echo yes || true",
	}
	for _, probe := range probes {
		if out, _ := r.GetRunner().SudoCmd(probe, false, false); strings.Contains(out, "yes") {
			return true
		}
	}
	return false
}

// HasAmdPci reports whether lspci lists a VGA/3D/display controller whose PCI
// vendor ID is 1002 (AMD/ATI). It returns false when the probe command fails.
//
// Devices are selected by vendor ID rather than by name: a case-insensitive
// match on "ati" also hits the word "Corporation", so controllers from
// "Intel Corporation" or "NVIDIA Corporation" would be reported as AMD.
func HasAmdPci(r connector.Runtime) bool {
	// grep -q keeps the lspci text out of the output, so "yes" can only come
	// from the trailing echo.
	const probe = `lspci -nn -d 1002: | grep -Eqi 'vga|3d|display' && echo yes || true`
	out, err := r.GetRunner().SudoCmd(probe, false, false)
	return err == nil && strings.Contains(out, "yes")
}

// -------- Tasks --------

// WriteManifest stores the embedded AMD GPU device-plugin manifest on the host.
type WriteManifest struct{ common.KubeAction }

// Execute creates the parent directory of manifestOnHost and writes the
// embedded manifest to that path.
//
// The payload is sent base64-encoded and decoded on the host instead of being
// pasted into a shell heredoc: raw YAML inside a command line is cut short by
// any line equal to the heredoc delimiter and can be altered by whatever
// quoting or wrapping the runner applies to the command. The base64 alphabet
// contains no shell metacharacters, so the file content is reproduced exactly.
// Requires the `base64` utility (coreutils) on the target host.
func (a *WriteManifest) Execute(runtime connector.Runtime) error {
	dir := filepath.Dir(manifestOnHost)
	if _, err := runtime.GetRunner().SudoCmd(fmt.Sprintf("mkdir -p %s", dir), false, false); err != nil {
		return fmt.Errorf("creating directory %s: %w", dir, err)
	}

	encoded := base64.StdEncoding.EncodeToString(mustManifestYAML())
	cmd := fmt.Sprintf("echo '%s' | base64 -d > %s", encoded, manifestOnHost)
	if _, err := runtime.GetRunner().SudoCmd(cmd, false, false); err != nil {
		return fmt.Errorf("writing manifest %s: %w", manifestOnHost, err)
	}
	return nil
}

// ApplyManifest applies the manifest located at manifestOnHost with kubectl.
type ApplyManifest struct{ common.KubeAction }

// Execute runs `kubectl apply -f` on manifestOnHost. The kubectl path comes
// from the pipeline cache; when that yields an empty string the binary under
// common.BinDir is used instead.
func (a *ApplyManifest) Execute(runtime connector.Runtime) error {
	// The lookup error is dropped deliberately: an empty path triggers the fallback.
	kubectlPath, _ := a.PipelineCache.GetMustString(common.CacheCommandKubectlPath)
	if kubectlPath == "" {
		kubectlPath = filepath.Join(common.BinDir, common.CommandKubectl)
	}
	_, err := runtime.GetRunner().SudoCmd(kubectlPath+" apply -f "+manifestOnHost, false, false)
	return err
}

// WaitReady blocks until the AMD GPU DaemonSet reports a finished rollout.
type WaitReady struct{ common.KubeAction }

// Execute runs `kubectl rollout status` for ds/dsName in nsName with a 300s
// timeout. A command error is tolerated when the output still contains
// "successfully rolled out"; otherwise the error is logged and returned.
func (a *WaitReady) Execute(runtime connector.Runtime) error {
	// The lookup error is dropped deliberately: an empty path triggers the fallback.
	kubectlPath, _ := a.PipelineCache.GetMustString(common.CacheCommandKubectlPath)
	if kubectlPath == "" {
		kubectlPath = filepath.Join(common.BinDir, common.CommandKubectl)
	}

	rollout := fmt.Sprintf("%s -n %s rollout status ds/%s --timeout=300s", kubectlPath, nsName, dsName)
	out, err := runtime.GetRunner().SudoCmd(rollout, false, false)
	if err == nil || strings.Contains(out, "successfully rolled out") {
		return nil
	}
	logger.Errorf("waiting for AMD GPU DS failed: %v (%s)", err, out)
	return err
}
98 changes: 98 additions & 0 deletions cli/pkg/gpu/amd_plugin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package gpu

import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/connector"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
)

// Settings for applying the official AMD ROCm device-plugin DaemonSet when an
// AMD GPU is detected.
// Ref: https://github.com/ROCm/k8s-device-plugin (applied straight from the web):
//   kubectl apply -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml
// The DaemonSet waited for is amdgpu-device-plugin-daemonset in namespace kube-system.
// NOTE(review): the URL tracks the `master` branch, so the applied manifest is
// not pinned to a release — confirm this is intended.
const (
amdDevicePluginURL = "https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml"
amdDPDaemonSetName = "amdgpu-device-plugin-daemonset"
amdDPNamespace = "kube-system"
)

// InstallAmdDevicePluginModule applies the AMD device-plugin DaemonSet and
// waits for its rollout.
type InstallAmdDevicePluginModule struct{ common.KubeModule }

// Init names the module and registers two tasks, apply then wait. Both target
// the master hosts and are gated by OnlyFirstMaster and AmdGpuPresent.
func (m *InstallAmdDevicePluginModule) Init() {
	m.Name = "InstallAmdDevicePlugin"

	// gate builds a fresh prepare collection for each task.
	gate := func() *prepare.PrepareCollection {
		return &prepare.PrepareCollection{
			new(common.OnlyFirstMaster),
			new(AmdGpuPresent),
		}
	}

	m.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:    "ApplyAmdDevicePlugin",
			Hosts:   m.Runtime.GetHostsByRole(common.Master),
			Prepare: gate(),
			Action:  new(ApplyAmdDevicePlugin),
			Retry:   1,
			Timeout: 2 * time.Minute,
		},
		&task.RemoteTask{
			Name:    "WaitAmdDevicePluginReady",
			Hosts:   m.Runtime.GetHostsByRole(common.Master),
			Prepare: gate(),
			Action:  new(WaitAmdDevicePluginReady),
			Timeout: 5 * time.Minute,
		},
	}
}

// ---- detection prepare ----

// AmdGpuPresent is the prepare check used by InstallAmdDevicePluginModule to
// detect an AMD GPU on the host.
type AmdGpuPresent struct{ common.KubePrepare }

// PreCheck returns true as soon as one probe prints "yes": /dev/kfd exists,
// lsmod lists a module starting with "amdgpu", or lspci shows a
// VGA/3D/display controller with PCI vendor ID 1002 (AMD/ATI). It never
// returns an error; a failing probe counts as "not found".
//
// The lspci probe filters by vendor ID instead of matching the vendor name:
// a case-insensitive match on "ati" also hits "Corporation", which made
// Intel and NVIDIA controllers look like AMD GPUs.
func (p *AmdGpuPresent) PreCheck(runtime connector.Runtime) (bool, error) {
	checks := []string{
		"test -e /dev/kfd && echo yes || true",
		"lsmod | grep -q '^amdgpu' && echo yes || true",
		`lspci -nn -d 1002: | grep -Eqi 'vga|3d|display' && echo yes || true`,
	}
	for _, c := range checks {
		// Probe errors are ignored on purpose: detection is best-effort.
		out, _ := runtime.GetRunner().SudoCmd(c, false, false)
		if strings.Contains(out, "yes") {
			return true, nil
		}
	}
	return false, nil
}

// ---- actions ----

// ApplyAmdDevicePlugin applies the manifest at amdDevicePluginURL with kubectl.
type ApplyAmdDevicePlugin struct{ common.KubeAction }

// Execute runs `kubectl apply -f amdDevicePluginURL`. The kubectl path comes
// from the pipeline cache, falling back to "kubectl" under common.BinDir when
// the cached value is empty.
func (a *ApplyAmdDevicePlugin) Execute(runtime connector.Runtime) error {
	// The lookup error is dropped deliberately: an empty path triggers the fallback.
	kubectlPath, _ := a.PipelineCache.GetMustString(common.CacheCommandKubectlPath)
	if kubectlPath == "" {
		kubectlPath = filepath.Join(common.BinDir, "kubectl")
	}
	applyCmd := fmt.Sprintf("%s apply -f %s", kubectlPath, amdDevicePluginURL)
	_, err := runtime.GetRunner().SudoCmd(applyCmd, false, false)
	return err
}

// WaitAmdDevicePluginReady waits for the AMD device-plugin DaemonSet rollout.
type WaitAmdDevicePluginReady struct{ common.KubeAction }

// Execute runs `kubectl rollout status` for ds/amdDPDaemonSetName in
// amdDPNamespace with a 300s timeout and returns the command's error as is.
func (a *WaitAmdDevicePluginReady) Execute(runtime connector.Runtime) error {
	// The lookup error is dropped deliberately: an empty path triggers the fallback.
	kubectlPath, _ := a.PipelineCache.GetMustString(common.CacheCommandKubectlPath)
	if kubectlPath == "" {
		kubectlPath = filepath.Join(common.BinDir, "kubectl")
	}
	rollout := fmt.Sprintf("%s -n %s rollout status ds/%s --timeout=300s", kubectlPath, amdDPNamespace, amdDPDaemonSetName)
	_, err := runtime.GetRunner().SudoCmd(rollout, false, false)
	return err
}
88 changes: 88 additions & 0 deletions cli/pkg/gpu/install.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package gpu

import (
"fmt"
"strings"

"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/logger"
"github.com/beclab/Olares/cli/pkg/core/pipeline"

"github.com/beclab/Olares/cli/pkg/gpu/amd"
// import your existing NVIDIA package here; assumed path:
nvidia "github.com/beclab/Olares/cli/pkg/gpu/nvidia"
)

// Vendor names the GPU vendor whose components Install should set up.
type Vendor string

// Vendor values recognised by Install.
const (
VendorAuto Vendor = "auto" // detect the vendor(s) on the first control-plane node
VendorAMD Vendor = "amd" // run the AMD module
VendorNVIDIA Vendor = "nvidia" // run the NVIDIA module
)

// InstallOptions carries the settings accepted by Install.
type InstallOptions struct {
Vendor Vendor // auto|amd|nvidia; Install lower-cases the value and treats an empty one as auto
}

// Install sets up GPU support for the vendor selected in opt.
//
// The vendor is compared case-insensitively after trimming surrounding
// whitespace, and an empty value means VendorAuto. With VendorAMD or
// VendorNVIDIA the matching module is run directly. With VendorAuto the first
// control-plane node is probed and every vendor found there is installed
// (NVIDIA first, then AMD); an error is returned when neither is detected.
//
// Any other vendor string is rejected with an error rather than being treated
// as "auto", so a typo cannot silently trigger auto-detection.
func Install(runtime common.KubeRuntime, opt InstallOptions) error {
	v := Vendor(strings.ToLower(strings.TrimSpace(string(opt.Vendor))))
	if v == "" {
		v = VendorAuto
	}
	logger.Infof("GPU install requested (vendor=%s)", v)

	switch v {
	case VendorAMD:
		return runAmd(runtime)
	case VendorNVIDIA:
		return runNvidia(runtime)
	case VendorAuto:
		// Auto-detect on the first control-plane node.
		first := runtime.GetFirstMaster()
		r := runtime.GetConnector().GetRuntime(first)

		amdFound := amd.HasAmdKernelBits(r) || amd.HasAmdPci(r)
		nvFound := hasNvidia(r)

		if !amdFound && !nvFound {
			return fmt.Errorf("no AMD or NVIDIA GPU detected on first control-plane node")
		}
		if nvFound {
			if err := runNvidia(runtime); err != nil {
				return err
			}
		}
		if amdFound {
			if err := runAmd(runtime); err != nil {
				return err
			}
		}
		return nil
	default:
		return fmt.Errorf("unsupported GPU vendor %q (expected %q, %q or %q)", v, VendorAuto, VendorAMD, VendorNVIDIA)
	}
}

// ---- NVIDIA detection (mirror your existing helpers if you have them) ----

// hasNvidia reports whether lsmod lists a module starting with "nvidia" or
// lspci shows a VGA/3D/display controller mentioning nvidia. Probe errors are
// ignored on purpose; a failed probe counts as "not found".
func hasNvidia(r common.Runtime) bool {
	probes := []string{
		`lsmod | grep -q '^nvidia' && echo yes || true`,
		`lspci -nn | egrep -i 'vga|3d|display' | grep -qi nvidia && echo yes || true`,
	}
	for _, probe := range probes {
		out, _ := r.GetRunner().SudoCmd(probe, false, false)
		if strings.Contains(out, "yes") {
			return true
		}
	}
	return false
}

// ---- runners ----
// runAmd initialises the AMD module with the given runtime and runs it through
// the pipeline.
func runAmd(runtime common.KubeRuntime) error {
	mod := new(amd.Module)
	mod.Runtime = runtime
	mod.Init()
	return pipeline.Run(mod)
}

// runNvidia initialises the NVIDIA module with the given runtime and runs it
// through the pipeline.
func runNvidia(runtime common.KubeRuntime) error {
	mod := new(nvidia.Module)
	mod.Runtime = runtime
	mod.Init()
	return pipeline.Run(mod)
}
20 changes: 20 additions & 0 deletions cli/pkg/ndf/install.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package nfd

import (
"log"

"github.com/beclab/Olares/cli/pkg/pipelines"
"github.com/spf13/cobra"
)

// NewCmdInstallNfd builds the "install" subcommand. Running it calls
// pipelines.InstallNFD and terminates the process through log.Fatalf when
// that call returns an error.
func NewCmdInstallNfd() *cobra.Command {
	run := func(cmd *cobra.Command, args []string) {
		err := pipelines.InstallNFD()
		if err == nil {
			return
		}
		log.Fatalf("error: %v", err)
	}

	return &cobra.Command{
		Use:   "install",
		Short: "Install Node Feature Discovery",
		Run:   run,
	}
}
Loading
Loading