Skip to content

Commit 3de40be

Browse files
ybrodsky-rh and claude committed
Add KServe inference test suite for Neuron
Add a new Ginkgo test suite under tests/hw-accel/neuron/kserve/ that validates KServe InferenceService deployment and inference on AWS Neuron hardware with OpenShift AI. Test cases: - kserve-001: Deploy InferenceService and verify Ready state - kserve-002: Send inference request and validate response Depends on eco-goinfra KServe builders (rh-ecosystem-edge/eco-goinfra#1337). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent df020c0 commit 3de40be

5 files changed

Lines changed: 386 additions & 1 deletion

File tree

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
package do
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"time"
9+
10+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/clients"
11+
"github.com/rh-ecosystem-edge/eco-gotests/tests/hw-accel/neuron/params"
12+
corev1 "k8s.io/api/core/v1"
13+
apierrors "k8s.io/apimachinery/pkg/api/errors"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/wait"
16+
"k8s.io/klog/v2"
17+
)
18+
19+
// KServeInferenceConfig holds configuration for KServe inference requests.
type KServeInferenceConfig struct {
	// InferenceServiceURL is the base URL of the InferenceService;
	// "/v1/chat/completions" is appended when the request is sent.
	InferenceServiceURL string
	// Namespace is the namespace in which the temporary curl pod is created.
	Namespace string
	// ModelName is the model identifier embedded in the inference request body.
	ModelName string
	// Timeout bounds the overall inference attempt, including retries while
	// the model may still be compiling.
	Timeout time.Duration
}
26+
27+
// ExecuteKServeInference sends an inference request to a KServe InferenceService endpoint.
28+
// It creates a temporary curl pod, sends the request, and retries until success or timeout.
29+
func ExecuteKServeInference(apiClient *clients.Settings, config KServeInferenceConfig) (string, error) {
30+
ctx, cancel := context.WithTimeout(context.Background(), config.Timeout)
31+
defer cancel()
32+
33+
jsonBody, err := buildInferenceRequestBody(config.ModelName)
34+
if err != nil {
35+
return "", fmt.Errorf("failed to marshal inference request: %w", err)
36+
}
37+
38+
podName := "kserve-inference-test-curl"
39+
40+
if err := ensureCurlPod(ctx, apiClient, podName, config.Namespace); err != nil {
41+
return "", fmt.Errorf("failed to create curl pod: %w", err)
42+
}
43+
44+
defer cleanupCurlPod(apiClient, podName, config.Namespace)
45+
46+
endpoint := fmt.Sprintf("%s/v1/chat/completions", config.InferenceServiceURL)
47+
48+
curlCmd := []string{
49+
"curl", "-sk",
50+
"-X", "POST",
51+
endpoint,
52+
"-H", "Content-Type: application/json",
53+
"-d", string(jsonBody),
54+
"--max-time", "60",
55+
}
56+
57+
const retryInterval = 30 * time.Second
58+
59+
var inferenceResult string
60+
61+
pollErr := wait.PollUntilContextTimeout(
62+
ctx, retryInterval, config.Timeout, true,
63+
func(pollCtx context.Context) (bool, error) {
64+
execCtx, execCancel := context.WithTimeout(pollCtx, 90*time.Second)
65+
defer execCancel()
66+
67+
response, execErr := executeInPod(execCtx, apiClient, podName, config.Namespace, "curl", curlCmd)
68+
if execErr != nil {
69+
klog.V(params.NeuronLogLevel).Infof(
70+
"KServe inference attempt failed (model may still be compiling): %v", execErr)
71+
72+
return false, nil
73+
}
74+
75+
content, extractErr := extractInferenceContent(response)
76+
if extractErr != nil {
77+
klog.V(params.NeuronLogLevel).Infof(
78+
"KServe inference response not ready: %v", extractErr)
79+
80+
return false, nil
81+
}
82+
83+
inferenceResult = content
84+
85+
return true, nil
86+
})
87+
if pollErr != nil {
88+
return "", fmt.Errorf("KServe inference failed after %v: %w", config.Timeout, pollErr)
89+
}
90+
91+
return inferenceResult, nil
92+
}
93+
94+
// ensureCurlPod creates a long-running curl pod for executing inference requests.
95+
func ensureCurlPod(ctx context.Context, apiClient *clients.Settings, name, namespace string) error {
96+
_, err := apiClient.Pods(namespace).Get(ctx, name, metav1.GetOptions{})
97+
if err == nil {
98+
return nil
99+
}
100+
101+
pod := &corev1.Pod{
102+
ObjectMeta: metav1.ObjectMeta{
103+
Name: name,
104+
Namespace: namespace,
105+
Annotations: map[string]string{
106+
"sidecar.istio.io/inject": "false",
107+
},
108+
},
109+
Spec: corev1.PodSpec{
110+
Containers: []corev1.Container{
111+
{
112+
Name: "curl",
113+
Image: "registry.access.redhat.com/ubi9/ubi-minimal:latest",
114+
Command: []string{"sleep", "3600"},
115+
},
116+
},
117+
RestartPolicy: corev1.RestartPolicyNever,
118+
},
119+
}
120+
121+
_, err = apiClient.Pods(namespace).Create(ctx, pod, metav1.CreateOptions{})
122+
if err != nil && !apierrors.IsAlreadyExists(err) {
123+
return err
124+
}
125+
126+
return wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true,
127+
func(pollCtx context.Context) (bool, error) {
128+
p, getErr := apiClient.Pods(namespace).Get(pollCtx, name, metav1.GetOptions{})
129+
if getErr != nil {
130+
return false, nil
131+
}
132+
133+
return p.Status.Phase == corev1.PodRunning, nil
134+
})
135+
}
136+
137+
// cleanupCurlPod removes the temporary curl pod.
138+
func cleanupCurlPod(apiClient *clients.Settings, name, namespace string) {
139+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
140+
defer cancel()
141+
142+
err := apiClient.Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{})
143+
if err != nil && !apierrors.IsNotFound(err) {
144+
klog.V(params.NeuronLogLevel).Infof("Failed to delete curl pod: %v", err)
145+
}
146+
}
147+
148+
// ParseInferenceResponse parses a raw chat completions JSON response.
// It rejects responses carrying a top-level "error" field and otherwise
// returns the decoded body re-encoded as indented JSON (with a trailing
// newline from the encoder).
func ParseInferenceResponse(response string) (string, error) {
	var parsed map[string]interface{}
	if err := json.Unmarshal([]byte(response), &parsed); err != nil {
		return "", fmt.Errorf("failed to decode response: %w, raw: %s", err, response)
	}

	if apiErr, found := parsed["error"]; found {
		raw, _ := json.Marshal(apiErr)

		return "", fmt.Errorf("inference returned error: %s", string(raw))
	}

	var pretty bytes.Buffer

	indented := json.NewEncoder(&pretty)
	indented.SetIndent("", " ")

	if err := indented.Encode(parsed); err != nil {
		// Fall back to Go's default formatting rather than failing the test flow.
		return fmt.Sprintf("%v", parsed), nil
	}

	return pretty.String(), nil
}

tests/hw-accel/neuron/internal/neuronconfig/config.go

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ type NeuronConfig struct {
4343
InstanceType string
4444
// StorageClassName is the storage class for model PVC (default: gp3-csi).
4545
StorageClassName string
46+
// KServeModelName is the HuggingFace model for KServe inference tests.
47+
KServeModelName string
48+
// KServeVLLMImage is the vLLM Neuron image for the KServe ServingRuntime.
49+
KServeVLLMImage string
50+
// KServeNamespace is the namespace where KServe resources are deployed.
51+
KServeNamespace string
52+
// KServeTensorParallelSize is the tensor parallel size for KServe vLLM.
53+
KServeTensorParallelSize string
4654
}
4755

4856
// NewNeuronConfig creates a new NeuronConfig from environment variables.
@@ -65,6 +73,10 @@ func NewNeuronConfig() *NeuronConfig {
6573
ImageRepoSecretName: os.Getenv("ECO_HWACCEL_NEURON_IMAGE_REPO_SECRET"),
6674
InstanceType: os.Getenv("ECO_HWACCEL_NEURON_INSTANCE_TYPE"),
6775
StorageClassName: os.Getenv("ECO_HWACCEL_NEURON_STORAGE_CLASS"),
76+
KServeModelName: os.Getenv("ECO_HWACCEL_NEURON_KSERVE_MODEL_NAME"),
77+
KServeVLLMImage: os.Getenv("ECO_HWACCEL_NEURON_KSERVE_VLLM_IMAGE"),
78+
KServeNamespace: os.Getenv("ECO_HWACCEL_NEURON_KSERVE_NAMESPACE"),
79+
KServeTensorParallelSize: os.Getenv("ECO_HWACCEL_NEURON_KSERVE_TENSOR_PARALLEL_SIZE"),
6880
}
6981

7082
// Set defaults
@@ -86,10 +98,21 @@ func NewNeuronConfig() *NeuronConfig {
8698
}
8799

88100
if config.StorageClassName == "" {
89-
// Default storage class for ROSA/AWS
90101
config.StorageClassName = "gp3-csi"
91102
}
92103

104+
if config.KServeModelName == "" {
105+
config.KServeModelName = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
106+
}
107+
108+
if config.KServeNamespace == "" {
109+
config.KServeNamespace = "neuron-inference"
110+
}
111+
112+
if config.KServeTensorParallelSize == "" {
113+
config.KServeTensorParallelSize = "1"
114+
}
115+
93116
klog.V(params.NeuronLogLevel).Infof("NeuronConfig loaded: DriversImage=%s, DevicePluginImage=%s, NodeMetricsImage=%s",
94117
config.DriversImage, config.DevicePluginImage, config.NodeMetricsImage)
95118

@@ -110,3 +133,8 @@ func (c *NeuronConfig) IsVLLMConfigured() bool {
110133
func (c *NeuronConfig) IsUpgradeConfigured() bool {
111134
return c.UpgradeTargetVersion != "" && c.UpgradeTargetDriversImage != ""
112135
}
136+
137+
// IsKServeConfigured checks if KServe testing configuration is present.
138+
func (c *NeuronConfig) IsKServeConfigured() bool {
139+
return c.HuggingFaceToken != "" && c.KServeNamespace != ""
140+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package tsparams
2+
3+
import "time"
4+
5+
const (
	// LabelSuite is the label identifying the KServe test suite.
	LabelSuite = "kserve"

	// KServeTestNamespace is the namespace used for KServe test resources.
	KServeTestNamespace = "neuron-inference"

	// ServingRuntimeName is the name of the vLLM Neuron ServingRuntime.
	ServingRuntimeName = "vllm-neuron-runtime"

	// ModelFormatName is the model format declared by the ServingRuntime.
	ModelFormatName = "vllm-neuron"

	// ServiceAccountName is the service account used by KServe test resources.
	ServiceAccountName = "kserve-neuron-sa"

	// InferenceServiceReadyTimeout bounds the wait for the InferenceService
	// to reach Ready; generous because model compilation on Neuron is slow.
	InferenceServiceReadyTimeout = 30 * time.Minute

	// InferenceRequestTimeout bounds a single inference request/retry cycle.
	InferenceRequestTimeout = 5 * time.Minute

	// CurlPodName is the name of the temporary curl pod used to send requests.
	CurlPodName = "kserve-inference-test-curl"
)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package kserve
2+
3+
import (
4+
"runtime"
5+
"testing"
6+
7+
. "github.com/onsi/ginkgo/v2"
8+
. "github.com/onsi/gomega"
9+
"github.com/rh-ecosystem-edge/eco-goinfra/pkg/reportxml"
10+
_ "github.com/rh-ecosystem-edge/eco-gotests/tests/hw-accel/neuron/kserve/tests"
11+
. "github.com/rh-ecosystem-edge/eco-gotests/tests/internal/inittools"
12+
)
13+
14+
// currentFile is the path of this source file, used to derive the JUnit report path.
var _, currentFile, _, _ = runtime.Caller(0)
16+
func TestKServe(t *testing.T) {
17+
_, reporterConfig := GinkgoConfiguration()
18+
reporterConfig.JUnitReport = GeneralConfig.GetJunitReportPath(currentFile)
19+
20+
RegisterFailHandler(Fail)
21+
RunSpecs(t, "Neuron KServe Suite", reporterConfig)
22+
}
23+
24+
// Suite-level setup hook; currently only logs the setup step.
var _ = BeforeSuite(func() {
	By("Setting up Neuron KServe test suite")
})

// Suite-level teardown hook; currently only logs the teardown step.
var _ = AfterSuite(func() {
	By("Tearing down Neuron KServe test suite")
})

// Emit the report XML once the whole suite has finished.
var _ = ReportAfterSuite("", func(report Report) {
	reportxml.Create(report, GeneralConfig.GetReportPath(), GeneralConfig.TCPrefix)
})

0 commit comments

Comments
 (0)