Skip to content

Commit d67570e

Browse files
Updating e2e test
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent 338ee25 commit d67570e

File tree

9 files changed

+134
-35
lines changed

9 files changed

+134
-35
lines changed

.github/workflows/e2e.yml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
name: End-to-end Tests
1616

1717
on:
18+
push:
19+
branches:
20+
- e2e-*
1821
workflow_run:
1922
workflows: [Image]
2023
types:
@@ -23,12 +26,13 @@ on:
2326
- "pull-request/[0-9]+"
2427
- main
2528
- release-*
29+
- e2e-*
2630

2731
jobs:
2832
e2e-tests:
2933
runs-on: linux-amd64-cpu4
30-
#if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }}
31-
if: false # TODO: Disabled until e2e test infra is fixed
34+
if: ${{ github.event.workflow_run.conclusion == 'success' }} && ${{ github.event.workflow_run.event == 'push' }}
35+
# if: false # TODO: Disabled until e2e test infra is fixed
3236
steps:
3337
- name: Check out code
3438
uses: actions/checkout@v4
@@ -41,7 +45,7 @@ jobs:
4145
echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION := }" >> $GITHUB_ENV
4246
4347
- name: Set up Holodeck
44-
uses: NVIDIA/holodeck@v0.2.10
48+
uses: NVIDIA/holodeck@v0.2.7
4549
with:
4650
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
4751
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -53,7 +57,7 @@ jobs:
5357
with:
5458
go-version: ${{ env.GOLANG_VERSION }}
5559

56-
- name: Intall dependencies
60+
- name: Install Dependencies
5761
run: |
5862
sudo apt-get update
5963
sudo apt-get install -y make
@@ -70,7 +74,7 @@ jobs:
7074
./hack/e2e_tests.sh
7175
7276
- name: Archive test logs
73-
if: ${{ failure() }}
77+
if: ${{ always() }}
7478
uses: actions/upload-artifact@v4
7579
with:
7680
name: e2e-test-logs

.github/workflows/image.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ on:
2727
branches:
2828
- main
2929
- release-*
30+
- e2e-*
3031

3132
jobs:
3233
build:

api/apps/v1alpha1/nimcache_types.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,9 +303,9 @@ func (n *NIMCache) GetTolerations() []corev1.Toleration {
303303

304304
// GetNodeSelectors returns nodeselectors configured for the NIMCache Job.
305305
func (n *NIMCache) GetNodeSelectors() map[string]string {
306-
if n.Spec.NodeSelector == nil {
306+
/*if n.Spec.NodeSelector == nil {
307307
return map[string]string{"feature.node.kubernetes.io/pci-10de.present": "true"}
308-
}
308+
}*/
309309
return n.Spec.NodeSelector
310310
}
311311

internal/controller/nimcache_controller.go

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -608,47 +608,49 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
608608
logger.Error(err, "failed to create", "pod", pod.Name)
609609
return false, err
610610
}
611+
logger.Info("Created pod for model manifest extraction", "pod", pod.Name)
611612

612613
existingPod := &corev1.Pod{}
613614
err = r.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: nimCache.Namespace}, existingPod)
614615
if err != nil {
615616
logger.Error(err, "failed to get pod for model selection", "pod", pod.Name)
616-
return false, err
617+
return true, err
617618
}
618-
619+
logger.Info("HERE", "pod", pod.Name)
619620
if existingPod.Status.Phase != corev1.PodRunning {
620621
// requeue request with delay until the pod is ready
622+
logger.Info("HERE", "pod", existingPod.Status)
621623
return true, nil
622624
}
623-
625+
logger.Info("HERE1", "pod", pod.Name)
624626
// Extract manifest file
625627
output, err := k8sutil.GetPodLogs(ctx, existingPod, NIMCacheContainerName)
626628
if err != nil {
627629
logger.Error(err, "failed to get pod logs for parsing model manifest file", "pod", pod.Name)
628630
return false, err
629631
}
630-
632+
logger.Info("HERE2", "pod", pod.Name)
631633
if output == "" {
632634
logger.Info("Requeuing to wait for the manifest to be copied from the container")
633635
return true, nil
634636
}
635-
637+
logger.Info("HERE3", "pod", pod.Name)
636638
parser := nimparserutils.GetNIMParser([]byte(output))
637639
// Parse the file
638640
manifest, err := parser.ParseModelManifestFromRawOutput([]byte(output))
639641
if err != nil {
640642
logger.Error(err, "Failed to parse model manifest from the pod")
641643
return false, err
642644
}
643-
logger.V(2).Info("manifest file", "nimcache", nimCache.Name, "manifest", manifest)
644-
645+
logger.Info("manifest file", "nimcache", nimCache.Name, "manifest", manifest)
646+
logger.Info("HERE4", "pod", pod.Name)
645647
// Create a ConfigMap with the model manifest file for re-use
646648
err = r.createManifestConfigMap(ctx, nimCache, &manifest)
647649
if err != nil {
648650
logger.Error(err, "Failed to create model manifest config map")
649651
return false, err
650652
}
651-
653+
logger.Info("HERE5", "pod", pod.Name)
652654
// Model manifest is successfully extracted, cleanup temporary pod
653655
err = r.Delete(ctx, existingPod)
654656
if err != nil && !errors.IsNotFound(err) {
@@ -662,6 +664,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
662664

663665
func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error {
664666
logger := r.GetLogger()
667+
logger.Info("Reconciling model selection", "nimcache", nimCache.Name)
665668

666669
// reconcile model selection pod
667670
if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) {
@@ -707,7 +710,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
707710

708711
func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error {
709712
logger := r.GetLogger()
710-
713+
logger.Info("Reconciling job", "nimcache", nimCache.Name)
711714
// reconcile model caching job
712715
job := &batchv1.Job{}
713716
jobName := types.NamespacedName{Name: getJobName(nimCache), Namespace: nimCache.GetNamespace()}
@@ -857,7 +860,7 @@ func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *ap
857860
}
858861

859862
if requeue {
860-
logger.V(2).Info("requeueing for reconciliation for model selection", "pod", getPodName(nimCache))
863+
logger.Info("requeueing for reconciliation for model selection", "pod", getPodName(nimCache))
861864
return ctrl.Result{RequeueAfter: time.Second * 30}, err
862865
}
863866

test/e2e/data/nimcache.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,18 @@ kind: NIMCache
33
metadata:
44
labels:
55
app.kubernetes.io/name: k8s-nim-operator
6-
name: meta-llama3-8b-instruct
6+
name: meta-llama3-2-1b-instruct
77
spec:
88
source:
99
ngc:
10-
modelPuller: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
10+
modelPuller: nvcr.io/nim/meta/llama-3.2-1b-instruct:1.12.0
1111
pullSecret: ngc-secret
1212
authSecret: ngc-api-secret
1313
model:
1414
profiles: []
1515
lora: false
16-
precision: "fp16"
17-
engine: "tensorrt_llm"
18-
qosProfile: "throughput"
19-
gpus:
20-
- product: "A100"
21-
ids:
22-
- "20b2"
16+
precision: "bf16"
17+
engine: "vllm"
2318
tensorParallelism: "1"
2419
resources:
2520
cpu: 500m

test/e2e/data/nimservice.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
apiVersion: apps.nvidia.com/v1alpha1
22
kind: NIMService
33
metadata:
4-
name: meta-llama3-8b-instruct
4+
name: meta-llama3-2-1b-instruct
55
spec:
66
image:
7-
repository: nvcr.io/nim/meta/llama3-8b-instruct
8-
tag: 1.0.0
7+
repository: nvcr.io/nim/meta/llama-3.2-1b-instruct
8+
tag: 1.12.0
99
pullPolicy: IfNotPresent
1010
pullSecrets:
1111
- ngc-secret
1212
authSecret: ngc-api-secret
1313
storage:
1414
nimCache:
15-
name: meta-llama3-8b-instruct
15+
name: meta-llama3-2-1b-instruct
1616
profile: ''
1717
replicas: 1
1818
resources:

test/e2e/e2e_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ var (
106106
"namespaces",
107107
"deployments",
108108
"daemonsets",
109+
"jobs",
109110
}
110111

111112
// NEMO microservice variables.
@@ -325,10 +326,10 @@ func getTestEnv() {
325326
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
326327
Expect(ImageRepo).NotTo(BeEmpty(), "IMAGE_REPO must be set")
327328

328-
ImageTag = os.Getenv("E2E_IMAGE_TAG")
329+
ImageTag = "e2e-test" //os.Getenv("E2E_IMAGE_TAG")
329330
Expect(ImageTag).NotTo(BeEmpty(), "IMAGE_TAG must be set")
330331

331-
ImagePullPolicy = os.Getenv("E2E_IMAGE_PULL_POLICY")
332+
ImagePullPolicy = "Always" //os.Getenv("E2E_IMAGE_PULL_POLICY")
332333
Expect(ImagePullPolicy).NotTo(BeEmpty(), "IMAGE_PULL_POLICY must be set")
333334

334335
CollectLogsFrom = os.Getenv("COLLECT_LOGS_FROM")

test/e2e/infra/aws.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,6 @@ spec:
2929
kubernetes:
3030
install: true
3131
installer: kubeadm
32-
version: v1.32.1
32+
version: v1.32.3
33+
crictlVersion: v1.32.0
34+
calicoVersion: v3.29.3

test/e2e/nim-operator_test.go

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ import (
3131
helmValues "github.com/mittwald/go-helm-client/values"
3232
"helm.sh/helm/v3/pkg/repo"
3333
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
34+
clientset "k8s.io/client-go/kubernetes"
35+
kubernetes "k8s.io/client-go/kubernetes"
3436
"sigs.k8s.io/yaml"
3537

3638
"github.com/NVIDIA/k8s-test-infra/pkg/diagnostics"
@@ -63,6 +65,21 @@ var _ = Describe("NIM Operator", func() {
6365

6466
err = diagnosticsCollector.Collect(ctx)
6567
Expect(err).NotTo(HaveOccurred())
68+
69+
cli, err := versioned.NewForConfig(clientConfig)
70+
Expect(err).NotTo(HaveOccurred())
71+
72+
nimCacheObject, _ := cli.AppsV1alpha1().NIMCaches(testNamespace.Name).Get(ctx, "meta-llama3-8b-instruct", metav1.GetOptions{})
73+
fmt.Printf("NIMCache object: %#v\n", nimCacheObject)
74+
75+
clientSet, err = clientset.NewForConfig(clientConfig)
76+
Expect(err).NotTo(HaveOccurred())
77+
pods, err := clientSet.CoreV1().Pods(testNamespace.Name).List(ctx, metav1.ListOptions{})
78+
Expect(err).NotTo(HaveOccurred())
79+
for _, pod := range pods.Items {
80+
fmt.Println("Pod name:", pod.Name)
81+
}
82+
6683
}
6784
})
6885

@@ -84,9 +101,11 @@ var _ = Describe("NIM Operator", func() {
84101
values := helmValues.Options{
85102
Values: []string{
86103
fmt.Sprintf("operator.image.repository=%s", ImageRepo),
87-
fmt.Sprintf("operator.image.tag=%s", ImageTag),
104+
"operator.image.tag=e2e-test",
105+
//fmt.Sprintf("operator.image.tag=%s", ImageTag),
88106
fmt.Sprintf("operator.image.pullPolicy=%s", ImagePullPolicy),
89107
fmt.Sprintf("operator.image.pullSecrets={%s}", strings.Join(pullSecrets, ",")),
108+
"operator.admissionController.enabled=false",
90109
},
91110
}
92111

@@ -120,6 +139,9 @@ var _ = Describe("NIM Operator", func() {
120139
cli, err := versioned.NewForConfig(clientConfig)
121140
Expect(err).NotTo(HaveOccurred())
122141

142+
clientSet, err = clientset.NewForConfig(clientConfig)
143+
Expect(err).NotTo(HaveOccurred())
144+
123145
nimCache := &v1alpha1.NIMCache{}
124146
data, err := os.ReadFile(filepath.Join(cwd, "data", "nimcache.yml"))
125147
Expect(err).NotTo(HaveOccurred())
@@ -133,6 +155,16 @@ var _ = Describe("NIM Operator", func() {
133155
By("Checking the NIMCache object state is ready")
134156
Eventually(func() bool {
135157
nimCacheObject, _ := cli.AppsV1alpha1().NIMCaches(testNamespace.Name).Get(ctx, nimCache.Name, metav1.GetOptions{})
158+
fmt.Println("NIMCache object conditions:", nimCacheObject.Status.Conditions)
159+
fmt.Println("NIMCache object state:", nimCacheObject.Status.State)
160+
//pod, err := clientSet.CoreV1().Pods(testNamespace.Name).Get(ctx, fmt.Sprintf("%s-pod", nimCache.GetName()), metav1.GetOptions{})
161+
162+
err := DescribePod(clientSet.(*kubernetes.Clientset), testNamespace.Name, fmt.Sprintf("%s-pod", nimCache.GetName()))
163+
if err != nil {
164+
fmt.Println("Error:", err)
165+
}
166+
167+
fmt.Println("***************111")
136168
return nimCacheObject.Status.State == v1alpha1.NimCacheStatusReady
137169
}, Timeout, 5*time.Second).Should(BeTrue())
138170

@@ -377,9 +409,70 @@ func installEntitystoreDependencies() {
377409
CreateNamespace: false,
378410
Wait: true,
379411
WaitForJobs: true,
380-
Timeout: 10 * time.Minute,
412+
Timeout: 2 * time.Minute,
381413
CleanupOnFail: true,
382414
ValuesOptions: values}
383415
_, err = helmClient.InstallOrUpgradeChart(ctx, chartSpec, nil)
384416
Expect(err).NotTo(HaveOccurred())
385417
}
418+
419+
func DescribePod(clientset *kubernetes.Clientset, namespace, name string) error {
420+
ctx := context.Background()
421+
422+
// --- Get Pod
423+
pod, err := clientset.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
424+
if err != nil {
425+
return fmt.Errorf("failed to get pod: %w", err)
426+
}
427+
428+
fmt.Printf("Name: %s\n", pod.Name)
429+
fmt.Printf("Namespace: %s\n", pod.Namespace)
430+
fmt.Printf("Node: %s\n", pod.Spec.NodeName)
431+
fmt.Printf("Start Time: %s\n", pod.Status.StartTime)
432+
fmt.Printf("Phase: %s\n", pod.Status.Phase)
433+
434+
fmt.Println("\nConditions:")
435+
for _, c := range pod.Status.Conditions {
436+
fmt.Printf(" - Type=%s Status=%s Reason=%s\n", c.Type, c.Status, c.Reason)
437+
}
438+
439+
fmt.Println("\nContainers:")
440+
for _, cs := range pod.Status.ContainerStatuses {
441+
state := cs.State
442+
fmt.Printf(" * %s:\n", cs.Name)
443+
if state.Running != nil {
444+
fmt.Printf(" Running since %s\n", state.Running.StartedAt)
445+
}
446+
if state.Waiting != nil {
447+
fmt.Printf(" Waiting: %s (%s)\n", state.Waiting.Reason, state.Waiting.Message)
448+
}
449+
if state.Terminated != nil {
450+
fmt.Printf(" Terminated: %s at %s (exit %d)\n",
451+
state.Terminated.Reason, state.Terminated.FinishedAt, state.Terminated.ExitCode)
452+
}
453+
fmt.Printf(" Ready=%v Restarts=%d\n", cs.Ready, cs.RestartCount)
454+
}
455+
456+
// --- Get related Events
457+
events, err := clientset.CoreV1().Events(namespace).List(ctx, metav1.ListOptions{
458+
FieldSelector: fmt.Sprintf("involvedObject.kind=Pod,involvedObject.name=%s", pod.Name),
459+
})
460+
if err != nil {
461+
return fmt.Errorf("failed to get events: %w", err)
462+
}
463+
464+
fmt.Println("\nEvents:")
465+
if len(events.Items) == 0 {
466+
fmt.Println(" <none>")
467+
}
468+
for _, e := range events.Items {
469+
t := e.LastTimestamp.Time
470+
if t.IsZero() {
471+
t = e.EventTime.Time
472+
}
473+
fmt.Printf(" %s %s %s: %s\n",
474+
t.Format(time.RFC3339), e.Type, e.Reason, e.Message)
475+
}
476+
477+
return nil
478+
}

0 commit comments

Comments (0)