@@ -65,12 +65,39 @@ properties([
6565 ])
6666])
6767
68- // CI image tag is intentionally NOT a job parameter: Jenkins persists the
69- // last-used parameter value and ignores subsequent defaultValue changes in the
70- // Jenkinsfile, so bumping the tag with a parameter required a manual "Build
71- // with Parameters" round-trip every time. Keep this as a plain script var so
72- // every commit that bumps the tag takes effect on the next /build comment.
73- def ciImage = ' gitlab-master.nvidia.com:5005/epeer/nova-test/nova-kernel-ci:2026-05-22-d62390752'
68+ // CI image is content-addressed from ci/image-manifest. Phase 1 below
69+ // computes the tag (sha256(manifest + listed inputs)[:12]), probes the
70+ // GitLab Container Registry for it, and either reuses the existing image
71+ // or kaniko-builds + pushes a new one. ciImage is assigned by that stage
72+ // and consumed by the Phase 2 podTemplate as the nova-ci container image.
73+ //
74+ // This replaces the previous "edit a hardcoded :YYYY-MM-DD-<sha> string"
75+ // scheme. Bumping a pin (or any listed input under ci/) automatically
76+ // invalidates the cache; the rebuild happens inline as the first stage
77+ // of the next CI run, without any separate "nova-ci-image" pipeline.
78+ String ciImage = ' '
79+
80+ // Pipeline-wide state shared across Phase 1 (jnlp + kaniko pod, for
81+ // image probe/build) and Phase 2 (jnlp + nova-ci pod, for the kernel
82+ // build + hardware test). All entries are plain Strings/booleans so
83+ // they survive CPS save points across the pod boundary.
84+ // buildDescription / prNum / cloneUrl / prState / repoName -- captured
85+ // in Phase 1's Get Token stage from githubData; reused for status
86+ // posts and the second checkout in Phase 2 (kernel tree). The PR
87+ // branch / merge ref selection logic is identical in both phases.
88+ // tokenAcquired -- gates the status post in the catch blocks: they
89+ // only post a "Failed" GitHub commit status if we actually obtained
90+ // a token; otherwise the status API call itself would fail and we'd
91+ // lose the original exception.
92+ // stageName -- updated as each stage enters; the catch block uses it
93+ // to report which stage died.
94+ String buildDescription = ' '
95+ String prNum = ' '
96+ String cloneUrl = ' '
97+ String prState = ' '
98+ String repoName = ' '
99+ boolean tokenAcquired = false
100+ def stageName = ' '
74101
75102def runUid = params. RUN_AS_UID ?. trim()
76103def runGid = params. RUN_AS_GID ?. trim()
@@ -84,6 +111,175 @@ if (!runUid || !runGid) {
84111// Inlining sidesteps the helper-method dispatch entirely. We also do not retain a GithubHelper
85112// instance in a long-lived local across save points (see comment inside the node block below).
86113
114+ // ============================================================================
115+ // Phase 1: image probe + optional rebuild.
116+ // ----------------------------------------------------------------------------
117+ // Pod is jnlp + kaniko only -- no nova-ci, because what nova-ci image to run
118+ // is exactly the question this phase answers. We do the Get Token + a sparse
119+ // checkout of ci/ here (the hash inputs all live under ci/, and the kernel
120+ // tree isn't needed yet). On a registry hit (the common case) this phase
121+ // completes in well under a minute and Phase 2 reuses the cached image. On
122+ // a miss we stage the build context on jnlp and kaniko-build + push the
123+ // new tag, then proceed to Phase 2 with it.
124+ //
125+ // Why a separate pod rather than nesting kaniko into the Phase 2 pod: a pod's
126+ // container images are fixed at allocation time, so we can't allocate the
127+ // heavy nova-ci pod until we know what tag to pull. Two pods sequentially is
128+ // the cleanest way to break the chicken-and-egg.
129+ // ============================================================================
// Phase 1 pod (jnlp + kaniko containers): capture PR metadata, sparse-checkout ci/,
// then resolve ciImage through ci/imageBuild.groovy. Any failure is reported and rethrown.
130+ podTemplate(
131+ cloud : ' sc-ipp-blossom-prod' ,
132+ yaml : """
133+ apiVersion: v1
134+ kind: Pod
135+ spec:
136+ nodeSelector:
137+ kubernetes.io/os: "linux"
138+ containers:
139+ - name: jnlp
140+ command: ["/bin/sh", "-c"]
141+ args: ["umask 0002 && exec /usr/local/bin/jenkins-agent"]
142+ securityContext:
143+ runAsGroup: ${ runGid.toInteger()}
144+ resources:
145+ requests:
146+ memory: 1Gi
147+ cpu: 500m
148+ limits:
149+ memory: 2Gi
150+ - name: kaniko
151+ image: gcr.io/kaniko-project/executor:v1.23.0-debug
152+ imagePullPolicy: IfNotPresent
153+ command: ["/busybox/sh", "-c", "sleep infinity"]
154+ tty: true
155+ resources:
156+ requests:
157+ memory: 2Gi
158+ cpu: "1"
159+ limits:
160+ # kaniko's memory usage scales with layer size; the buildroot
161+ # pre-build creates a ~6 GiB writable layer (host tools + target
162+ # rootfs staging). 16 GiB is comfortably above that with room
163+ # for the snapshotter's working set.
164+ memory: 16Gi
165+ cpu: "8"
166+ """
167+ ) {
168+ node(POD_LABEL ) {
169+ try {
 // Copy the PR metadata out of the helper into the script-scope variables.
 // The helper `h` stays local to this closure and is not retained.
170+ stage(' Get Token' ) {
171+ withCredentials([usernamePassword(
172+ credentialsId : ' github-token' ,
173+ passwordVariable : ' GIT_PASSWORD' ,
174+ usernameVariable : ' GIT_USERNAME'
175+ )]) {
176+ def h = GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData)
177+ buildDescription = h. getBuildDescription()
178+ prNum = h. getPRNumber(). toString()
179+ cloneUrl = h. getCloneUrl()
180+ prState = h. getPRState()
181+ repoName = h. getRepoName()
 // Set last, so the flag is true only if every getter above returned.
182+ tokenAcquired = true
183+ }
184+ }
185+
186+ currentBuild. description = buildDescription
187+
 // stageName is assigned before each stage so the catch block below can
 // name the failing stage in its status post.
188+ stageName = ' Code checkout'
189+ stage(stageName) {
190+ echo " DIAG: entered ${ stageName} body (Phase 1, sparse ci/ only)"
191+ withCredentials([usernamePassword(
192+ credentialsId : ' github-token' ,
193+ passwordVariable : ' GIT_PASSWORD' ,
194+ usernameVariable : ' GIT_USERNAME'
195+ )]) {
196+ GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData). updateCommitStatus(" ${ BUILD_URL} " , " ${ stageName} Running" , GitHubCommitState . PENDING )
197+ }
198+ echo " DIAG: posted ${ stageName} Running status; preparing shallow sparse checkout"
199+ echo " DIAG: prNum=${ prNum} prState=${ prState} "
200+ // Sparse-checkout `ci/` only. The hash inputs and the kaniko
201+ // build context all live under ci/; the kernel tree is huge
202+ // and not needed in this pod. Phase 2's checkout is the full
203+ // tree.
 // Shared by both branches below: depth-1 shallow clone without tags,
 // restricted to the ci/ path.
204+ def cloneExtensions = [
205+ [$class : ' CloneOption' , shallow : true , depth : 1 , noTags : true , honorRefspec : true , timeout : 30 ],
206+ [$class : ' SparseCheckoutPaths' , sparseCheckoutPaths : [[path : ' ci/' ]]],
207+ ]
 // Open PR: fetch refs/pull/<n>/head. Merged PR: fetch refs/pull/<n>/merge.
 // Both are mapped to the same local ref, origin/pr/<n>, which is the branch built.
208+ if (' Open' . equalsIgnoreCase(prState)) {
209+ checkout changelog : true , poll : false , scm : [
210+ $class : ' GitSCM' ,
211+ branches : [[name : " refs/remotes/origin/pr/${ prNum} " ]],
212+ extensions : cloneExtensions,
213+ userRemoteConfigs : [[
214+ credentialsId : ' github-token' ,
215+ url : cloneUrl,
216+ refspec : " +refs/pull/${ prNum} /head:refs/remotes/origin/pr/${ prNum} "
217+ ]]
218+ ]
219+ } else if (' Merged' . equalsIgnoreCase(prState)) {
220+ checkout changelog : true , poll : false , scm : [
221+ $class : ' GitSCM' ,
222+ branches : [[name : " refs/remotes/origin/pr/${ prNum} " ]],
223+ extensions : cloneExtensions,
224+ userRemoteConfigs : [[
225+ credentialsId : ' github-token' ,
226+ url : cloneUrl,
227+ refspec : " +refs/pull/${ prNum} /merge:refs/remotes/origin/pr/${ prNum} "
228+ ]]
229+ ]
230+ } else {
 // Any other PR state aborts the build here; the catch block below reports it.
231+ error(" PR state '${ prState} ' is neither Open nor Merged; nothing to check out." )
232+ }
233+ echo " DIAG: Phase 1 checkout returned"
234+ }
235+
236+ stageName = ' Ensure agent image'
237+ stage(stageName) {
238+ withCredentials([usernamePassword(
239+ credentialsId : ' github-token' ,
240+ passwordVariable : ' GIT_PASSWORD' ,
241+ usernameVariable : ' GIT_USERNAME'
242+ )]) {
243+ GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData). updateCommitStatus(" ${ BUILD_URL} " , " ${ stageName} Running" , GitHubCommitState . PENDING )
244+ }
 // The script is loaded from the workspace populated by the checkout stage above.
 // ensureImage is called with an empty map and its return value becomes ciImage.
 // NOTE(review): ensureImage's implementation is not visible here; presumably it
 // performs the probe/build described in the Phase 1 header -- confirm.
245+ def imageBuild = load ' ci/imageBuild.groovy'
246+ ciImage = imageBuild. ensureImage([:])
247+ echo " Phase 1 resolved ciImage = ${ ciImage} "
248+ }
249+ } catch (Exception ex) {
 // Mark the build failed, post a "<stage> Failed" status if a token was obtained,
 // then rethrow the original exception.
250+ currentBuild. result = ' FAILURE'
251+ echo " DIAG-catch (Phase 1): ${ ex} "
252+ if (tokenAcquired) {
 // Best effort: an error while posting the status is only logged, so it
 // cannot replace `ex`.
253+ try {
254+ withCredentials([usernamePassword(
255+ credentialsId : ' github-token' ,
256+ passwordVariable : ' GIT_PASSWORD' ,
257+ usernameVariable : ' GIT_USERNAME'
258+ )]) {
259+ GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData). updateCommitStatus(" ${ BUILD_URL} " , " ${ stageName} Failed" , GitHubCommitState . FAILURE )
260+ echo " DIAG: posted ${ stageName} Failed status (Phase 1)"
261+ }
262+ } catch (Exception ignored) {
263+ echo " Could not update GitHub status: ${ ignored} "
264+ }
265+ }
266+ throw ex
267+ }
268+ }
269+ }
270+
// Guard between the two pods: refuse to allocate the Phase 2 pod without an image tag.
// A null, empty, or whitespace-only ciImage all count as "unresolved"; plain Groovy
// truth (`!ciImage`) would let a blank-but-non-empty string through.
271+ if (! ciImage?.trim()) {
272+ error(' Phase 1 did not resolve a ciImage; refusing to start Phase 2.' )
273+ }
274+
275+ // ============================================================================
276+ // Phase 2: kernel build + hardware test.
277+ // ----------------------------------------------------------------------------
278+ // Heavy pod: jnlp + nova-ci@${ciImage} + the NFS scratch mount for ccache.
279+ // We re-checkout the full PR tree here (Phase 1's sparse checkout only
280+ // had ci/). No GitHub status post for the re-checkout: from the PR's
281+ // perspective, Code checkout already passed in Phase 1.
282+ // ============================================================================
87283podTemplate(
88284 cloud : ' sc-ipp-blossom-prod' ,
89285 yaml : """
@@ -139,49 +335,24 @@ spec:
139335 ]
140336) {
141337 node(POD_LABEL ) {
142- // PR/commit metadata captured as plain strings during Get Token. We deliberately do NOT
143- // keep a GithubHelper instance in a long-lived local: the helper holds GHCommitStatus
144- // references (returned by updateCommitStatus) which are not Serializable, and CPS saves
145- // the program state on every step boundary -- a single live helper local causes
146- // NotSerializableException: org.kohsuke.github.GHCommitStatus on the next save.
147- String buildDescription = ' '
148- String prNum = ' '
149- String cloneUrl = ' '
150- String prState = ' '
151- String repoName = ' '
152- boolean tokenAcquired = false
153- def stageName = ' '
154-
155- stage(' Get Token' ) {
156- withCredentials([usernamePassword(
157- credentialsId : ' github-token' ,
158- passwordVariable : ' GIT_PASSWORD' ,
159- usernameVariable : ' GIT_USERNAME'
160- )]) {
161- def h = GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData)
162- buildDescription = h. getBuildDescription()
163- prNum = h. getPRNumber(). toString()
164- cloneUrl = h. getCloneUrl()
165- prState = h. getPRState()
166- repoName = h. getRepoName()
167- tokenAcquired = true
168- }
169- }
338+ // PR/commit metadata + tokenAcquired/stageName/ciImage are hoisted to
339+ // script scope (above the Phase 1 podTemplate) so both pods see the
340+ // same values; we deliberately do NOT keep a GithubHelper instance
341+ // in a long-lived local because the helper holds GHCommitStatus
342+ // references (returned by updateCommitStatus) which are not
343+ // Serializable -- CPS saves program state on every step boundary
344+ // and a single live helper local causes NotSerializableException:
345+ // org.kohsuke.github.GHCommitStatus on the next save.
170346
171347 try {
172- currentBuild. description = buildDescription
173-
174- stageName = ' Code checkout'
175- stage(stageName) {
176- echo " DIAG: entered ${ stageName} body"
177- withCredentials([usernamePassword(
178- credentialsId : ' github-token' ,
179- passwordVariable : ' GIT_PASSWORD' ,
180- usernameVariable : ' GIT_USERNAME'
181- )]) {
182- GithubHelper . getInstance(" ${ GIT_PASSWORD} " , githubData). updateCommitStatus(" ${ BUILD_URL} " , " ${ stageName} Running" , GitHubCommitState . PENDING )
183- }
184- echo " DIAG: posted ${ stageName} Running status; preparing shallow checkout"
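348+ // Phase 2 workspace setup: re-checkout the full PR tree (Phase 1
349+ // only had ci/ via sparse-checkout). No status post here -- from
350+ // the PR's perspective, "Code checkout" already succeeded in
351+ // Phase 1; this is an implementation detail of the two-pod
352+ // arrangement. We use the same prState branching so the merge
353+ // commit / PR head choice stays consistent across phases.
+ // NOTE(review): stageName is not updated for this stage, so it still
+ // holds Phase 1's last value ('Ensure agent image') if this checkout
+ // throws -- confirm the Phase 2 catch block reports the right stage.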
354+ stage(' Workspace setup' ) {
355+ echo " DIAG: Phase 2 full checkout (no sparse filter)"
185356 echo " DIAG: prNum=${ prNum} prState=${ prState} "
186357 def cloneExtensions = [
187358 [$class : ' CloneOption' , shallow : true , depth : 1 , noTags : true , honorRefspec : true , timeout : 30 ],
@@ -211,7 +382,7 @@ spec:
211382 } else {
212383 error(" PR state '${ prState} ' is neither Open nor Merged; nothing to check out." )
213384 }
214- echo " DIAG: checkout returned"
385+ echo " DIAG: Phase 2 checkout returned"
215386 }
216387
217388 stageName = ' Build'
0 commit comments