diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 166fe831a4a2..64328c27b732 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -134,6 +134,37 @@ SLURM_INFRA_SINGLE_RETRY_PATTERNS = [ // Maximum number of retries for infrastructure failures (total attempts = SLURM_INFRA_RETRY_MAX + 1) SLURM_INFRA_RETRY_MAX = 2 +// Infrastructure failure patterns specific to K8s test pods (the path that does +// not go through SLURM). Passed as `extraInfraPatterns` to classifyInfraFailure +// from the runKubernetesPodWithInfraRetry retry loop. SLURM_INFRA_FAILURE_PATTERNS +// already covers shared symptoms (ChannelClosedException, marked offline, +// Reason: Evicted, etc.) and is always checked first. +K8S_INFRA_FAILURE_PATTERNS = [ + // Image pull / pod startup + "ImagePullBackOff", + "ErrImagePull", + // Container runtime hiccups + "OCI runtime exec failed", + // Pod / node lifecycle + "OOMKilled", + "node status is not ready", + // JNLP agent disconnect (trailing space narrows the match) + "Cannot contact ", + // JNLP / HTTP-handshake transient (broad string -- single-retry-only below) + "Connection failed", +] + +// K8s patterns capped at a single retry. Mirrors the SLURM list's caveat: +// every entry here must also appear in K8S_INFRA_FAILURE_PATTERNS. +K8S_INFRA_SINGLE_RETRY_PATTERNS = [ + "OOMKilled", // resource shortage; multi-retry rarely helps + "Connection failed", // short string; cap to bound false-positive cost +] + +// Kept distinct from SLURM_INFRA_RETRY_MAX so the two paths can be tuned +// independently as production telemetry comes in. 
+K8S_INFRA_RETRY_MAX = 2 + // ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false @@ -142,18 +173,24 @@ COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/nul /** * Checks if an exception represents a transient infrastructure failure - * that warrants retrying the Slurm job. + * that warrants retrying. * * Walks the exception cause chain to catch wrapped exceptions (e.g., * AbortException wrapping ChannelClosedException). * + * Callers may pass additional pattern lists (e.g., K8S_INFRA_FAILURE_PATTERNS) + * to extend classification for path-specific symptoms without disturbing the + * SLURM defaults. + * * @param ex The caught exception + * @param extraInfraPatterns Additional patterns appended to the infra-failure list + * @param extraSingleRetryPatterns Additional patterns appended to the single-retry list * @return A map with keys: * - isInfraFailure (boolean): true if this is a retryable infra failure * - isSingleRetryOnly (boolean): true if this pattern should only retry once * - matchedPattern (String): the pattern that matched, for logging */ -def classifyInfraFailure(Exception ex) { +def classifyInfraFailure(Exception ex, List extraInfraPatterns=[], List extraSingleRetryPatterns=[]) { def result = [isInfraFailure: false, isSingleRetryOnly: false, matchedPattern: ""] // Build the full exception text by walking the cause chain @@ -165,8 +202,8 @@ def classifyInfraFailure(Exception ex) { } def lowerText = exceptionText.toLowerCase() - // Check against infrastructure failure patterns - for (pattern in SLURM_INFRA_FAILURE_PATTERNS) { + // Check against infrastructure failure patterns (SLURM defaults + caller extras) + for (pattern in (SLURM_INFRA_FAILURE_PATTERNS + extraInfraPatterns)) { if (lowerText.contains(pattern.toLowerCase())) { 
result.isInfraFailure = true result.matchedPattern = pattern @@ -179,7 +216,7 @@ def classifyInfraFailure(Exception ex) { } // Check if this is a single-retry-only pattern - for (pattern in SLURM_INFRA_SINGLE_RETRY_PATTERNS) { + for (pattern in (SLURM_INFRA_SINGLE_RETRY_PATTERNS + extraSingleRetryPatterns)) { if (lowerText.contains(pattern.toLowerCase())) { result.isSingleRetryOnly = true break @@ -812,7 +849,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p }, { // If the execution test list is null, remove the test result xml sh """ - ls -all ${stageName}/ + ls -al ${stageName}/ if ! grep -q ' 1) { + echo "[INFRA-RETRY] ${stageName}: Starting attempt ${attempt} of ${K8S_INFRA_RETRY_MAX + 1}" + } + // Attempt 1 keeps the caller-supplied postTag verbatim so the + // canonical artifact name is unchanged for downstream consumers. + // Retries append "-attempt-N" to dodge the upload-once guard and + // preserve every attempt's tarball in Artifactory. + def attemptTag = (attempt == 1) ? "" : "-attempt-${attempt}" + // isFinalAttempt uses the worst-case retry budget. Single-retry-only + // patterns terminate early at attempt 2, in which case the synthetic + // stage-fail XML is suppressed even though the attempt is effectively + // final. The tar still uploads to Artifactory for forensics and the + // Jenkins build itself surfaces as failed; this is an acceptable + // edge case to keep the helper's pre-call decision simple. 
+ boolean isFinalAttempt = (attempt > K8S_INFRA_RETRY_MAX) + trtllm_utils.launchKubernetesPod(pipeline, podSpec, containerName, { runner(attemptTag, isFinalAttempt) }) + if (attempt > 1) { + echo "[INFRA-RETRY] ${stageName}: Succeeded on attempt ${attempt}" + } + return + } catch (InterruptedException e) { + // User abort / pipeline timeout -- never retry + throw e + } catch (Exception e) { + // FlowInterruptedException may not extend InterruptedException in all Jenkins versions + if (e.toString().contains("FlowInterruptedException") || + e.toString().contains("AbortException: script returned exit code 143")) { + throw e + } + + def classification = classifyInfraFailure(e, K8S_INFRA_FAILURE_PATTERNS, K8S_INFRA_SINGLE_RETRY_PATTERNS) + + if (!classification.isInfraFailure) { + // Not an infrastructure failure (test failure, compilation error, etc.) + throw e + } + + def effectiveMax = classification.isSingleRetryOnly ? 1 : K8S_INFRA_RETRY_MAX + + if (attempt > effectiveMax) { + echo "[INFRA-RETRY] ${stageName}: Infrastructure failure (${classification.matchedPattern}) " + + "but max retries (${effectiveMax}) exhausted after ${attempt} attempts. Failing." + throw e + } + + echo "[INFRA-RETRY] ${stageName}: Infrastructure failure detected on attempt ${attempt}: " + + "${classification.matchedPattern}" + echo "[INFRA-RETRY] ${stageName}: Exception: ${e.toString()}" + echo "[INFRA-RETRY] ${stageName}: Will retry (attempt ${attempt + 1} of ${effectiveMax + 1}) after 60s cooldown." 
+ + sleep(60) + } + } +} + def buildStageConfigs(stageName, platform, testlist, testCount, gpuCount, nodeCount, runWithSbatch=false) { def configs = [:] for (int k = 1; k <= testCount; k++) { @@ -3399,7 +3547,7 @@ def launchTestJobs(pipeline, testFilter) "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4], ] - parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), { + parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), { attemptTag, isFinalAttempt -> def config = VANILLA_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG @@ -3407,7 +3555,7 @@ def launchTestJobs(pipeline, testFilter) if (key.contains("llvm")) { config = LLVM_CONFIG } - runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3]) + runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], false, "cp312", attemptTag, false, isFinalAttempt) }]]} fullSet = parallelJobs.keySet() @@ -3459,7 +3607,10 @@ def launchTestJobs(pipeline, testFilter) ] fullSet += x86SlurmTestConfigs.keySet() - parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), { + parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), { attemptTag, isFinalAttempt -> + // attemptTag/isFinalAttempt come from runKubernetesPodWithInfraRetry + // for the outer dispatcher pod; the inner SLURM job runs its own + // retry loop (runLLMTestlistOnSlurm) so we don't thread these through. 
def config = VANILLA_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG @@ -3667,12 +3818,14 @@ def launchTestJobs(pipeline, testFilter) fullSet += multiNodesSBSAConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { - parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), { - runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3]) + parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), { attemptTag, isFinalAttempt -> + runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3], false, "cp312", attemptTag, false, isFinalAttempt) }]]} // Add SBSA Slurm jobs - parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { + parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { attemptTag, isFinalAttempt -> + // SLURM dispatchers run their own retry loop (runLLMTestlistOnSlurm); + // the outer pod-level retry args are accepted but unused here. 
def config = LINUX_AARCH64_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG @@ -3685,7 +3838,7 @@ def launchTestJobs(pipeline, testFilter) parallelJobs += parallelSlurmJobs // Add SBSA multi node Slurm jobs - parallelMultiNodesSBSAJobs = multiNodesSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { + parallelMultiNodesSBSAJobs = multiNodesSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), { attemptTag, isFinalAttempt -> def config = LINUX_AARCH64_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG @@ -3713,9 +3866,12 @@ def launchTestJobs(pipeline, testFilter) docBuildConfigs = [:] } - docBuildJobs = docBuildConfigs.collectEntries{key, values -> [key, [values[0], { + docBuildJobs = docBuildConfigs.collectEntries{key, values -> [key, [values[0], { attemptTag, isFinalAttempt -> + // attemptTag uniquifies the upload-once guard key and tar filename per + // pod-launch attempt; isFinalAttempt suppresses synthetic stage-fail XML + // and junit() on intermediate retryable infra failures. stage("[${key}] Run") { - cacheErrorAndUploadResult("${key}", values[1], {}, true) + cacheErrorAndUploadResult("${key}", values[1], {}, true, attemptTag, isFinalAttempt) } }]]} @@ -3843,7 +3999,7 @@ def launchTestJobs(pipeline, testFilter) if (checkPipStage) { stage("Run LLMAPI Test") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) - trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { + runKubernetesPodWithInfraRetry(pipeline, pipInstallSanitySpec, "trt-llm", toStageName(values[1], key), { attemptTag, isFinalAttempt -> echo "###### Prerequisites Start ######" echoNodeAndGpuInfo(pipeline, toStageName(values[1], key)) // Clean up the pip constraint file from the base NGC PyTorch image. 
@@ -3898,7 +4054,7 @@ def launchTestJobs(pipeline, testFilter) } withEnv(libEnv) { sh "env | sort" - runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, toStageName(values[1], key), 1, 1, true, cpver, "-SubJob-RunTest", true) + runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, toStageName(values[1], key), 1, 1, true, cpver, "-SubJob-RunTest" + attemptTag, true, isFinalAttempt) } }) } @@ -4055,8 +4211,8 @@ def launchTestJobs(pipeline, testFilter) echo "Skip - Passed in the previous pipelines." } } else if (values instanceof List) { - trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", { - values[1]() + runKubernetesPodWithInfraRetry(pipeline, values[0], "trt-llm", key, { attemptTag, isFinalAttempt -> + values[1](attemptTag, isFinalAttempt) }) } else { values() @@ -4129,10 +4285,10 @@ def launchTestJobsForImagesSanityCheck(pipeline, globalVars) { stage(values.name) { echo "Run ${values.name} sanity test." imageSanitySpec = createKubernetesPodConfig(values.image, values.gpuType, values.k8sArch) - trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", { + runKubernetesPodWithInfraRetry(pipeline, imageSanitySpec, "trt-llm", values.name, { attemptTag, isFinalAttempt -> sh "env | sort" trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y git rsync curl") - runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name, 1, 1, true, null, "-SubJob-TestImage", true) + runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name, 1, 1, true, null, "-SubJob-TestImage" + attemptTag, true, isFinalAttempt) }) } } else {