From a55574ac58284e8a02a97a29f699f409df038b5a Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 30 Sep 2025 16:51:15 -0700 Subject: [PATCH 1/4] Add logic to remove TPU requests from the main pod if TPU is requested by a service container --- packages/k8s/src/hooks/prepare-job.ts | 55 +++++++++++++++++++++------ 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/packages/k8s/src/hooks/prepare-job.ts b/packages/k8s/src/hooks/prepare-job.ts index f518592f..74ad4e60 100644 --- a/packages/k8s/src/hooks/prepare-job.ts +++ b/packages/k8s/src/hooks/prepare-job.ts @@ -5,6 +5,7 @@ import { JobContainerInfo, ContextPorts, PrepareJobArgs, + ServiceContainerInfo, writeToResponseFile } from 'hooklib' import path from 'path' @@ -71,17 +72,7 @@ export async function prepareJob( let services: k8s.V1Container[] = [] if (args.services?.length) { - generateServicesName(args.services) - services = args.services.map(service => { - core.debug(`Adding service '${service.image}' to pod definition`) - return createContainerSpec( - service, - generateContainerName(service.image), - false, - extension, - service.createOptions - ) - }) + processServiceContainers(args.services, container, extension) } if (!container && !services?.length) { @@ -152,6 +143,48 @@ export async function prepareJob( generateResponseFile(responseFile, args, createdPod, isAlpine) } +async function processServiceContainers( + services: ServiceContainerInfo[], + container?: k8s.V1Container, + extension?: k8s.V1PodTemplateSpec +) { + generateServicesName(services) + services = services.map(service => { + core.debug(`Adding service '${service.image}' to pod definition`) + return createContainerSpec( + service, + generateContainerName(service.image), + false, + extension, + service.createOptions + ) + }) + + // Only 1 container in a pod can request TPU's. + if ( + services.find( + service => + service.resources?.limits && service.resources.limits['google.com/tpu'] + ) + ) { + if ( + container?.resources?.requests && + container.resources.requests['google.com/tpu'] + ) { + core.debug( + 'removing tpu from main container resources request and limits as they are requested by the service container and only 1 container in a pod can request TPU.' + ) + delete container.resources.requests['google.com/tpu'] + if ( + container.resources.limits && + container.resources.limits['google.com/tpu'] + ) { + delete container.resources.limits['google.com/tpu'] + } + } + } +} + // Create JobSet and waits for it to come online async function prepareJobSet( args: PrepareJobArgs, From 03fd04d953df0d76c6fba6b80f49914075bf065c Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 30 Sep 2025 16:54:45 -0700 Subject: [PATCH 2/4] Add error message --- packages/k8s/src/hooks/prepare-job.ts | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/k8s/src/hooks/prepare-job.ts b/packages/k8s/src/hooks/prepare-job.ts index 74ad4e60..30bc3a7c 100644 --- a/packages/k8s/src/hooks/prepare-job.ts +++ b/packages/k8s/src/hooks/prepare-job.ts @@ -160,13 +160,18 @@ async function processServiceContainers( ) }) - // Only 1 container in a pod can request TPU's. - if ( - services.find( - service => - service.resources?.limits && service.resources.limits['google.com/tpu'] + const tpuRequestingContainers = services.filter( + service => + service.resources?.limits && service.resources.limits['google.com/tpu'] + ) + + if (tpuRequestingContainers.length > 1) { + throw new Error( + `${tpuRequestingContainers.length} containers request for TPU's. Only 1 container per pod can request for TPU's.` ) - ) { + } + + if (tpuRequestingContainers.length === 1) { if ( container?.resources?.requests && container.resources.requests['google.com/tpu'] From d57e40e856869296f50088f96bf6a88f086eed2c Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Tue, 30 Sep 2025 17:05:04 -0700 Subject: [PATCH 3/4] Fix lint --- packages/k8s/src/hooks/prepare-job.ts | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/packages/k8s/src/hooks/prepare-job.ts b/packages/k8s/src/hooks/prepare-job.ts index 30bc3a7c..2cb65147 100644 --- a/packages/k8s/src/hooks/prepare-job.ts +++ b/packages/k8s/src/hooks/prepare-job.ts @@ -70,10 +70,11 @@ export async function prepareJob( ) } - let services: k8s.V1Container[] = [] - if (args.services?.length) { - processServiceContainers(args.services, container, extension) - } + const services: k8s.V1Container[] = processServiceContainers( + args.services, + container, + extension + ) if (!container && !services?.length) { throw new Error('No containers exist, skipping hook invocation') @@ -143,13 +144,16 @@ export async function prepareJob( generateResponseFile(responseFile, args, createdPod, isAlpine) } -async function processServiceContainers( - services: ServiceContainerInfo[], +function processServiceContainers( + services?: ServiceContainerInfo[], container?: k8s.V1Container, extension?: k8s.V1PodTemplateSpec -) { +): k8s.V1Container[] { + if (!services?.length) { + return [] + } generateServicesName(services) - services = services.map(service => { + const serviceContainers = services.map(service => { core.debug(`Adding service '${service.image}' to pod definition`) return createContainerSpec( service, @@ -188,6 +192,7 @@ async function processServiceContainers( } } } + return serviceContainers } // Create JobSet and waits for it to come online From 86fcd39096b29a47f351533ab543632bfbfe76e6 Mon Sep 17 00:00:00 2001 From: Quoc Truong Date: Wed, 1 Oct 2025 10:49:21 -0700 Subject: [PATCH 4/4] Add tests --- packages/k8s/src/hooks/prepare-job.ts | 31 ++++++----- packages/k8s/tests/prepare-job-test.ts | 71 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 19 deletions(-) diff --git a/packages/k8s/src/hooks/prepare-job.ts b/packages/k8s/src/hooks/prepare-job.ts index 2cb65147..ea430ccf 100644 --- a/packages/k8s/src/hooks/prepare-job.ts +++ b/packages/k8s/src/hooks/prepare-job.ts @@ -144,7 +144,7 @@ export async function prepareJob( generateResponseFile(responseFile, args, createdPod, isAlpine) } -function processServiceContainers( +export function processServiceContainers( services?: ServiceContainerInfo[], container?: k8s.V1Container, extension?: k8s.V1PodTemplateSpec @@ -157,7 +157,7 @@ function processServiceContainers( core.debug(`Adding service '${service.image}' to pod definition`) return createContainerSpec( service, - generateContainerName(service.image), + service.name, false, extension, service.createOptions @@ -188,6 +188,7 @@ function processServiceContainers( container.resources.limits && container.resources.limits['google.com/tpu'] ) { + core.debug('removing tpu from main container resource limits') delete container.resources.limits['google.com/tpu'] } } @@ -399,11 +400,20 @@ export function createContainerSpec( } podContainer.env = [] - for (const [key, value] of Object.entries( - container['environmentVariables'] - )) { - if (value && key !== 'HOME') { - podContainer.env.push({ name: key, value: value as string }) + if (container['environmentVariables']) { + for (const [key, value] of Object.entries( + container['environmentVariables'] + )) { + if (value && key !== 'HOME') { + podContainer.env.push({ name: key, value: value as string }) + } + } + + if (!('CI' in container['environmentVariables'])) { + podContainer.env.push({ + name: 'CI', + value: 'true' + }) } } @@ -412,13 +422,6 @@ export function createContainerSpec( value: 'true' }) - if (!('CI' in container['environmentVariables'])) { - podContainer.env.push({ - name: 'CI', - value: 'true' - }) - } - podContainer.volumeMounts = containerVolumes( container.userMountVolumes, jobContainer diff --git a/packages/k8s/tests/prepare-job-test.ts b/packages/k8s/tests/prepare-job-test.ts index a9ee70a1..67901664 100644 --- a/packages/k8s/tests/prepare-job-test.ts +++ b/packages/k8s/tests/prepare-job-test.ts @@ -1,18 +1,20 @@ import * as fs from 'fs' import * as path from 'path' import { cleanupJob } from '../src/hooks' -import { createContainerSpec, prepareJob } from '../src/hooks/prepare-job' +import { + createContainerSpec, + prepareJob, + processServiceContainers +} from '../src/hooks/prepare-job' import { TestHelper } from './test-setup' import { ENV_HOOK_TEMPLATE_PATH, ENV_NUMBER_OF_HOSTS, ENV_USE_KUBE_SCHEDULER, - generateContainerName, - readExtensionFromFile + generateContainerName } from '../src/k8s/utils' -import { getEvents, getPodByName } from '../src/k8s' +import { getPodByName } from '../src/k8s' import { V1Container } from '@kubernetes/client-node' -import * as yaml from 'js-yaml' import { JOB_CONTAINER_NAME } from '../src/hooks/constants' jest.useRealTimers() @@ -324,3 +326,62 @@ describe('Prepare job', () => { } ) }) + +describe('processServiceContainers', () => { + it('generate names for service containers', () => { + expect( + processServiceContainers( + [ + { + image: 'gcr.io/server' + }, + { + image: 'gcr.io/server' + } + ], + { + name: 'nginx', + image: 'nginx:latest', + imagePullPolicy: 'IfNotPresent' + } as V1Container + ) + ).toEqual( + expect.arrayContaining([ + expect.objectContaining({ name: 'server' }), + expect.objectContaining({ name: 'server-1' }) + ]) + ) + }) + + it('generate TPU request for service containers', () => { + expect( + processServiceContainers( + [ + { + image: 'gcr.io/server', + createOptions: '--tpu=4' + } + ], + { + name: 'nginx', + image: 'nginx:latest', + imagePullPolicy: 'IfNotPresent' + } as V1Container + ) + ).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + name: 'server', + resources: { + limits: { + 'google.com/tpu': '4' + }, + requests: { + 'google.com/tpu': '4' + } + } + }) + ]) + ) + }) +})