Skip to content

Commit f657bf1

Browse files
authored
Remove TPU's requests and limits from the main container if a service container is requesting for it (#41)
* Add logic to remove TPU requests from the main pod if TPU is requested by a service container * Add error message * Fix lint * Add tests
1 parent 1b8351c commit f657bf1

File tree

2 files changed

+138
-31
lines changed

2 files changed

+138
-31
lines changed

packages/k8s/src/hooks/prepare-job.ts

Lines changed: 72 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import {
55
JobContainerInfo,
66
ContextPorts,
77
PrepareJobArgs,
8+
ServiceContainerInfo,
89
writeToResponseFile
910
} from 'hooklib'
1011
import path from 'path'
@@ -69,20 +70,11 @@ export async function prepareJob(
6970
)
7071
}
7172

72-
let services: k8s.V1Container[] = []
73-
if (args.services?.length) {
74-
generateServicesName(args.services)
75-
services = args.services.map(service => {
76-
core.debug(`Adding service '${service.image}' to pod definition`)
77-
return createContainerSpec(
78-
service,
79-
generateContainerName(service.image),
80-
false,
81-
extension,
82-
service.createOptions
83-
)
84-
})
85-
}
73+
const services: k8s.V1Container[] = processServiceContainers(
74+
args.services,
75+
container,
76+
extension
77+
)
8678

8779
if (!container && !services?.length) {
8880
throw new Error('No containers exist, skipping hook invocation')
@@ -152,6 +144,58 @@ export async function prepareJob(
152144
generateResponseFile(responseFile, args, createdPod, isAlpine)
153145
}
154146

147+
export function processServiceContainers(
148+
services?: ServiceContainerInfo[],
149+
container?: k8s.V1Container,
150+
extension?: k8s.V1PodTemplateSpec
151+
): k8s.V1Container[] {
152+
if (!services?.length) {
153+
return []
154+
}
155+
generateServicesName(services)
156+
const serviceContainers = services.map(service => {
157+
core.debug(`Adding service '${service.image}' to pod definition`)
158+
return createContainerSpec(
159+
service,
160+
service.name,
161+
false,
162+
extension,
163+
service.createOptions
164+
)
165+
})
166+
167+
const tpuRequestingContainers = services.filter(
168+
service =>
169+
service.resources?.limits && service.resources.limits['google.com/tpu']
170+
)
171+
172+
if (tpuRequestingContainers.length > 1) {
173+
throw new Error(
174+
`${tpuRequestingContainers.length} containers request for TPU's. Only 1 container per pod can request for TPU's.`
175+
)
176+
}
177+
178+
if (tpuRequestingContainers.length === 1) {
179+
if (
180+
container?.resources?.requests &&
181+
container.resources.requests['google.com/tpu']
182+
) {
183+
core.debug(
184+
'removing tpu from main container resources request and limits as they are requested by the service container and only 1 container in a pod can request TPU.'
185+
)
186+
delete container.resources.requests['google.com/tpu']
187+
if (
188+
container.resources.limits &&
189+
container.resources.limits['google.com/tpu']
190+
) {
191+
core.debug('removing tpu from main container resource limits')
192+
delete container.resources.limits['google.com/tpu']
193+
}
194+
}
195+
}
196+
return serviceContainers
197+
}
198+
155199
// Create JobSet and waits for it to come online
156200
async function prepareJobSet(
157201
args: PrepareJobArgs,
@@ -356,11 +400,20 @@ export function createContainerSpec(
356400
}
357401

358402
podContainer.env = []
359-
for (const [key, value] of Object.entries(
360-
container['environmentVariables']
361-
)) {
362-
if (value && key !== 'HOME') {
363-
podContainer.env.push({ name: key, value: value as string })
403+
if (container['environmentVariables']) {
404+
for (const [key, value] of Object.entries(
405+
container['environmentVariables']
406+
)) {
407+
if (value && key !== 'HOME') {
408+
podContainer.env.push({ name: key, value: value as string })
409+
}
410+
}
411+
412+
if (!('CI' in container['environmentVariables'])) {
413+
podContainer.env.push({
414+
name: 'CI',
415+
value: 'true'
416+
})
364417
}
365418
}
366419

@@ -369,13 +422,6 @@ export function createContainerSpec(
369422
value: 'true'
370423
})
371424

372-
if (!('CI' in container['environmentVariables'])) {
373-
podContainer.env.push({
374-
name: 'CI',
375-
value: 'true'
376-
})
377-
}
378-
379425
podContainer.volumeMounts = containerVolumes(
380426
container.userMountVolumes,
381427
jobContainer

packages/k8s/tests/prepare-job-test.ts

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
import * as fs from 'fs'
22
import * as path from 'path'
33
import { cleanupJob } from '../src/hooks'
4-
import { createContainerSpec, prepareJob } from '../src/hooks/prepare-job'
4+
import {
5+
createContainerSpec,
6+
prepareJob,
7+
processServiceContainers
8+
} from '../src/hooks/prepare-job'
59
import { TestHelper } from './test-setup'
610
import {
711
ENV_HOOK_TEMPLATE_PATH,
812
ENV_NUMBER_OF_HOSTS,
913
ENV_USE_KUBE_SCHEDULER,
10-
generateContainerName,
11-
readExtensionFromFile
14+
generateContainerName
1215
} from '../src/k8s/utils'
13-
import { getEvents, getPodByName } from '../src/k8s'
16+
import { getPodByName } from '../src/k8s'
1417
import { V1Container } from '@kubernetes/client-node'
15-
import * as yaml from 'js-yaml'
1618
import { JOB_CONTAINER_NAME } from '../src/hooks/constants'
1719

1820
jest.useRealTimers()
@@ -324,3 +326,62 @@ describe('Prepare job', () => {
324326
}
325327
)
326328
})
329+
330+
describe('processServiceContainers', () => {
331+
it('generate names for service containers', () => {
332+
expect(
333+
processServiceContainers(
334+
[
335+
{
336+
image: 'gcr.io/server'
337+
},
338+
{
339+
image: 'gcr.io/server'
340+
}
341+
],
342+
{
343+
name: 'nginx',
344+
image: 'nginx:latest',
345+
imagePullPolicy: 'IfNotPresent'
346+
} as V1Container
347+
)
348+
).toEqual(
349+
expect.arrayContaining([
350+
expect.objectContaining({ name: 'server' }),
351+
expect.objectContaining({ name: 'server-1' })
352+
])
353+
)
354+
})
355+
356+
it('generate TPU request for service containers', () => {
357+
expect(
358+
processServiceContainers(
359+
[
360+
{
361+
image: 'gcr.io/server',
362+
createOptions: '--tpu=4'
363+
}
364+
],
365+
{
366+
name: 'nginx',
367+
image: 'nginx:latest',
368+
imagePullPolicy: 'IfNotPresent'
369+
} as V1Container
370+
)
371+
).toEqual(
372+
expect.arrayContaining([
373+
expect.objectContaining({
374+
name: 'server',
375+
resources: {
376+
limits: {
377+
'google.com/tpu': '4'
378+
},
379+
requests: {
380+
'google.com/tpu': '4'
381+
}
382+
}
383+
})
384+
])
385+
)
386+
})
387+
})

0 commit comments

Comments
 (0)