Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 57 additions & 14 deletions packages/k8s/src/hooks/prepare-job.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import {
JobContainerInfo,
ContextPorts,
PrepareJobArgs,
ServiceContainerInfo,
writeToResponseFile
} from 'hooklib'
import path from 'path'
Expand Down Expand Up @@ -69,20 +70,11 @@ export async function prepareJob(
)
}

let services: k8s.V1Container[] = []
if (args.services?.length) {
generateServicesName(args.services)
services = args.services.map(service => {
core.debug(`Adding service '${service.image}' to pod definition`)
return createContainerSpec(
service,
generateContainerName(service.image),
false,
extension,
service.createOptions
)
})
}
const services: k8s.V1Container[] = processServiceContainers(
args.services,
container,
extension
)

if (!container && !services?.length) {
throw new Error('No containers exist, skipping hook invocation')
Expand Down Expand Up @@ -152,6 +144,57 @@ export async function prepareJob(
generateResponseFile(responseFile, args, createdPod, isAlpine)
}

/**
 * Builds the pod container specs for the job's service containers and makes
 * sure that at most one container in the pod asks for TPUs.
 *
 * When exactly one service declares a `google.com/tpu` limit, the TPU entry is
 * removed from the main job container's resource `requests` and `limits`, so
 * that the service container is the only one in the pod claiming TPUs.
 * NOTE: `container` is mutated in place when that happens.
 *
 * @param services - service container definitions from the hook arguments;
 *   `undefined` or an empty array yields an empty result.
 * @param container - the main job container spec, if one was created.
 * @param extension - optional pod template passed through to
 *   `createContainerSpec` for every service.
 * @returns one container spec per entry in `services`, in the same order.
 * @throws Error when more than one service declares a `google.com/tpu` limit.
 */
function processServiceContainers(
  services?: ServiceContainerInfo[],
  container?: k8s.V1Container,
  extension?: k8s.V1PodTemplateSpec
): k8s.V1Container[] {
  if (!services?.length) {
    return []
  }

  // Extended resource name used to claim TPUs.
  const tpuResource = 'google.com/tpu'

  generateServicesName(services)
  const serviceContainers = services.map(service => {
    core.debug(`Adding service '${service.image}' to pod definition`)
    return createContainerSpec(
      service,
      generateContainerName(service.image),
      false,
      extension,
      service.createOptions
    )
  })

  const tpuRequestingContainers = services.filter(
    service => service.resources?.limits?.[tpuResource]
  )

  if (tpuRequestingContainers.length > 1) {
    throw new Error(
      `${tpuRequestingContainers.length} containers request for TPU's. Only 1 container per pod can request for TPU's.`
    )
  }

  if (tpuRequestingContainers.length === 1) {
    const requests = container?.resources?.requests
    const limits = container?.resources?.limits
    const hasTpuRequest = Boolean(requests?.[tpuResource])
    const hasTpuLimit = Boolean(limits?.[tpuResource])

    // Requests and limits are checked independently: Kubernetes uses a limit
    // as the request when no request is given, so a limits-only TPU entry on
    // the main container would still compete with the service container.
    if (hasTpuRequest || hasTpuLimit) {
      core.debug(
        'removing tpu from main container resources request and limits as they are requested by the service container and only 1 container in a pod can request TPU.'
      )
      if (requests && hasTpuRequest) {
        delete requests[tpuResource]
      }
      if (limits && hasTpuLimit) {
        delete limits[tpuResource]
      }
      // Confirms the removal completed so it can be traced when debugging.
      core.debug(
        `removed '${tpuResource}' from main container resources (requests: ${hasTpuRequest}, limits: ${hasTpuLimit})`
      )
    }
  }

  return serviceContainers
}

// Create JobSet and waits for it to come online
async function prepareJobSet(
args: PrepareJobArgs,
Expand Down
Loading