Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion kubeflow/trainer/backends/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def get_container_devices(
# TODO (andreyvelich): We should discuss how to get container device type.
# Potentially, we can use the trainer.kubeflow.org/device label from the runtime or
# node types.
# TODO (andreyvelich): Support other resource labels (e.g. NPUs).
# TODO (andreyvelich): Support other resource labels.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can remove this TODO statement.

Suggested change
# TODO (andreyvelich): Support other resource labels.

if constants.GPU_LABEL in resources.limits:
device = constants.GPU_LABEL.split("/")[1]
device_count = resources.limits[constants.GPU_LABEL].actual_instance
Expand All @@ -55,6 +55,13 @@ def get_container_devices(
mig_key = mig_keys[0]
device = mig_key.split("/")[1]
device_count = resources.limits[mig_key].actual_instance
elif npu_keys := [k for k in resources.limits if k.endswith(constants.NPU_LABEL_SUFFIX)]:
if len(npu_keys) > 1:
raise ValueError(f"Multiple NPU resource types are not supported yet: {npu_keys}")
Comment on lines +59 to +60
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this?


npu_key = npu_keys[0]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you taking the first item? I thought NPU resources were set similarly to TPU and GPU.

device = npu_key.rsplit("/", 1)[1]
device_count = resources.limits[npu_key].actual_instance
elif constants.CPU_LABEL in resources.limits:
device = constants.CPU_LABEL
device_count = resources.limits[constants.CPU_LABEL].actual_instance
Expand Down
25 changes: 25 additions & 0 deletions kubeflow/trainer/backends/kubernetes/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,31 @@ def _build_runtime() -> types.Runtime:
},
expected_error=ValueError,
),
TestCase(
name="single NPU limit returns device and count",
expected_status=SUCCESS,
config={
"resources": models.IoK8sApiCoreV1ResourceRequirements(
limits={
"huawei.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(2),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove vendors from unit tests

Suggested change
"huawei.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(2),
"example.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(2),

}
)
},
expected_output=("npu", "2.0"),
),
TestCase(
name="multiple NPU resource types are not supported",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should not be the case.

expected_status=FAILED,
config={
"resources": models.IoK8sApiCoreV1ResourceRequirements(
limits={
"huawei.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(1),
"vendor.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(1),
}
)
},
expected_error=ValueError,
),
],
)
def test_get_container_devices(test_case: TestCase):
Expand Down
3 changes: 3 additions & 0 deletions kubeflow/trainer/constants/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@
# The label for TPU in the container resources.
TPU_LABEL = "google.com/tpu"

# The label suffix for NPU in the container resources.
NPU_LABEL_SUFFIX = "/npu"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the resource name is: huawei.com/Ascend910.
@danish9039 Do you know details about it?
Ref: #264


# The label key to identify the JobSet name of the Pod.
JOBSET_NAME_LABEL = "jobset.sigs.k8s.io/jobset-name"

Expand Down
Loading