-
Notifications
You must be signed in to change notification settings - Fork 145
feat(trainer): Support NPU labels in TrainJob device #336
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,7 +41,7 @@ def get_container_devices( | |
| # TODO (andreyvelich): We should discuss how to get container device type. | ||
| # Potentially, we can use the trainer.kubeflow.org/device label from the runtime or | ||
| # node types. | ||
| # TODO (andreyvelich): Support other resource labels (e.g. NPUs). | ||
| # TODO (andreyvelich): Support other resource labels. | ||
| if constants.GPU_LABEL in resources.limits: | ||
| device = constants.GPU_LABEL.split("/")[1] | ||
| device_count = resources.limits[constants.GPU_LABEL].actual_instance | ||
|
|
@@ -55,6 +55,13 @@ def get_container_devices( | |
| mig_key = mig_keys[0] | ||
| device = mig_key.split("/")[1] | ||
| device_count = resources.limits[mig_key].actual_instance | ||
| elif npu_keys := [k for k in resources.limits if k.endswith(constants.NPU_LABEL_SUFFIX)]: | ||
| if len(npu_keys) > 1: | ||
| raise ValueError(f"Multiple NPU resource types are not supported yet: {npu_keys}") | ||
|
Comment on lines
+59
to
+60
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this? |
||
|
|
||
| npu_key = npu_keys[0] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are you taking the first item? I thought NPU resources are set similarly to TPU and GPU |
||
| device = npu_key.rsplit("/", 1)[1] | ||
| device_count = resources.limits[npu_key].actual_instance | ||
| elif constants.CPU_LABEL in resources.limits: | ||
| device = constants.CPU_LABEL | ||
| device_count = resources.limits[constants.CPU_LABEL].actual_instance | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -61,6 +61,31 @@ def _build_runtime() -> types.Runtime: | |||||
| }, | ||||||
| expected_error=ValueError, | ||||||
| ), | ||||||
| TestCase( | ||||||
| name="single NPU limit returns device and count", | ||||||
| expected_status=SUCCESS, | ||||||
| config={ | ||||||
| "resources": models.IoK8sApiCoreV1ResourceRequirements( | ||||||
| limits={ | ||||||
| "huawei.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(2), | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove vendors from unit tests
Suggested change
|
||||||
| } | ||||||
| ) | ||||||
| }, | ||||||
| expected_output=("npu", "2.0"), | ||||||
| ), | ||||||
| TestCase( | ||||||
| name="multiple NPU resource types are not supported", | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should not be the case. |
||||||
| expected_status=FAILED, | ||||||
| config={ | ||||||
| "resources": models.IoK8sApiCoreV1ResourceRequirements( | ||||||
| limits={ | ||||||
| "huawei.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(1), | ||||||
| "vendor.com/npu": models.IoK8sApimachineryPkgApiResourceQuantity(1), | ||||||
| } | ||||||
| ) | ||||||
| }, | ||||||
| expected_error=ValueError, | ||||||
| ), | ||||||
| ], | ||||||
| ) | ||||||
| def test_get_container_devices(test_case: TestCase): | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -104,6 +104,9 @@ | |
| # The label for TPU in the container resources. | ||
| TPU_LABEL = "google.com/tpu" | ||
|
|
||
| # The Suffix label for NPU in the container resources. | ||
| NPU_LABEL_SUFFIX = "/npu" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought the resource name is: huawei.com/Ascend910. |
||
|
|
||
| # The label key to identify the JobSet name of the Pod. | ||
| JOBSET_NAME_LABEL = "jobset.sigs.k8s.io/jobset-name" | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can remove this TODO statement.