Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions kubeflow/trainer/backends/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,8 @@ def get_command_using_train_func(
train_func_parameters: dict[str, Any] | None,
pip_index_urls: list[str],
packages_to_install: list[str] | None,
enable_profiler: bool = False,
profiler_dir: str = "/artifacts/profile",
) -> list[str]:
"""
Get the Trainer container command from the given training function and parameters.
Expand Down Expand Up @@ -358,7 +360,24 @@ def get_command_using_train_func(
func_call = f"{train_func.__name__}(**{train_func_parameters})"

# Combine everything into the final code string.
func_code = f"{func_code}\n{func_call}\n"
if enable_profiler:
profiler_code = textwrap.dedent(f"""\
import torch
from torch.profiler import profile, record_function, ProfilerActivity

with profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=True,
profile_memory=True,
with_stack=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler("{profiler_dir}")
) as prof:
with record_function("model_training"):
{func_call}
""")
func_code = f"{func_code}\n{profiler_code}\n"
else:
func_code = f"{func_code}\n{func_call}\n"

is_mpi = runtime.trainer.command[0] == "mpirun"
# The default file location for OpenMPI is: /home/mpiuser/<FILE_NAME>.py
Expand Down Expand Up @@ -420,14 +439,16 @@ def get_trainer_cr_from_custom_trainer(
trainer.func_args,
trainer.pip_index_urls,
trainer.packages_to_install,
trainer.enable_profiler,
trainer.profiler_dir,
)

# Set the TrainJob trainer image if that is set.
if trainer.image:
if getattr(trainer, "image", None):
trainer_cr.image = trainer.image

# Add environment variables to the Trainer.
if trainer.env:
if getattr(trainer, "env", None):
trainer_cr.env = [
models.IoK8sApiCoreV1EnvVar(name=key, value=value) for key, value in trainer.env.items()
]
Expand Down
35 changes: 35 additions & 0 deletions kubeflow/trainer/backends/kubernetes/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,39 @@ def test_get_script_for_python_packages(test_case):
),
],
),
TestCase(
name="with profiler enabled",
expected_status=SUCCESS,
config={
"func": (lambda: print("Hello World")),
"func_args": None,
"runtime": _build_runtime(),
"enable_profiler": True,
"profiler_dir": "/custom/profile/dir",
},
expected_output=[
"bash",
"-c",
(
"\nread -r -d '' SCRIPT << EOM\n\n"
'"func": (lambda: print("Hello World")),\n\n'
"import torch\n"
"from torch.profiler import profile, record_function, ProfilerActivity\n\n"
"with profile(\n"
" activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],\n"
" record_shapes=True,\n"
" profile_memory=True,\n"
" with_stack=True,\n"
' on_trace_ready=torch.profiler.tensorboard_trace_handler("/custom/profile/dir")\n'
") as prof:\n"
' with record_function("model_training"):\n'
" <lambda>()\n\n\n"
"EOM\n"
'printf "%s" "$SCRIPT" > "utils_test.py"\n'
'python "utils_test.py"'
),
],
),
],
)
def test_get_command_using_train_func(test_case: TestCase):
Expand All @@ -487,6 +520,8 @@ def test_get_command_using_train_func(test_case: TestCase):
train_func_parameters=test_case.config.get("func_args"),
pip_index_urls=constants.DEFAULT_PIP_INDEX_URLS,
packages_to_install=test_case.config.get("packages_to_install", []),
enable_profiler=test_case.config.get("enable_profiler", False),
profiler_dir=test_case.config.get("profiler_dir", "/artifacts/profile"),
)

assert test_case.expected_status == SUCCESS
Expand Down
27 changes: 25 additions & 2 deletions kubeflow/trainer/backends/localprocess/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ def get_command_using_train_func(
train_func_parameters: dict[str, Any] | None,
venv_dir: str,
train_job_name: str,
enable_profiler: bool = False,
profiler_dir: str = "/artifacts/profile",
) -> str:
"""
Get the Trainer container command from the given training function and parameters.
Expand Down Expand Up @@ -214,9 +216,28 @@ def get_command_using_train_func(
# print('Start Training...')
# train({'lr': 0.01})
if train_func_parameters is None:
func_code = f"{func_code}\n{train_func.__name__}()\n"
func_call = f"{train_func.__name__}()"
else:
func_code = f"{func_code}\n{train_func.__name__}({train_func_parameters})\n"
func_call = f"{train_func.__name__}(**{train_func_parameters})"

if enable_profiler:
profiler_code = textwrap.dedent(f"""\
import torch
from torch.profiler import profile, record_function, ProfilerActivity

with profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
record_shapes=True,
profile_memory=True,
with_stack=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler("{profiler_dir}")
) as prof:
with record_function("model_training"):
{func_call}
""")
func_code = f"{func_code}\n{profiler_code}\n"
else:
func_code = f"{func_code}\n{func_call}\n"

with open(func_file, "w") as f:
f.write(func_code)
Expand Down Expand Up @@ -286,6 +307,8 @@ def get_local_train_job_script(
train_func=trainer.func,
train_func_parameters=trainer.func_args,
train_job_name=train_job_name,
enable_profiler=trainer.enable_profiler,
profiler_dir=trainer.profiler_dir,
)

cleanup_script = get_cleanup_venv_script(cleanup_venv=cleanup_venv, venv_dir=venv_dir)
Expand Down
2 changes: 2 additions & 0 deletions kubeflow/trainer/types/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class CustomTrainer:
num_nodes: int | None = None
resources_per_node: dict | None = None
env: dict[str, str] | None = None
enable_profiler: bool = False
profiler_dir: str = "/artifacts/profile"


# Configuration for the Custom Trainer Container.
Expand Down
Loading