Skip to content

Commit b5cde70

Browse files
committed
Modify Dockerfile.gpu and add logging to the EFA test
1 parent 26fd863 commit b5cde70

File tree

2 files changed

+15
-1
lines changed

2 files changed

+15
-1
lines changed

pytorch/training/docker/2.8/py3/cu129/Dockerfile.gpu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ RUN apt-get update \
105 105   cuda-toolkit-12=${CUDA_VERSION}-1 \
106 106   libcudnn9-cuda-12=${CUDNN_VERSION}-1 \
107 107   libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \
108     - libcudnn9-headers-cuda-12=${CUDNN_VERSION}-1 \
109 108   libhwloc-dev \
110 109   libgomp1 \
111 110   libibverbs-dev \

test/dlc_tests/ec2/test_efa.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,16 +376,31 @@ def _create_master_mpi_hosts_file(efa_ec2_connections, worker_instance_ids, inst
376 376   )
377 377   else:
378 378   # Configure MPI hosts file with IP addresses and slots for worker nodes
    379 + # TODO: remove debug logging after testing
    380 + LOGGER.info(f"Creating hosts file with master_ip={master_ip}, slots={slots}")
    381 + LOGGER.info(f"Worker IPs: {worker_instance_private_ips}")
    382 +
379 383   hosts_string = f"localhost slots={slots} "
380 384   for worker_ip in worker_instance_private_ips:
381 385   hosts_string += f"\n{worker_ip} slots={slots} "
    386 + LOGGER.info(f"Final hosts file content:\n{hosts_string}")
382 387
383 388   run_cmd_on_container(
384 389   MASTER_CONTAINER_NAME,
385 390   master_connection,
386 391   f"""echo -e "{hosts_string}" > {HOSTS_FILE_LOCATION}""",
387 392   )
388 393
    394 + # TODO: remove debug logging after testing
    395 + # check to make sure file was created
    396 + LOGGER.info("Verifying hosts file creation:")
    397 + run_cmd_on_container(
    398 + MASTER_CONTAINER_NAME,
    399 + master_connection,
    400 + f"ls -l {HOSTS_FILE_LOCATION} && cat {HOSTS_FILE_LOCATION}",
    401 + hide=False
    402 + )
    403 +
389 404
390 405   def _setup_worker_efa_ssh_config(connection, master_pub_key):
391 406   """

0 commit comments

Comments
 (0)