Adding rank based logging for torch distributed examples #3897
Adding rank based logging for torch distributed examples #3897 — apbose wants to merge 3 commits into abose/trt_llm_installation_dist from
Conversation
31666e3 to
52ae92a
Compare
| return device_mesh, world_size, rank | ||
| # Set C++ TensorRT runtime log level based on most verbose handler | ||
| # this is similar to set_log_level() | ||
| cpp_level = min(file_level_int, console_level_int) |
There was a problem hiding this comment.
Don't we have an API that abstracts needing to detect if the C++ runtime is available? If not, we should add one.
There was a problem hiding this comment.
I have added a function in _features.py for the above, and also moved all this to logging.py. Let me know if that function placement works.
| not is_platform_supported_for_trtllm(), | ||
| "Skipped on Windows, Jetson and CUDA13: NCCL backend is not supported.", | ||
| not is_distributed_nccl_available(), | ||
| "Skipped: NCCL backend is not available (Windows/Jetson not supported).", |
There was a problem hiding this comment.
Is it jetson or just Orin?
There was a problem hiding this comment.
Yeah, Orin. Changed it to "Jetson Orin".
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 00:37:46.920408+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 00:38:18.669710+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 07:00:24.693914+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 07:00:53.634960+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summaryaa4183e to
2ea29e4
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 15:34:05.984305+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 15:34:37.144980+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
2ea29e4 to
6833fec
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 22:41:27.269191+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 22:41:58.523912+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
6833fec to
f8befae
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 23:49:16.116928+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-02 23:49:48.815063+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
--- /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-02 23:49:16.689930+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-02 23:50:00.341840+00:00
@@ -74,19 +74,19 @@
try:
size = os.path.getsize(path)
shm_files.append((path, size))
except OSError:
shm_files.append((path, -1))
-
+
# Sort by size descending
shm_files.sort(key=lambda x: x[1], reverse=True)
for path, size in shm_files:
if size >= 0:
print(f" {path}: {size / (1024 * 1024):.2f} MB")
else:
print(f" {path}: <unable to get size>")
-
+
if not shm_files:
print(" (no files found)")
except Exception as e:
print(f" Error listing /dev/shm: {e}")
f8befae to
f40e84b
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 00:44:03.183076+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 00:44:33.293930+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
--- /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-03 00:44:03.634077+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-03 00:44:44.650284+00:00
@@ -67,41 +67,39 @@
# List ALL files in /dev/shm to see what's consuming space
print("\nAll files in /dev/shm (including hidden):")
try:
import subprocess
+
# Use ls -la to see all files including hidden ones
result = subprocess.run(
- ["ls", "-la", "/dev/shm"],
- capture_output=True,
- text=True,
- timeout=5
+ ["ls", "-la", "/dev/shm"], capture_output=True, text=True, timeout=5
)
print(result.stdout)
-
+
# Also run du to see actual disk usage
print("\nDisk usage breakdown (du -sh /dev/shm/*):")
result = subprocess.run(
["du", "-sh", "/dev/shm/*"],
capture_output=True,
text=True,
shell=False,
- timeout=5
+ timeout=5,
)
# du with glob needs shell=True
result = subprocess.run(
"du -sh /dev/shm/* 2>/dev/null | head -20",
capture_output=True,
text=True,
shell=True,
- timeout=5
+ timeout=5,
)
print(result.stdout if result.stdout else " (no output)")
-
+
except Exception as e:
print(f" Error listing /dev/shm: {e}")
-
+
# Also list using Python for comparison
print("\nPython os.listdir():")
try:
shm_files = []
for f in os.listdir("/dev/shm"):
@@ -109,25 +107,27 @@
try:
size = os.path.getsize(path)
shm_files.append((path, size))
except OSError:
shm_files.append((path, -1))
-
+
# Sort by size descending
shm_files.sort(key=lambda x: x[1], reverse=True)
total_listed = 0
for path, size in shm_files:
if size >= 0:
print(f" {path}: {size / (1024 * 1024):.2f} MB")
total_listed += size
else:
print(f" {path}: <unable to get size>")
-
+
print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
- print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
-
+ print(
+ f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!"
+ )
+
if not shm_files:
print(" (no files found)")
except Exception as e:
print(f" Error: {e}")
@@ -135,11 +135,11 @@
"/dev/shm/nccl-*",
"/dev/shm/torch_*",
"/dev/shm/py_shared_memory_*",
"/dev/shm/*multiprocessing*",
"/dev/shm/vader_segment*", # Open MPI shared memory
- "/dev/shm/sem.*", # POSIX semaphores
+ "/dev/shm/sem.*", # POSIX semaphores
]
total_files = 0
total_bytes_freed = 0
f40e84b to
99ded8c
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 05:24:09.711666+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 05:24:42.564679+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
--- /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-03 05:24:10.282669+00:00
+++ /home/runner/work/TensorRT/TensorRT/tests/py/dynamo/distributed/test_nccl_ops.py 2025-12-03 05:24:54.082736+00:00
@@ -67,33 +67,31 @@
# List ALL files in /dev/shm to see what's consuming space
print("\nAll files in /dev/shm (including hidden):")
try:
import subprocess
+
# Use ls -la to see all files including hidden ones
result = subprocess.run(
- ["ls", "-la", "/dev/shm"],
- capture_output=True,
- text=True,
- timeout=5
+ ["ls", "-la", "/dev/shm"], capture_output=True, text=True, timeout=5
)
print(result.stdout)
-
+
# Also run du to see actual disk usage
print("\nDisk usage breakdown (du -sh /dev/shm/*):")
result = subprocess.run(
"du -sh /dev/shm/* 2>/dev/null | head -20",
capture_output=True,
text=True,
shell=True,
- timeout=5
+ timeout=5,
)
print(result.stdout if result.stdout else " (no output)")
-
+
except Exception as e:
print(f" Error listing /dev/shm: {e}")
-
+
# Also list using Python for comparison
print("\nPython os.listdir():")
try:
shm_files = []
for f in os.listdir("/dev/shm"):
@@ -101,25 +99,27 @@
try:
size = os.path.getsize(path)
shm_files.append((path, size))
except OSError:
shm_files.append((path, -1))
-
+
# Sort by size descending
shm_files.sort(key=lambda x: x[1], reverse=True)
total_listed = 0
for path, size in shm_files:
if size >= 0:
print(f" {path}: {size / (1024 * 1024):.2f} MB")
total_listed += size
else:
print(f" {path}: <unable to get size>")
-
+
print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
- print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
-
+ print(
+ f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!"
+ )
+
if not shm_files:
print(" (no files found)")
except Exception as e:
print(f" Error: {e}")
@@ -127,11 +127,11 @@
"/dev/shm/nccl-*",
"/dev/shm/torch_*",
"/dev/shm/py_shared_memory_*",
"/dev/shm/*multiprocessing*",
"/dev/shm/vader_segment*", # Open MPI shared memory
- "/dev/shm/sem.*", # POSIX semaphores
+ "/dev/shm/sem.*", # POSIX semaphores
]
total_files = 0
total_bytes_freed = 0
99ded8c to
6e91c4e
Compare
There was a problem hiding this comment.
There are some changes that do not conform to Python style guidelines:
--- /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 14:38:26.671953+00:00
+++ /home/runner/work/TensorRT/TensorRT/.github/scripts/filter-matrix.py 2025-12-03 14:38:59.549613+00:00
@@ -148,11 +148,11 @@
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
- filtered_includes.append(item)
+ filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
# Debug: Show summary
6e91c4e to
3e42d12
Compare
…ting TRT-LLM installation fallback cases
3e42d12 to
091c2e4
Compare
| else: | ||
| logger.setLevel(level) | ||
|
|
||
| if has_torchscript_frontend(): |
There was a problem hiding this comment.
If we have the frontend we necessarily have the runtime; I don't think we need to use these APIs.
| _LOGGER.setLevel(logging.CRITICAL) | ||
|
|
||
| if ENABLED_FEATURES.torchscript_frontend: | ||
| if has_torchscript_frontend(): |
There was a problem hiding this comment.
lets just remove the has_torchscript_frontend cases
narendasan
left a comment
There was a problem hiding this comment.
Just remove the TS ones since we should be able to handle both with the runtime and then LGTM
This PR