|
| 1 | +"""PyTorch GPU load test utilities for stress testing and monitoring GPU performance.""" |
| 2 | + |
| 3 | +import subprocess |
| 4 | +import time |
| 5 | +import torch |
| 6 | + |
| 7 | + |
def get_gpu_memory_info(gpu_idx: int) -> tuple[float, int, int]:
    """Get GPU memory info: usage percent, total MB, and used MB.

    Runs ``nvidia-smi`` for a single GPU, asking for the ``memory.total`` and
    ``memory.used`` fields as unit-less CSV, and derives the usage percentage
    from those two numbers. The values are returned in whatever unit
    ``nvidia-smi`` reports them in.

    Args:
        gpu_idx: Index of the GPU to query.

    Returns:
        Tuple of (percent_used, total_mb, used_mb).

    Raises:
        FileNotFoundError: If the ``nvidia-smi`` executable cannot be found.
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a non-zero
            status (its stderr is available on the exception).
        ValueError: If the output is not exactly two comma-separated integers.
    """
    # check=True surfaces a failed query as CalledProcessError instead of
    # letting empty stdout fall through to a confusing int('') ValueError.
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.total,memory.used',
         '--format=csv,noheader,nounits', f'--id={gpu_idx}'],
        capture_output=True, text=True, check=True,
    )

    # Split on the bare comma; int() tolerates the surrounding whitespace.
    fields = result.stdout.strip().split(',')
    if len(fields) != 2:
        raise ValueError(
            f'Unexpected nvidia-smi output for GPU {gpu_idx}: {result.stdout!r}'
        )

    total, used = (int(field) for field in fields)
    # Guard the division so a reported total of 0 does not crash the caller.
    percent = (used / total) * 100 if total else 0.0

    return percent, total, used
| 27 | + |
| 28 | + |
def get_gpu_temp(gpu_idx: int) -> int:
    """Get GPU temperature (C) using nvidia-smi.

    Runs ``nvidia-smi`` for a single GPU, asking for the ``temperature.gpu``
    field as unit-less CSV, and parses the output as an integer.

    Args:
        gpu_idx: Index of the GPU to query.

    Returns:
        Temperature in degrees Celsius.

    Raises:
        FileNotFoundError: If the ``nvidia-smi`` executable cannot be found.
        subprocess.CalledProcessError: If ``nvidia-smi`` exits with a non-zero
            status (its stderr is available on the exception).
        ValueError: If the output is not a single integer.
    """
    # check=True surfaces a failed query as CalledProcessError instead of
    # letting empty stdout fall through to a confusing int('') ValueError.
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=temperature.gpu',
         '--format=csv,noheader,nounits', f'--id={gpu_idx}'],
        capture_output=True, text=True, check=True,
    )

    return int(result.stdout.strip())
| 45 | + |
| 46 | + |
def get_available_gpus() -> list[int]:
    """Enumerate the CUDA devices PyTorch can see.

    Prints a one-line summary plus one line per device name, or a notice
    when CUDA is unavailable.

    Returns:
        The device indices ``0 .. device_count - 1``, or an empty list when
        ``torch.cuda.is_available()`` is false.
    """
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f'Found {gpu_count} CUDA GPU(s)')

        indices = list(range(gpu_count))
        for idx in indices:
            device_name = torch.cuda.get_device_name(idx)
            print(f' GPU {idx}: {device_name}')

        return indices

    print('CUDA is not available')
    return []
| 64 | + |
| 65 | + |
def scale_matrix_to_memory_target(gpu_idx: int, target_memory_percent: float = 75.0,
                                  initial_size: int = 1000, step_size: int = 1000
                                  ) -> tuple[torch.Tensor, torch.Tensor, int]:
    """Scale matrix size to reach target GPU memory usage.

    Starting at ``initial_size``, allocates two random square matrices on the
    GPU, multiplies them, and reads the device memory usage via
    ``get_gpu_memory_info``. The size grows by ``step_size`` until the usage
    reaches ``target_memory_percent`` or an allocation fails, in which case
    the last size that succeeded is re-allocated and returned.

    Tensors from the previous attempt are released before each new attempt so
    that the memory reading reflects only the current matrix size.

    Args:
        gpu_idx: Index of the GPU to use.
        target_memory_percent: Target percentage of GPU memory to use.
        initial_size: Starting matrix size. Must be positive.
        step_size: Amount to increase matrix size each iteration. Must be
            positive, otherwise the search could never make progress.

    Returns:
        Tuple of (matrix_a, matrix_b, final_matrix_size).

    Raises:
        ValueError: If ``initial_size`` or ``step_size`` is not positive.
        RuntimeError: Re-raised from PyTorch if even ``initial_size`` cannot
            be allocated (there is no smaller size to fall back to).
    """
    if initial_size <= 0:
        raise ValueError(f'initial_size must be positive, got {initial_size}')
    if step_size <= 0:
        raise ValueError(f'step_size must be positive, got {step_size}')

    device = torch.device(f'cuda:{gpu_idx}')
    _, total_mem, initial_used = get_gpu_memory_info(gpu_idx)

    matrix_size = initial_size
    last_successful_size = None  # no size has been proven to fit yet
    matrix_a = None
    matrix_b = None
    product = None

    print(f'Total GPU memory: {total_mem} MB')
    print(f'Initial memory used: {initial_used} MB ({initial_used/total_mem*100:.1f}%)')
    print(f'Target: {target_memory_percent}% memory usage\n')

    while True:
        # Drop every reference to the previous attempt first; empty_cache()
        # can only return memory that no live tensor still occupies.
        matrix_a = matrix_b = product = None
        torch.cuda.empty_cache()

        print(f'Trying matrix size: {matrix_size}x{matrix_size}...', end=' ', flush=True)

        try:
            matrix_a = torch.randn(matrix_size, matrix_size, device=device)
            matrix_b = torch.randn(matrix_size, matrix_size, device=device)

            product = torch.matmul(matrix_a, matrix_b)
            torch.cuda.synchronize(device)
        except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
            print(f'\nOOM! ({type(e).__name__})')

            # Release whatever part of the failed attempt did get allocated
            # before trying to allocate the fallback matrices.
            matrix_a = matrix_b = product = None
            torch.cuda.empty_cache()

            if last_successful_size is None:
                # Nothing ever fit, so there is nothing to fall back to.
                raise

            print(f'Using last successful size: {last_successful_size}x{last_successful_size}')

            matrix_size = last_successful_size
            matrix_a = torch.randn(matrix_size, matrix_size, device=device)
            matrix_b = torch.randn(matrix_size, matrix_size, device=device)
            product = torch.matmul(matrix_a, matrix_b)
            torch.cuda.synchronize(device)
            break

        # Queried outside the try block so that a monitoring failure is never
        # reported as an out-of-memory condition.
        mem_percent, _, mem_used = get_gpu_memory_info(gpu_idx)
        print(f'Memory: {mem_used} MB ({mem_percent:.1f}%)')

        last_successful_size = matrix_size

        if mem_percent >= target_memory_percent:
            print(f'\nReached {mem_percent:.1f}% memory usage with {matrix_size}x{matrix_size} matrices')
            break

        matrix_size += step_size

    return matrix_a, matrix_b, matrix_size
| 131 | + |
| 132 | + |
def run_stress_test(gpu_idx: int, matrix_a, matrix_b, matrix_size: int,
                    duration_seconds: int = 600, temp_record_interval: int = 5,
                    progress_interval: int = 30) -> dict:
    """Run GPU stress test for specified duration.

    Repeatedly multiplies ``matrix_a`` by ``matrix_b`` and synchronizes the
    device after every multiplication, until ``duration_seconds`` have passed.
    Temperature samples are recorded along the way and progress is printed
    periodically. Elapsed time is measured with a monotonic clock so that
    system clock adjustments cannot shorten or lengthen the test.

    Args:
        gpu_idx: Index of the GPU to use.
        matrix_a: First matrix for multiplication.
        matrix_b: Second matrix for multiplication.
        matrix_size: Size of the matrices (used for reporting only).
        duration_seconds: Duration of the stress test in seconds.
        temp_record_interval: Interval in seconds for recording temperature.
        progress_interval: Interval in seconds for printing progress.

    Returns:
        Dictionary with keys ``matrix_size``, ``final_mem_used``,
        ``final_mem_percent``, ``iteration_count``, ``elapsed_time``,
        ``initial_temp``, ``final_temp`` and ``temp_data``. ``temp_data``
        holds parallel ``times`` (seconds since start) and ``temps`` lists,
        including the initial and final readings.
    """
    device = torch.device(f'cuda:{gpu_idx}')
    initial_temp = get_gpu_temp(gpu_idx)

    temp_data = {'times': [0], 'temps': [initial_temp]}

    print(f'\nRunning stress test for {duration_seconds // 60} minutes at {matrix_size}x{matrix_size}...')
    print(f'Initial temperature: {initial_temp}°C')

    start_time = time.monotonic()
    iteration_count = 0
    last_update_time = start_time
    last_temp_record_time = start_time

    while (time.monotonic() - start_time) < duration_seconds:
        # The result stays bound until the next iteration, so the previous
        # output buffer is still alive while the next one is computed.
        product = torch.matmul(matrix_a, matrix_b)
        # Wait for the kernel to finish so iterations reflect completed work.
        torch.cuda.synchronize(device)
        iteration_count += 1

        current_time = time.monotonic()
        elapsed = current_time - start_time
        current_temp = None  # queried at most once per iteration

        if current_time - last_temp_record_time >= temp_record_interval:
            current_temp = get_gpu_temp(gpu_idx)
            temp_data['times'].append(elapsed)
            temp_data['temps'].append(current_temp)
            last_temp_record_time = current_time

        if current_time - last_update_time >= progress_interval:
            if current_temp is None:
                current_temp = get_gpu_temp(gpu_idx)
            remaining = duration_seconds - elapsed
            print(f' {elapsed:.0f}s elapsed, {remaining:.0f}s remaining - '
                  f'Temp: {current_temp}°C, Iterations: {iteration_count}')
            last_update_time = current_time

    elapsed_time = time.monotonic() - start_time
    final_temp = get_gpu_temp(gpu_idx)

    temp_data['times'].append(elapsed_time)
    temp_data['temps'].append(final_temp)

    final_mem_percent, _, final_mem_used = get_gpu_memory_info(gpu_idx)

    return {
        'matrix_size': matrix_size,
        'final_mem_used': final_mem_used,
        'final_mem_percent': final_mem_percent,
        'iteration_count': iteration_count,
        'elapsed_time': elapsed_time,
        'initial_temp': initial_temp,
        'final_temp': final_temp,
        'temp_data': temp_data
    }
| 203 | + |
| 204 | + |
def run_pytorch_gpu_load_test(target_memory_percent: float = 75.0,
                              test_duration_seconds: int = 600) -> dict:
    """Run complete GPU load test on all available GPUs using PyTorch.

    For each GPU returned by ``get_available_gpus``, sizes a pair of matrices
    with ``scale_matrix_to_memory_target``, runs ``run_stress_test`` on them,
    prints a summary, and releases the matrices before moving on to the next
    GPU.

    Args:
        target_memory_percent: Target percentage of GPU memory to use.
        test_duration_seconds: Duration of the stress test in seconds.

    Returns:
        Dictionary mapping GPU index to test results including temperature
        data. Each result is the dictionary from ``run_stress_test`` with an
        added ``name`` key holding the device name. Empty when no GPU is
        available.
    """
    gpu_indices = get_available_gpus()

    if not gpu_indices:
        print('No GPUs available for testing')
        return {}

    gpu_results = {}

    for gpu_idx in gpu_indices:
        gpu_name = torch.cuda.get_device_name(gpu_idx)

        print(f'{"="*60}')
        print(f'Testing GPU {gpu_idx}: {gpu_name}')
        print(f'{"="*60}')

        matrix_a, matrix_b, matrix_size = scale_matrix_to_memory_target(
            gpu_idx, target_memory_percent
        )

        try:
            results = run_stress_test(
                gpu_idx, matrix_a, matrix_b, matrix_size, test_duration_seconds
            )
        finally:
            # Free this GPU's matrices so they do not stay allocated while
            # the remaining GPUs are tested (or after a failure).
            del matrix_a, matrix_b
            torch.cuda.empty_cache()

        results['name'] = gpu_name

        gpu_results[gpu_idx] = results

        print(f'\nGPU {gpu_idx} stress test completed!')
        print(f' - Final matrix size: {results["matrix_size"]}x{results["matrix_size"]}')
        print(f' - Peak memory usage: {results["final_mem_used"]} MB ({results["final_mem_percent"]:.1f}%)')
        print(f' - Total iterations: {results["iteration_count"]}')
        print(f' - Total time: {results["elapsed_time"]:.1f} seconds ({results["elapsed_time"]/60:.1f} minutes)')

        # A zero-length test completes no iterations; avoid dividing by zero.
        iterations = results['iteration_count']
        if iterations:
            print(f' - Avg time per operation: {results["elapsed_time"]/iterations:.4f} seconds')
        else:
            print(' - Avg time per operation: n/a (no iterations completed)')

        print(f' - Temperature: {results["initial_temp"]}°C → {results["final_temp"]}°C '
              f'(Δ{results["final_temp"] - results["initial_temp"]:+d}°C)\n')

    print(f'{"="*60}')
    print(f'All {len(gpu_indices)} GPU(s) stress tested successfully with PyTorch!')
    print(f'{"="*60}')

    return gpu_results
| 256 | + |
| 257 | + |
def plot_temperature_results(gpu_results: dict):
    """Draw one temperature-versus-time line per GPU and show the figure.

    Args:
        gpu_results: Mapping of GPU index to a result dictionary whose
            ``temp_data`` entry holds parallel ``times`` (seconds) and
            ``temps`` lists, as produced by run_pytorch_gpu_load_test.
    """
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.set_title('GPU temperature over time during PyTorch stress test')

    for gpu_idx, results in gpu_results.items():
        samples = results['temp_data']
        # Samples are stored in seconds; the x axis is in minutes.
        minutes = [seconds / 60 for seconds in samples['times']]
        ax.plot(minutes, samples['temps'], label=f'GPU {gpu_idx}')

    ax.set_xlabel('Time (minutes)')
    ax.set_ylabel('Temperature (°C)')
    ax.legend()
    fig.tight_layout()
    plt.show()
0 commit comments