
Commit 9a2133d

Added PyTorch load test

1 parent 5e18e7b · commit 9a2133d

File tree

2 files changed: +333 −6 lines changed

notebooks/environment_test.ipynb

Lines changed: 55 additions & 6 deletions

@@ -9,12 +9,12 @@
     "\n",
     "## 1. Verify data science/ML libraries\n",
     "\n",
-    "Check that TensorFlow is importable and is the correct version (2.16)"
+    "Check that TensorFlow and PyTorch are importable and are the correct versions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "543d7e55",
    "metadata": {},
    "outputs": [
@@ -28,20 +28,23 @@
    ],
    "source": [
     "import tensorflow as tf\n",
-    "print('TensorFlow version:', tf.__version__)"
+    "import torch\n",
+    "\n",
+    "print('TensorFlow version:', tf.__version__)\n",
+    "print('PyTorch version:', torch.__version__)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "0adb8e86",
    "metadata": {},
    "source": [
-    "Check that GPU(s) are correctly detected and available"
+    "Check that GPU(s) are correctly detected and available for both frameworks"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "fe03f200",
    "metadata": {},
    "outputs": [
@@ -54,7 +57,16 @@
     }
    ],
    "source": [
-    "print(tf.config.list_physical_devices('GPU'))"
+    "# TensorFlow GPU detection\n",
+    "print('TensorFlow GPUs:', tf.config.list_physical_devices('GPU'))\n",
+    "\n",
+    "# PyTorch GPU detection\n",
+    "print(f'PyTorch CUDA available: {torch.cuda.is_available()}')\n",
+    "print(f'PyTorch CUDA device count: {torch.cuda.device_count()}')\n",
+    "\n",
+    "if torch.cuda.is_available():\n",
+    "    for i in range(torch.cuda.device_count()):\n",
+    "        print(f'  GPU {i}: {torch.cuda.get_device_name(i)}')"
    ]
   },
   {
@@ -265,6 +277,43 @@
     "\n",
     "plot_temperature_results(gpu_results)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ae4b268",
+   "metadata": {},
+   "source": [
+    "## 3. PyTorch GPU load test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a53d5e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# PyTorch GPU load test - Matrix multiplication with memory scaling\n",
+    "from functions.pytorch_gpu_load_test import run_pytorch_gpu_load_test\n",
+    "\n",
+    "target_memory_percent = 75.0\n",
+    "test_duration_seconds = 10 * 60\n",
+    "\n",
+    "pytorch_gpu_results = run_pytorch_gpu_load_test(target_memory_percent, test_duration_seconds)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc997887",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot temperature over time for each GPU\n",
+    "from functions.pytorch_gpu_load_test import plot_temperature_results\n",
+    "\n",
+    "plot_temperature_results(pytorch_gpu_results)"
+   ]
   }
  ],
  "metadata": {
functions/pytorch_gpu_load_test.py (new file, as imported by the notebook)

Lines changed: 278 additions & 0 deletions
"""PyTorch GPU load test utilities for stress testing and monitoring GPU performance."""

import subprocess
import time
import torch


def get_gpu_memory_info(gpu_idx: int) -> tuple[float, int, int]:
    """Get GPU memory info: usage percent, total MB, and used MB.

    Args:
        gpu_idx: Index of the GPU to query.

    Returns:
        Tuple of (percent_used, total_mb, used_mb).
    """
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=memory.total,memory.used',
         '--format=csv,noheader,nounits', f'--id={gpu_idx}'],
        capture_output=True, text=True
    )

    total, used = map(int, result.stdout.strip().split(', '))
    percent = (used / total) * 100

    return percent, total, used

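# Example with hypothetical values: if `nvidia-smi` prints "24576, 1024"
# for GPU 0 (a 24 GiB card with 1 GiB in use), get_gpu_memory_info(0)
# returns roughly (4.2, 24576, 1024).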

def get_gpu_temp(gpu_idx: int) -> int:
    """Get GPU temperature (C) using nvidia-smi.

    Args:
        gpu_idx: Index of the GPU to query.

    Returns:
        Temperature in degrees Celsius.
    """
    result = subprocess.run(
        ['nvidia-smi', '--query-gpu=temperature.gpu',
         '--format=csv,noheader,nounits', f'--id={gpu_idx}'],
        capture_output=True, text=True
    )

    return int(result.stdout.strip())


def get_available_gpus() -> list[int]:
    """Get list of available CUDA GPU indices.

    Returns:
        List of GPU indices available for PyTorch.
    """
    if not torch.cuda.is_available():
        print('CUDA is not available')
        return []

    num_gpus = torch.cuda.device_count()
    print(f'Found {num_gpus} CUDA GPU(s)')

    for i in range(num_gpus):
        print(f'  GPU {i}: {torch.cuda.get_device_name(i)}')

    return list(range(num_gpus))


def scale_matrix_to_memory_target(gpu_idx: int, target_memory_percent: float = 75.0,
                                  initial_size: int = 1000, step_size: int = 1000):
    """Scale matrix size to reach target GPU memory usage.

    Args:
        gpu_idx: Index of the GPU to use.
        target_memory_percent: Target percentage of GPU memory to use.
        initial_size: Starting matrix size.
        step_size: Amount to increase matrix size each iteration.

    Returns:
        Tuple of (matrix_a, matrix_b, final_matrix_size).
    """
    device = torch.device(f'cuda:{gpu_idx}')
    _, total_mem, initial_used = get_gpu_memory_info(gpu_idx)

    matrix_size = initial_size
    last_successful_size = matrix_size
    matrix_a = None
    matrix_b = None

    print(f'Total GPU memory: {total_mem} MB')
    print(f'Initial memory used: {initial_used} MB ({initial_used/total_mem*100:.1f}%)')
    print(f'Target: {target_memory_percent}% memory usage\n')

    while True:
        # Clear PyTorch cache
        torch.cuda.empty_cache()

        print(f'Trying matrix size: {matrix_size}x{matrix_size}...', end=' ', flush=True)

        try:
            a = torch.randn(matrix_size, matrix_size, device=device)
            b = torch.randn(matrix_size, matrix_size, device=device)

            c = torch.matmul(a, b)
            torch.cuda.synchronize(device)

            mem_percent, _, mem_used = get_gpu_memory_info(gpu_idx)
            print(f'Memory: {mem_used} MB ({mem_percent:.1f}%)')

            last_successful_size = matrix_size
            matrix_a = a
            matrix_b = b

            if mem_percent >= target_memory_percent:
                print(f'\nReached {mem_percent:.1f}% memory usage with {matrix_size}x{matrix_size} matrices')
                break

            matrix_size += step_size

        except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
            print(f'\nOOM! ({type(e).__name__})')
            print(f'Using last successful size: {last_successful_size}x{last_successful_size}')

            torch.cuda.empty_cache()

            # Fall back to the last size that allocated successfully
            matrix_size = last_successful_size
            matrix_a = torch.randn(matrix_size, matrix_size, device=device)
            matrix_b = torch.randn(matrix_size, matrix_size, device=device)
            c = torch.matmul(matrix_a, matrix_b)
            torch.cuda.synchronize(device)
            break

    return matrix_a, matrix_b, matrix_size

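# Back-of-the-envelope sizing: torch.randn allocates float32 (4 bytes per
# element), so a, b, and the product c together hold about
# 3 * 4 * matrix_size**2 bytes; e.g. matrix_size = 30_000 needs roughly
# 10.8 GB, which is why the loop grows the size in steps rather than
# trying to compute it directly.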

def run_stress_test(gpu_idx: int, matrix_a, matrix_b, matrix_size: int,
                    duration_seconds: int = 600, temp_record_interval: int = 5,
                    progress_interval: int = 30) -> dict:
    """Run GPU stress test for specified duration.

    Args:
        gpu_idx: Index of the GPU to use.
        matrix_a: First matrix for multiplication.
        matrix_b: Second matrix for multiplication.
        matrix_size: Size of the matrices.
        duration_seconds: Duration of the stress test in seconds.
        temp_record_interval: Interval in seconds for recording temperature.
        progress_interval: Interval in seconds for printing progress.

    Returns:
        Dictionary containing test results and temperature data.
    """
    device = torch.device(f'cuda:{gpu_idx}')
    initial_temp = get_gpu_temp(gpu_idx)

    temp_data = {'times': [0], 'temps': [initial_temp]}

    print(f'\nRunning stress test for {duration_seconds // 60} minutes at {matrix_size}x{matrix_size}...')
    print(f'Initial temperature: {initial_temp}°C')

    start_time = time.time()
    iteration_count = 0
    last_update_time = start_time
    last_temp_record_time = start_time

    while (time.time() - start_time) < duration_seconds:
        c = torch.matmul(matrix_a, matrix_b)
        torch.cuda.synchronize(device)
        iteration_count += 1

        current_time = time.time()

        if current_time - last_temp_record_time >= temp_record_interval:
            current_temp = get_gpu_temp(gpu_idx)
            elapsed = current_time - start_time
            temp_data['times'].append(elapsed)
            temp_data['temps'].append(current_temp)
            last_temp_record_time = current_time

        if current_time - last_update_time >= progress_interval:
            current_temp = get_gpu_temp(gpu_idx)
            elapsed = current_time - start_time
            remaining = duration_seconds - elapsed
            print(f'  {elapsed:.0f}s elapsed, {remaining:.0f}s remaining - '
                  f'Temp: {current_temp}°C, Iterations: {iteration_count}')
            last_update_time = current_time

    elapsed_time = time.time() - start_time
    final_temp = get_gpu_temp(gpu_idx)

    temp_data['times'].append(elapsed_time)
    temp_data['temps'].append(final_temp)

    final_mem_percent, _, final_mem_used = get_gpu_memory_info(gpu_idx)

    return {
        'matrix_size': matrix_size,
        'final_mem_used': final_mem_used,
        'final_mem_percent': final_mem_percent,
        'iteration_count': iteration_count,
        'elapsed_time': elapsed_time,
        'initial_temp': initial_temp,
        'final_temp': final_temp,
        'temp_data': temp_data
    }

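# Throughput note: each iteration performs one matrix_size x matrix_size
# matmul, i.e. roughly 2 * matrix_size**3 floating-point operations, so
# 2 * matrix_size**3 * iteration_count / elapsed_time gives a rough
# estimate of sustained FLOP/s from the returned results.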

def run_pytorch_gpu_load_test(target_memory_percent: float = 75.0,
                              test_duration_seconds: int = 600) -> dict:
    """Run complete GPU load test on all available GPUs using PyTorch.

    Args:
        target_memory_percent: Target percentage of GPU memory to use.
        test_duration_seconds: Duration of the stress test in seconds.

    Returns:
        Dictionary mapping GPU index to test results including temperature data.
    """
    gpu_indices = get_available_gpus()

    if not gpu_indices:
        print('No GPUs available for testing')
        return {}

    gpu_results = {}

    for gpu_idx in gpu_indices:
        gpu_name = torch.cuda.get_device_name(gpu_idx)

        print(f'{"="*60}')
        print(f'Testing GPU {gpu_idx}: {gpu_name}')
        print(f'{"="*60}')

        matrix_a, matrix_b, matrix_size = scale_matrix_to_memory_target(
            gpu_idx, target_memory_percent
        )

        results = run_stress_test(
            gpu_idx, matrix_a, matrix_b, matrix_size, test_duration_seconds
        )
        results['name'] = gpu_name

        gpu_results[gpu_idx] = results

        print(f'\nGPU {gpu_idx} stress test completed!')
        print(f'  - Final matrix size: {results["matrix_size"]}x{results["matrix_size"]}')
        print(f'  - Peak memory usage: {results["final_mem_used"]} MB ({results["final_mem_percent"]:.1f}%)')
        print(f'  - Total iterations: {results["iteration_count"]}')
        print(f'  - Total time: {results["elapsed_time"]:.1f} seconds ({results["elapsed_time"]/60:.1f} minutes)')
        print(f'  - Avg time per operation: {results["elapsed_time"]/results["iteration_count"]:.4f} seconds')
        print(f'  - Temperature: {results["initial_temp"]}°C → {results["final_temp"]}°C '
              f'(Δ{results["final_temp"] - results["initial_temp"]:+d}°C)\n')

    print(f'{"="*60}')
    print(f'All {len(gpu_indices)} GPU(s) stress tested successfully with PyTorch!')
    print(f'{"="*60}')

    return gpu_results


def plot_temperature_results(gpu_results: dict):
    """Plot temperature over time for each GPU.

    Args:
        gpu_results: Dictionary of GPU test results from run_pytorch_gpu_load_test.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(8, 4))
    plt.title('GPU temperature over time during PyTorch stress test')

    for gpu_idx, results in gpu_results.items():
        temp_data = results['temp_data']
        times_minutes = [t / 60 for t in temp_data['times']]
        plt.plot(times_minutes, temp_data['temps'], label=f'GPU {gpu_idx}')

    plt.xlabel('Time (minutes)')
    plt.ylabel('Temperature (°C)')
    plt.legend()
    plt.tight_layout()
    plt.show()
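
For a quick sanity check before committing to the notebook's full 10-minute run, the module can also be driven standalone with a short duration and a conservative memory target. A minimal sketch, assuming the file is importable as functions.pytorch_gpu_load_test exactly as the notebook imports it:

    from functions.pytorch_gpu_load_test import (
        plot_temperature_results,
        run_pytorch_gpu_load_test,
    )

    # 30-second smoke test at a low memory target; scale up once it passes
    results = run_pytorch_gpu_load_test(target_memory_percent=25.0,
                                        test_duration_seconds=30)
    plot_temperature_results(results)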
