Skip to content

Commit b2f4033

Browse files
committed
Fix and test remove_stripe_fw memory estimator for large sizes
1 parent 4d5df93 commit b2f4033

File tree

2 files changed: 42 additions (+42) and 13 deletions (-13)

httomolibgpu/prep/stripe.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -335,18 +335,20 @@ def _conv_transpose2d(
335335
if mem_stack:
336336
tmp_weighted_shape = (b, co, ho, wo)
337337
# The trouble here is that we allocate more than the returned size
338-
out_actual_bytes = np.prod(out_shape) * np.float32().itemsize
339-
mem_stack.malloc(out_actual_bytes)
338+
mem_stack.malloc(np.prod(out_shape) * np.float32().itemsize)
340339
mem_stack.malloc((np.prod(tmp_weighted_shape) + w.size) * np.float32().itemsize)
341340
mem_stack.free((np.prod(tmp_weighted_shape) + w.size) * np.float32().itemsize)
342341
if pad != 0:
343-
return [
342+
new_out_shape = [
344343
out_shape[0],
345344
out_shape[1],
346345
out_shape[2] - 2 * pad[0],
347346
out_shape[3] - 2 * pad[1],
348-
], out_actual_bytes
349-
return out_shape, out_actual_bytes
347+
]
348+
mem_stack.malloc(np.prod(new_out_shape) * np.float32().itemsize)
349+
mem_stack.free(np.prod(out_shape) * np.float32().itemsize)
350+
out_shape = new_out_shape
351+
return out_shape
350352

351353
out = cp.zeros(out_shape, dtype="float32")
352354
w = cp.asarray(w)
@@ -365,7 +367,7 @@ def _conv_transpose2d(
365367
)
366368
if pad != 0:
367369
out = out[:, :, pad[0] : out.shape[2] - pad[0], pad[1] : out.shape[3] - pad[1]]
368-
return out, None
370+
return cp.ascontiguousarray(out)
369371

370372

371373
def _afb1d(
@@ -434,17 +436,17 @@ def _sfb1d(
434436
g0 = np.concatenate([g0.reshape(*shape)] * C, axis=0)
435437
g1 = np.concatenate([g1.reshape(*shape)] * C, axis=0)
436438
pad = (L - 2, 0) if d == 2 else (0, L - 2)
437-
y_lo, y_lo_alloc_bytes = _conv_transpose2d(
439+
y_lo = _conv_transpose2d(
438440
lo, g0, stride=s, pad=pad, groups=C, mem_stack=mem_stack
439441
)
440-
y_hi, y_hi_alloc_bytes = _conv_transpose2d(
442+
y_hi = _conv_transpose2d(
441443
hi, g1, stride=s, pad=pad, groups=C, mem_stack=mem_stack
442444
)
443445
if mem_stack:
444446
# Allocation of the sum
445447
mem_stack.malloc(np.prod(y_hi) * np.float32().itemsize)
446-
mem_stack.free(y_lo_alloc_bytes)
447-
mem_stack.free(y_hi_alloc_bytes)
448+
mem_stack.free(np.prod(y_lo) * np.float32().itemsize)
449+
mem_stack.free(np.prod(y_hi) * np.float32().itemsize)
448450
return y_lo
449451
return y_lo + y_hi
450452

@@ -635,7 +637,10 @@ def remove_stripe_fw(
635637
# For the FFT
636638
mem_stack.malloc(2 * fcV_bytes)
637639
# This is "leaked" by the FFT
638-
fcV_fft_bytes = fcV_shape[0] * fcV_shape[2] * np.complex64().itemsize
640+
if fcV_shape[1] > 150:
641+
fcV_fft_bytes = fcV_bytes
642+
else:
643+
fcV_fft_bytes = fcV_shape[0] * fcV_shape[2] * np.complex64().itemsize
639644
mem_stack.malloc(fcV_fft_bytes)
640645
mem_stack.free(2 * fcV_bytes)
641646

tests/test_prep/test_stripe.py

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ def test_remove_stripe_fw_on_data(data, flats, darks):
5656
# --- testing the CuPy implementation from TomoCupy ---#
5757
data = normalize(data, flats, darks, cutoff=10, minus_log=True)
5858

59-
data_after_stripe_removal = remove_stripe_fw(cp.copy(data)).get()
59+
data_after_stripe_removal = remove_stripe_fw(cp.copy(data), level=7).get()
6060

6161
assert_allclose(np.mean(data_after_stripe_removal), 0.279236, rtol=1e-05)
6262
assert_allclose(
@@ -104,7 +104,31 @@ def test_remove_stripe_fw_calc_mem(slices, level, dim_x, wname, ensure_clean_mem
104104
assert hook.max_mem == 0
105105

106106
assert actual_mem_peak * 0.99 <= estimated_mem_peak
107-
assert estimated_mem_peak <= actual_mem_peak * 1.01
107+
assert estimated_mem_peak <= actual_mem_peak * 1.2
108+
109+
110+
@pytest.mark.parametrize("wname", ['db4', 'sym16'])
111+
@pytest.mark.parametrize("slices", [177, 239, 320, 490, 607, 803, 859, 902, 951, 1019, 1074, 1105])
112+
def test_remove_stripe_fw_calc_mem_big(wname, slices, ensure_clean_memory):
113+
dim_y = 901
114+
dim_x = 1200
115+
data_shape = (slices, dim_x, dim_y)
116+
hook = MaxMemoryHook()
117+
with hook:
118+
estimated_mem_peak = remove_stripe_fw(data_shape, wname=wname, calc_peak_gpu_mem=True)
119+
assert hook.max_mem == 0
120+
av_mem = cp.cuda.Device().mem_info[0]
121+
if av_mem < estimated_mem_peak * 1.1:
122+
pytest.skip("Not enough GPU memory to run this test")
123+
124+
hook = MaxMemoryHook()
125+
with hook:
126+
data = cp.random.random_sample(data_shape, dtype=np.float32)
127+
remove_stripe_fw(data, wname=wname)
128+
actual_mem_peak = hook.max_mem
129+
130+
assert actual_mem_peak * 0.99 <= estimated_mem_peak
131+
assert estimated_mem_peak <= actual_mem_peak * 1.2
108132

109133

110134
@pytest.mark.parametrize("angles", [180, 181])

0 commit comments

Comments
 (0)