Fix and test remove_stripe_fw memory estimator for large sizes

mfep · mfep · commit b2f4033378c7 · 2025-11-27T16:38:45.000+01:00
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
@@ -335,18 +335,20 @@ def _conv_transpose2d(
     if mem_stack:
         tmp_weighted_shape = (b, co, ho, wo)
         # The trouble here is that we allocate more than the returned size
-        out_actual_bytes = np.prod(out_shape) * np.float32().itemsize
-        mem_stack.malloc(out_actual_bytes)
+        mem_stack.malloc(np.prod(out_shape) * np.float32().itemsize)
         mem_stack.malloc((np.prod(tmp_weighted_shape) + w.size) * np.float32().itemsize)
         mem_stack.free((np.prod(tmp_weighted_shape) + w.size) * np.float32().itemsize)
         if pad != 0:
-            return [
+            new_out_shape = [
                 out_shape[0],
                 out_shape[1],
                 out_shape[2] - 2 * pad[0],
                 out_shape[3] - 2 * pad[1],
-            ], out_actual_bytes
-        return out_shape, out_actual_bytes
+            ]
+            mem_stack.malloc(np.prod(new_out_shape) * np.float32().itemsize)
+            mem_stack.free(np.prod(out_shape) * np.float32().itemsize)
+            out_shape = new_out_shape
+        return out_shape
 
     out = cp.zeros(out_shape, dtype="float32")
     w = cp.asarray(w)
@@ -365,7 +367,7 @@ def _conv_transpose2d(
                 )
     if pad != 0:
         out = out[:, :, pad[0] : out.shape[2] - pad[0], pad[1] : out.shape[3] - pad[1]]
-    return out, None
+    return cp.ascontiguousarray(out)
 
 
 def _afb1d(
@@ -434,17 +436,17 @@ def _sfb1d(
     g0 = np.concatenate([g0.reshape(*shape)] * C, axis=0)
     g1 = np.concatenate([g1.reshape(*shape)] * C, axis=0)
     pad = (L - 2, 0) if d == 2 else (0, L - 2)
-    y_lo, y_lo_alloc_bytes = _conv_transpose2d(
+    y_lo = _conv_transpose2d(
         lo, g0, stride=s, pad=pad, groups=C, mem_stack=mem_stack
     )
-    y_hi, y_hi_alloc_bytes = _conv_transpose2d(
+    y_hi = _conv_transpose2d(
         hi, g1, stride=s, pad=pad, groups=C, mem_stack=mem_stack
     )
     if mem_stack:
         # Allocation of the sum
         mem_stack.malloc(np.prod(y_hi) * np.float32().itemsize)
-        mem_stack.free(y_lo_alloc_bytes)
-        mem_stack.free(y_hi_alloc_bytes)
+        mem_stack.free(np.prod(y_lo) * np.float32().itemsize)
+        mem_stack.free(np.prod(y_hi) * np.float32().itemsize)
         return y_lo
     return y_lo + y_hi
 
@@ -635,7 +637,10 @@ def remove_stripe_fw(
             # For the FFT
             mem_stack.malloc(2 * fcV_bytes)
             # This is "leaked" by the FFT
-            fcV_fft_bytes = fcV_shape[0] * fcV_shape[2] * np.complex64().itemsize
+            if fcV_shape[1] > 150:
+                fcV_fft_bytes = fcV_bytes
+            else:
+                fcV_fft_bytes = fcV_shape[0] * fcV_shape[2] * np.complex64().itemsize
             mem_stack.malloc(fcV_fft_bytes)
             mem_stack.free(2 * fcV_bytes)
 
diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py
@@ -56,7 +56,7 @@ def test_remove_stripe_fw_on_data(data, flats, darks):
     # --- testing the CuPy implementation from TomoCupy ---#
     data = normalize(data, flats, darks, cutoff=10, minus_log=True)
 
-    data_after_stripe_removal = remove_stripe_fw(cp.copy(data)).get()
+    data_after_stripe_removal = remove_stripe_fw(cp.copy(data), level=7).get()
 
     assert_allclose(np.mean(data_after_stripe_removal), 0.279236, rtol=1e-05)
     assert_allclose(
@@ -104,7 +104,31 @@ def test_remove_stripe_fw_calc_mem(slices, level, dim_x, wname, ensure_clean_mem
     assert hook.max_mem == 0
 
     assert actual_mem_peak * 0.99 <= estimated_mem_peak
-    assert estimated_mem_peak <= actual_mem_peak * 1.01
+    assert estimated_mem_peak <= actual_mem_peak * 1.2
+
+
+@pytest.mark.parametrize("wname", ['db4', 'sym16'])
+@pytest.mark.parametrize("slices", [177, 239, 320, 490, 607, 803, 859, 902, 951, 1019, 1074, 1105])
+def test_remove_stripe_fw_calc_mem_big(wname, slices, ensure_clean_memory):
+    dim_y = 901
+    dim_x = 1200
+    data_shape = (slices, dim_x, dim_y)
+    hook = MaxMemoryHook()
+    with hook:
+        estimated_mem_peak = remove_stripe_fw(data_shape, wname=wname, calc_peak_gpu_mem=True)
+    assert hook.max_mem == 0
+    av_mem = cp.cuda.Device().mem_info[0]
+    if av_mem < estimated_mem_peak * 1.1:
+        pytest.skip("Not enough GPU memory to run this test")
+
+    hook = MaxMemoryHook()
+    with hook:
+        data = cp.random.random_sample(data_shape, dtype=np.float32)
+        remove_stripe_fw(data, wname=wname)
+    actual_mem_peak = hook.max_mem
+
+    assert actual_mem_peak * 0.99 <= estimated_mem_peak
+    assert estimated_mem_peak <= actual_mem_peak * 1.2
 
 
 @pytest.mark.parametrize("angles", [180, 181])