Merge pull request #167 from neon60/raven_filter

dkazanc · web-flow · commit d2f83630f2a7 · 2024-11-18T12:40:26.000Z
Raven filter
diff --git a/httomolibgpu/cuda_kernels/raven_filter.cu b/httomolibgpu/cuda_kernels/raven_filter.cu
@@ -0,0 +1,49 @@
+#include <cupy/complex.cuh>
+
+/**
+ * Apply the Raven filter to a Fourier-domain volume and undo the FFT shift.
+ *
+ * One thread handles one complex sample. Both buffers are addressed as
+ *   index = x + width * y + width * images * z
+ * so x varies fastest, then the image index, then z.
+ *
+ * Samples whose z row lies in [height/2 - v0, height/2 + v0] are multiplied
+ * by f * (1 + i), where f = 1 / (1 + ((x - width/2 + 1) / u0)^(2n)). All
+ * other samples are copied unchanged. Every sample is stored at its
+ * ifftshift-ed position along x and z, so `output` must not alias `input`
+ * (a thread writes a location that another thread reads).
+ *
+ * @param input   source samples (read only by this kernel)
+ * @param output  destination samples, same extent as `input`
+ * @param width   extent along x
+ * @param images  extent along y
+ * @param height  extent along z
+ * @param u0      divisor applied to the x offset in the filter profile
+ * @param n       half of the exponent used in the filter profile
+ * @param v0      half-height of the band of z rows that gets filtered
+ */
+template <typename Type>
+__global__ void
+raven_filter(
+  complex<Type> *input,
+  complex<Type> *output,
+  int width, int images, int height,
+  int u0, int n, int v0) {
+
+  const int px = threadIdx.x + blockIdx.x * blockDim.x;
+  const int py = threadIdx.y + blockIdx.y * blockDim.y;
+  const int pz = threadIdx.z + blockIdx.z * blockDim.z;
+
+  if (px >= width || py >= images || pz >= height)
+    return;
+
+  const int centerx = width / 2;
+  const int centerz = height / 2;
+
+  // Widen the strides BEFORE multiplying: `width * images` evaluated in
+  // 32-bit int can overflow for large volumes.
+  const long long row_stride = width;
+  const long long slice_stride = row_stride * images;
+
+  const long long index = px + row_stride * py + slice_stride * pz;
+
+  complex<Type> value = input[index];
+  if (pz >= (centerz - v0) && pz < (centerz + v0 + 1)) {
+
+    // +1 needed to match with CPU implementation
+    const Type base = Type(px - centerx + 1) / u0;
+
+    // base^(2n); starting from 1 makes n == 0 yield base^0 == 1 and gives
+    // bit-identical results to starting from `base` for n >= 1.
+    Type power = Type(1);
+    for (int i = 0; i < 2 * n; i++)
+      power *= base;
+
+    const Type filtered_value = Type(1) / (Type(1) + power);
+    // The same weight goes into the real and imaginary parts, i.e. the
+    // sample is multiplied by filtered_value * (1 + i).
+    value *= complex<Type>(filtered_value, filtered_value);
+  }
+
+  // ifftshifting positions
+  const int xshift = (width + 1) / 2;
+  const int zshift = (height + 1) / 2;
+  const int outX = (px + xshift) % width;
+  const int outZ = (pz + zshift) % height;
+
+  const long long outIndex = outX + row_stride * py + slice_stride * outZ;
+
+  output[outIndex] = value;
+}
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
@@ -30,17 +30,24 @@
 
 if cupy_run:
     from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
+    from cupyx.scipy.fft import fft2, ifft2, fftshift, ifftshift
+    from httomolibgpu.cuda_kernels import load_cuda_module
 else:
     median_filter = Mock()
     binary_dilation = Mock()
     uniform_filter1d = Mock()
+    fft2 = Mock()
+    ifft2 = Mock()
+    fftshift = Mock()
+    ifftshift = Mock()
 
 from typing import Union
 
 __all__ = [
     "remove_stripe_based_sorting",
     "remove_stripe_ti",
     "remove_all_stripe",
+    "raven_filter",
 ]
 
 
@@ -359,6 +366,94 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True):
         sinogram = _rs_large(sinogram, snr, size, matindex)
     return sinogram
 
+def raven_filter(
+        sinogram,
+        uvalue: int = 20,
+        nvalue: int = 4,
+        vvalue: int = 2,
+        pad_y: int = 20,
+        pad_x: int = 20,
+        pad_method: str = "edge"):
+    """
+    Applies raven filter to a 3D CuPy array. For more detailed information, see :ref:`method_raven_filter`.
+
+    The array is padded along axes 0 and 2, transformed with a 2D FFT over
+    those axes, filtered by the ``raven_filter`` CUDA kernel (which also
+    performs the inverse FFT shift), transformed back and cropped to the
+    original extent.
+
+    Parameters
+    ----------
+    sinogram : cp.ndarray
+        Input CuPy 3D array of float32 data type.
+
+    uvalue : int, optional
+        Passed to the kernel as ``u0``: the divisor of the horizontal
+        frequency offset in the filter profile.
+
+    nvalue : int, optional
+        Passed to the kernel as ``n``: the filter profile uses the
+        exponent ``2 * nvalue``.
+
+    vvalue : int, optional
+        Passed to the kernel as ``v0``: the filter is applied to
+        ``2 * vvalue + 1`` rows of the shifted spectrum.
+
+    pad_y : int, optional
+        Number of elements padded on both sides of axis 0.
+
+    pad_x : int, optional
+        Number of elements padded on both sides of axis 2.
+
+    pad_method : str, optional
+        Padding mode forwarded to ``cp.pad``.
+
+    Returns
+    -------
+    ndarray
+        Raven filtered 3D CuPy array in float32 data type.
+
+    Raises
+    ------
+    ValueError
+        If the input array is not of float32 data type, or if it is not
+        three dimensional.
+    """
+
+    if sinogram.dtype != cp.float32:
+        raise ValueError("The input data should be float32 data type")
+
+    if sinogram.ndim != 3:
+        raise ValueError("The input array must be a 3D array")
+
+    # Padding of the sinogram
+    sinogram = cp.pad(sinogram, ((pad_y, pad_y), (0, 0), (pad_x, pad_x)), mode=pad_method)
+
+    # FFT and shift of sinogram
+    fft_data = fft2(sinogram, axes=(0, 2), overwrite_x=True)
+    fft_data_shifted = fftshift(fft_data, axes=(0, 2))
+
+    # Calculation type
+    calc_type = fft_data_shifted.dtype
+
+    # Extents of the padded array, in the order the kernel expects them
+    height, images, width = sinogram.shape
+
+    # Select the kernel instantiation matching the complex precision
+    kernel_args = "raven_filter<{0}>".format(
+        "float" if calc_type == "complex64" else "double"
+    )
+
+    # setting grid/block parameters: one thread per sample, with the block
+    # extending along x only
+    block_x = 128
+    block_dims = (block_x, 1, 1)
+    grid_x = (width + block_x - 1) // block_x
+    grid_y = images
+    grid_z = height
+    grid_dims = (grid_x, grid_y, grid_z)
+
+    # The kernel reads the shifted spectrum and writes the filtered,
+    # un-shifted result into fft_data, which is reused as the output buffer.
+    params = (fft_data_shifted, fft_data, width, images, height, uvalue, nvalue, vvalue)
+
+    raven_module = load_cuda_module("raven_filter", name_expressions=[kernel_args])
+    raven_filt = raven_module.get_function(kernel_args)
+
+    raven_filt(grid_dims, block_dims, params)
+
+    # raven_filt already does the ifftshift, so no separate call is needed
+    sinogram = ifft2(fft_data, axes=(0, 2), overwrite_x=True)
+
+    # Removing padding and keeping the real part only
+    sinogram = sinogram[pad_y:height-pad_y, :, pad_x:width-pad_x].real
+
+    return sinogram
 
 def _create_matindex(nrow, ncol):
     """
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,7 +57,8 @@ dev = [
   "toml",
   "imageio",
   "h5py",
-  "pre-commit"
+  "pre-commit",
+  "pyfftw"
 ]
 
 
diff --git a/tests/test_prep/stripe_cpu_reference.py b/tests/test_prep/stripe_cpu_reference.py
@@ -0,0 +1,50 @@
+import numpy as np
+import pyfftw
+import pyfftw.interfaces.numpy_fft as fft
+
+def raven_filter_numpy(
+        sinogram,
+        uvalue: int = 20,
+        nvalue: int = 4,
+        vvalue: int = 2,
+        pad_y: int = 20,
+        pad_x: int = 20,
+        pad_method: str = "edge"):
+    """
+    CPU reference of the raven filter built on NumPy and pyFFTW.
+
+    Pads ``sinogram`` along axes 0 and 2, takes a 2D FFT over those axes,
+    multiplies a band of ``2 * vvalue + 1`` rows of the shifted spectrum by
+    the complex filter ``f + 1j * f`` and transforms back.
+
+    Returns the real part of the result with the padding removed.
+    """
+    # Make a padded copy
+    padded = np.pad(sinogram, ((pad_y, pad_y), (0, 0), (pad_x, pad_x)), pad_method)
+    height, images, width = padded.shape
+
+    # Rows of the shifted spectrum that receive the filter
+    centery = np.int16(np.ceil(height / 2.0) - 1)
+    first_row = centery - vvalue
+    last_row = centery + vvalue + 1
+
+    # One-dimensional filter profile along the width axis
+    centerx = np.ceil(width / 2.0) - 1.0
+    offsets = np.arange(width) - centerx
+    profile = 1.0 / (1.0 + np.power(offsets / uvalue, 2 * nvalue))
+
+    # Repeat the profile for every filtered row; same weight in the real
+    # and imaginary parts
+    band = np.tile(np.float64(profile), (last_row - first_row, 1))
+    band_complex = band + band * 1j
+
+    # Aligned work buffers and the forward/backward FFTW plans
+    shape = (height, images, width)
+    fwd_in = pyfftw.empty_aligned(shape, dtype='complex128', n=16)
+    fwd_out = pyfftw.empty_aligned(shape, dtype='complex128', n=16)
+    bwd_in = pyfftw.empty_aligned(shape, dtype='complex128', n=16)
+    bwd_out = pyfftw.empty_aligned(shape, dtype='complex128', n=16)
+    forward = pyfftw.FFTW(fwd_in, fwd_out, axes=(0, 2))
+    backward = pyfftw.FFTW(bwd_in, bwd_out, axes=(0, 2), direction='FFTW_BACKWARD')
+
+    spectrum = fft.fftshift(forward(padded), axes=(0, 2))
+    # Broadcast the (rows, width) filter over the image axis
+    spectrum[first_row:last_row] *= band_complex[:, np.newaxis, :]
+    restored = backward(fft.ifftshift(spectrum, axes=(0, 2)))
+
+    cropped = restored[pad_y:height-pad_y, :, pad_x:width-pad_x]
+    return cropped.real
diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py
@@ -3,14 +3,16 @@
 from cupy.cuda import nvtx
 import numpy as np
 import pytest
+
 from httomolibgpu.prep.normalize import normalize
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
     remove_all_stripe,
+    raven_filter,
 )
 from numpy.testing import assert_allclose
-
+from .stripe_cpu_reference import raven_filter_numpy
 
 def test_remove_stripe_ti_on_data(data, flats, darks):
     # --- testing the CuPy implementation from TomoCupy ---#
@@ -51,7 +53,6 @@ def test_remove_stripe_ti_on_data(data, flats, darks):
 #         np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6
 #     )
 
-
 def test_stripe_removal_sorting_cupy(data, flats, darks):
     # --- testing the CuPy port of TomoPy's implementation ---#
     data = normalize(data, flats, darks, cutoff=10, minus_log=True)
@@ -66,6 +67,43 @@ def test_stripe_removal_sorting_cupy(data, flats, darks):
     assert corrected_data.dtype == np.float32
     assert corrected_data.flags.c_contiguous
 
+def test_stripe_raven_cupy(data, flats, darks):
+    """Compare the GPU raven filter with the CPU reference on fixture data."""
+    normalised = normalize(data, flats, darks, cutoff=10, minus_log=True)
+
+    gpu_result = raven_filter(cp.copy(normalised)).get()
+    cpu_result = raven_filter_numpy(cp.copy(normalised).get())
+
+    assert_allclose(cpu_result, gpu_result, rtol=0, atol=4e-01)
+
+    normalised = None  #: free up GPU memory
+    # the GPU output must be float32 and have the reference's shape
+    assert gpu_result.dtype == np.float32
+    assert gpu_result.shape == cpu_result.shape
+
+@pytest.mark.parametrize("uvalue", [20, 50, 100])
+@pytest.mark.parametrize("nvalue", [2, 4, 6])
+@pytest.mark.parametrize("vvalue", [2, 4])
+@pytest.mark.parametrize("pad_x", [0, 10, 20])
+@pytest.mark.parametrize("pad_y", [0, 10, 20])
+@cp.testing.numpy_cupy_allclose(rtol=0, atol=3e-01)
+def test_stripe_raven_parameters_cupy(ensure_clean_memory, xp, uvalue, nvalue, vvalue, pad_x, pad_y):
+    """Run the CPU reference (xp is numpy) or the GPU filter (otherwise) on
+    identical seeded data; the decorator compares the two returned arrays."""
+    # seed explicitly and generate with numpy only, so both runs see the same data
+    np.random.seed(12345)
+    host_data = np.random.random_sample(size=(256, 5, 512)).astype(np.float32) * 2.0 + 0.001
+    data = xp.asarray(host_data)
+
+    filter_kwargs = dict(
+        uvalue=uvalue, nvalue=nvalue, vvalue=vvalue, pad_x=pad_x, pad_y=pad_y
+    )
+
+    if xp.__name__ != "numpy":
+        results = raven_filter(data, **filter_kwargs).get()
+    else:
+        results = raven_filter_numpy(data, **filter_kwargs).astype(np.float32)
+
+    return xp.asarray(results)
 
 @pytest.mark.perf
 def test_stripe_removal_sorting_cupy_performance(ensure_clean_memory):
@@ -116,6 +154,29 @@ def test_remove_stripe_ti_performance(ensure_clean_memory):
 
     assert "performance in ms" == duration_ms
 
+@pytest.mark.perf
+def test_raven_filter_performance(ensure_clean_memory):
+    """Time ten calls of raven_filter on random float32 data and report the
+    mean duration per call in milliseconds."""
+    data_host = (
+        np.random.random_sample(size=(1801, 5, 2560)).astype(np.float32) * 2.0 + 0.001
+    )
+    data = cp.asarray(data_host, dtype=np.float32)
+
+    # do a cold run first
+    raven_filter(cp.copy(data))
+
+    # wait for the cold run to finish before the clock starts
+    dev = cp.cuda.Device()
+    dev.synchronize()
+
+    start = time.perf_counter_ns()
+    nvtx.RangePush("Core")
+    for _ in range(10):
+        # have to take copy, as data is modified in-place
+        raven_filter(cp.copy(data))
+    nvtx.RangePop()
+    # wait for all queued GPU work so the full execution time is measured
+    dev.synchronize()
+    # nanoseconds -> milliseconds, averaged over the 10 timed calls
+    duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10
+
+    # A str never equals a float, so this assertion always fails; presumably
+    # that is intentional so the measured duration appears in pytest's output.
+    assert "performance in ms" == duration_ms
 
 def test_remove_all_stripe_on_data(data, flats, darks):
     # --- testing the CuPy implementation from TomoCupy ---#

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,8 @@ dev = [`
`57`	`57`	`"toml",`
`58`	`58`	`"imageio",`
`59`	`59`	`"h5py",`
`60`		`- "pre-commit"`
	`60`	`+ "pre-commit",`
	`61`	`+ "pyfftw"`
`61`	`62`	`]`
`62`	`63`
`63`	`64`