Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,15 @@ recon:
memory_gpu:
multiplier: None
method: module
LPRec3d_tomobar:
pattern: sinogram
output_dims_change: True
implementation: gpu_cupy
save_result_default: True
padding: False
memory_gpu:
multiplier: None
method: module
SIRT3d_tomobar:
pattern: sinogram
output_dims_change: True
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,16 @@
import math
from typing import Tuple
import numpy as np
from httomo_backends.cufft import CufftType, cufft_estimate_1d
from httomo_backends.cufft import CufftType, cufft_estimate_1d, cufft_estimate_2d

__all__ = [
"_calc_memory_bytes_FBP3d_tomobar",
"_calc_memory_bytes_LPRec3d_tomobar",
"_calc_memory_bytes_SIRT3d_tomobar",
"_calc_memory_bytes_CGLS3d_tomobar",
"_calc_output_dim_FBP2d_astra",
"_calc_output_dim_FBP3d_tomobar",
"_calc_output_dim_LPRec3d_tomobar",
"_calc_output_dim_SIRT3d_tomobar",
"_calc_output_dim_CGLS3d_tomobar",
]
Expand Down Expand Up @@ -58,6 +60,10 @@ def _calc_output_dim_FBP3d_tomobar(non_slice_dims_shape, **kwargs):
return __calc_output_dim_recon(non_slice_dims_shape, **kwargs)


def _calc_output_dim_LPRec3d_tomobar(non_slice_dims_shape, **kwargs):
return __calc_output_dim_recon(non_slice_dims_shape, **kwargs)


def _calc_output_dim_SIRT3d_tomobar(non_slice_dims_shape, **kwargs):
return __calc_output_dim_recon(non_slice_dims_shape, **kwargs)

Expand Down Expand Up @@ -153,6 +159,112 @@ def _calc_memory_bytes_FBP3d_tomobar(
return (tot_memory_bytes, fixed_amount)



def _calc_memory_bytes_LPRec3d_tomobar(
non_slice_dims_shape: Tuple[int, int],
dtype: np.dtype,
**kwargs,
) -> Tuple[int, int]:
# Based on: https://github.com/dkazanc/ToMoBAR/pull/112/commits/4704ecdc6ded3dd5ec0583c2008aa104f30a8a39

angles_tot = non_slice_dims_shape[0]
DetectorsLengthH = non_slice_dims_shape[1]
SLICES = 200 # dummy multiplier+divisor to pass large batch size threshold

n = DetectorsLengthH

odd_horiz = False
if (n % 2) != 0:
n = n + 1 # dealing with the odd horizontal detector size
odd_horiz = True

eps = 1e-4 # accuracy of usfft
mu = -np.log(eps) / (2 * n * n)
m = int(np.ceil(2 * n * 1 / np.pi * np.sqrt(-mu * np.log(eps) + (mu * n) * (mu * n) / 4)))

center_size = 6144
center_size = min(center_size, n * 2 + m * 2)

oversampling_level = 2 # at least 2 or larger required
ne = oversampling_level * n
padding_m = ne // 2 - n // 2

if "angles" in kwargs:
angles = kwargs["angles"]
sorted_theta_cpu = np.sort(angles)
theta_full_range = abs(sorted_theta_cpu[angles_tot-1] - sorted_theta_cpu[0])
angle_range_pi_count = 1 + int(np.ceil(theta_full_range / math.pi))
else:
angle_range_pi_count = 1 + int(np.ceil(2)) # assume a 2 * PI projection angle range

output_dims = __calc_output_dim_recon(non_slice_dims_shape, **kwargs)
if odd_horiz:
output_dims = tuple(x + 1 for x in output_dims)

in_slice_size = np.prod(non_slice_dims_shape) * dtype.itemsize
padded_in_slice_size = np.prod(non_slice_dims_shape) * np.float32().itemsize
theta_size = angles_tot * np.float32().itemsize
sorted_theta_indices_size = angles_tot * np.int64().itemsize
sorted_theta_size = angles_tot * np.float32().itemsize
recon_output_size = (n + 1) * (n + 1) * np.float32().itemsize if odd_horiz else n * n * np.float32().itemsize # 264
linspace_size = n * np.float32().itemsize
meshgrid_size = 2 * n * n * np.float32().itemsize
phi_size = 6 * n * n * np.float32().itemsize
angle_range_size = center_size * center_size * 1 + angle_range_pi_count * 2 * np.int32().itemsize
c1dfftshift_size = n * np.int8().itemsize
c2dfftshift_slice_size = 4 * n * n * np.int8().itemsize
filter_size = (n // 2 + 1) * np.float32().itemsize
rfftfreq_size = filter_size
scaled_filter_size = filter_size
tmp_p_input_slice = np.prod(non_slice_dims_shape) * np.float32().itemsize
padded_tmp_p_input_slice = angles_tot * (n + padding_m * 2) * dtype.itemsize
rfft_result_size = padded_tmp_p_input_slice
filtered_rfft_result_size = rfft_result_size
rfft_plan_slice_size = cufft_estimate_1d(nx=(n + padding_m * 2),fft_type=CufftType.CUFFT_R2C,batch=angles_tot * SLICES) / SLICES
irfft_result_size = filtered_rfft_result_size
irfft_scratch_memory_size = filtered_rfft_result_size
irfft_plan_slice_size = cufft_estimate_1d(nx=(n + padding_m * 2),fft_type=CufftType.CUFFT_C2R,batch=angles_tot * SLICES) / SLICES
conversion_to_complex_size = np.prod(non_slice_dims_shape) * np.complex64().itemsize / 2
datac_size = np.prod(non_slice_dims_shape) * np.complex64().itemsize / 2
fde_size = (2 * m + 2 * n) * (2 * m + 2 * n) * np.complex64().itemsize / 2
shifted_datac_size = datac_size
fft_result_size = datac_size
backshifted_datac_size = datac_size
scaled_backshifted_datac_size = datac_size
fft_plan_slice_size = cufft_estimate_1d(nx=n,fft_type=CufftType.CUFFT_C2C,batch=angles_tot * SLICES) / SLICES
fde_view_size = 4 * n * n * np.complex64().itemsize / 2
shifted_fde_view_size = fde_view_size
ifft2_scratch_memory_size = fde_view_size
ifft2_plan_slice_size = cufft_estimate_2d(nx=(2 * n),ny=(2 * n),fft_type=CufftType.CUFFT_C2C) / 2
fde2_size = n * n * np.complex64().itemsize / 2
concatenate_size = fde2_size
circular_mask_size = np.prod(output_dims) / 2 * np.int64().itemsize * 4

after_recon_swapaxis_slice = np.prod(non_slice_dims_shape) * np.float32().itemsize

tot_memory_bytes = int(
max(
in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + tmp_p_input_slice + padded_tmp_p_input_slice + rfft_result_size + filtered_rfft_result_size + irfft_result_size + irfft_scratch_memory_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + tmp_p_input_slice + datac_size + conversion_to_complex_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + fde_size + datac_size + shifted_datac_size + fft_result_size + backshifted_datac_size + scaled_backshifted_datac_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + shifted_fde_view_size + ifft2_scratch_memory_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + fde2_size + concatenate_size
, in_slice_size + padded_in_slice_size + recon_output_size + rfft_plan_slice_size + irfft_plan_slice_size + fft_plan_slice_size + ifft2_plan_slice_size + after_recon_swapaxis_slice
)
)

fixed_amount = int(
max(
theta_size + phi_size + linspace_size + meshgrid_size
, theta_size + sorted_theta_indices_size + sorted_theta_size + phi_size + angle_range_size + c1dfftshift_size + c2dfftshift_slice_size + filter_size + rfftfreq_size + scaled_filter_size
, theta_size + sorted_theta_indices_size + sorted_theta_size + phi_size + circular_mask_size
)
)

return (tot_memory_bytes * 1.1, fixed_amount)



def _calc_memory_bytes_SIRT3d_tomobar(
non_slice_dims_shape: Tuple[int, int],
dtype: np.dtype,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from os import path
import sys
import traceback

from cupy.cuda import memory_hook


class PeakMemoryLineProfileHook(memory_hook.MemoryHook):
name = 'LineProfileHook'

def __init__(self, running_peak_root_file_names, max_depth=0):
self._memory_frames = {}
self._root = MemoryFrame(None, None)
self._filename = path.abspath(__file__)
self._max_depth = max_depth
self.running_peak_root_file_names = running_peak_root_file_names

# callback
def malloc_preprocess(self, device_id, size, mem_size):
self._create_frame_tree(used_bytes=mem_size)

# callback
def alloc_preprocess(self, device_id, mem_size):
self._create_frame_tree(acquired_bytes=mem_size)

def free_preprocess(self, **kwargs):
mem_size = kwargs.get("mem_size")
if mem_size is not None:
self._create_frame_tree(freed_bytes=mem_size)

def _create_frame_tree(self, used_bytes=0, acquired_bytes=0, freed_bytes=0):
self._root.used_bytes += used_bytes
self._root.acquired_bytes += acquired_bytes
self._root.freed_bytes += freed_bytes
parent = self._root
for depth, stackframe in enumerate(self._extract_stackframes()):
if 0 < self._max_depth <= depth + 1:
break
memory_frame = self._add_frame(parent, stackframe)
memory_frame.used_bytes += used_bytes
memory_frame.acquired_bytes += acquired_bytes
memory_frame.freed_bytes += freed_bytes
parent = memory_frame

def _extract_stackframes(self):
stackframes = traceback.extract_stack()
stackframes = [StackFrame(st) for st in stackframes]
stackframes = [
st for st in stackframes if st.filename != self._filename]
return stackframes

def _key_frame(self, parent, stackframe):
return (parent,
stackframe.filename,
stackframe.lineno,
stackframe.name)

def _add_frame(self, parent, stackframe):
key = self._key_frame(parent, stackframe)
if key in self._memory_frames:
memory_frame = self._memory_frames[key]
else:
memory_frame = MemoryFrame(parent, stackframe)
self._memory_frames[key] = memory_frame
return memory_frame

def print_report(self, file=sys.stdout):
"""Prints a report of line memory profiling."""
line = '_root (%s, %s, %s)\n' % self._root.humanized_bytes()
file.write(line)

running_peak_bytes = [0]
running_used_bytes = [0]
for child in self._root.children:
self._print_frame(child, running_peak_bytes, running_used_bytes, depth=1, file=file)
file.flush()

def _print_frame(self, memory_frame, running_peak_bytes, running_used_bytes, depth=0, file=sys.stdout):
indent = ' ' * (depth * 2)
st = memory_frame.stackframe
used_bytes, acquired_bytes, freed_bytes = memory_frame.humanized_bytes()

humanized_running_peak_bytes = None
humanized_running_used_bytes = None
if path.basename(st.filename) in self.running_peak_root_file_names:
running_used_bytes[0] += memory_frame.used_bytes
running_peak_bytes[0] = max(running_peak_bytes[0], running_used_bytes[0])
running_used_bytes[0] -= memory_frame.freed_bytes

humanized_running_peak_bytes = MemoryFrame.humanized_size(running_peak_bytes[0])
humanized_running_used_bytes = MemoryFrame.humanized_size(running_used_bytes[0])

line = '%s%s:%s:%s (%s, %s, %s, %s, %s)\n' % (
indent, st.filename, st.lineno, st.name,
used_bytes, acquired_bytes, freed_bytes, humanized_running_peak_bytes, humanized_running_used_bytes)
file.write(line)
for child in memory_frame.children:
self._print_frame(child, running_peak_bytes, running_used_bytes, depth=depth + 1, file=file)


class StackFrame(object):
"""Compatibility layer for outputs of traceback.extract_stack().

Attributes:
filename (string): filename
lineno (int): line number
name (string): function name
"""

def __init__(self, obj):
if isinstance(obj, tuple): # < 3.5
self.filename = obj[0]
self.lineno = obj[1]
self.name = obj[2]
else: # >= 3.5 FrameSummary
self.filename = obj.filename
self.lineno = obj.lineno
self.name = obj.name


class MemoryFrame(object):
"""A single stack frame along with sum of memory usage at the frame.

Attributes:
stackframe (FrameSummary): stackframe from traceback.extract_stack().
parent (MemoryFrame): parent frame, that is, caller.
children (list of MemoryFrame): child frames, that is, callees.
used_bytes (int): memory bytes that users used from CuPy memory pool.
acquired_bytes (int): memory bytes that CuPy memory pool acquired
freed_bytes (int): memory bytes that were released to the CuPy memory pool
from GPU device.
"""

def __init__(self, parent, stackframe):
self.stackframe = stackframe
self.children = []
self._set_parent(parent)
self.used_bytes = 0
self.acquired_bytes = 0
self.freed_bytes = 0

def humanized_bytes(self):
used_bytes = MemoryFrame.humanized_size(self.used_bytes)
acquired_bytes = MemoryFrame.humanized_size(self.acquired_bytes)
freed_bytes = MemoryFrame.humanized_size(self.freed_bytes)
return (used_bytes, acquired_bytes, freed_bytes)

@staticmethod
def humanized_size(size):
for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E']:
if size < 1024.0:
return '%3.2f%sB' % (size, unit)
size /= 1024.0
return '%.2f%sB' % (size, 'Z')

def _set_parent(self, parent):
if parent and parent not in parent.children:
self.parent = parent
parent.children.append(self)
30 changes: 30 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,36 @@ def ensure_clean_memory():
cache.clear()


def pytest_configure(config):
config.addinivalue_line(
"markers", "full: mark tests to run more GPU-memory consuming tests"
)


def pytest_addoption(parser):
parser.addoption(
"--full",
action="store_true",
default=False,
help="run more GPU memory hungry tests",
)


def pytest_collection_modifyitems(config, items):
if config.getoption("--full"):
skip_other = pytest.mark.skip(reason="not a GPU hungry test")
for item in items:
if "full" not in item.keywords:
item.add_marker(skip_other)
else:
skip_perf = pytest.mark.skip(
reason="this GPU memory hungry test, use '--full' to run"
)
for item in items:
if "full" in item.keywords:
item.add_marker(skip_perf)


@pytest.fixture(scope="session")
def test_data_path():
return CUR_DIR / "test_data"
Expand Down
Loading