Skip to content

Commit eef8a2e

Browse files
JulioAPerazatsalo
andauthored
Use sparse array in ALE, ALESubtraction, SCALE, KDA, and MKDADensity (#725)
* Use 4D array in ALE * Use 4D sparse array * Add support to sparse array in CBMAEstimator._fit() * calculate histogram on sparse array * Update nimare/meta/cbma/ale.py Co-authored-by: Taylor Salo <tsalo006@fiu.edu> * Update nimare/meta/cbma/ale.py Co-authored-by: Taylor Salo <tsalo006@fiu.edu> * Use sparse in KDA * Use sparse in MKDADensity * Suport sparse in generate.create_coordinate_dataset * Add support for sparse when `null_method="reduced_montecarlo"` * Fix issue with tests * get sample_sizes only if self.fwhm is None * build sparse for coordinates inside mask_data * pass image object to compute_ale_ma in generate * Add support for np.ndarray back again * Mask coordinates before creating the sparse array * Fix sparse support in KDADensity * modify unique_rows to handle mask when sum_overlap in KDA * save both histogram_bins and histogram_means * fix style issues * consider `value` when creating `counts` * Support sparse in _run_fwe_permutation for MKDAChi2 * keep unique_rows as general as possible * Update docstring of the modified functions * Drop support of np.ndarray on some functions * Make return_type="sparse" default in _collect_ma_maps * Use return_type="array" in SCALE until sparse is supported * Use sparse in SCALE * Use sparse in ALESubtraction * Fix typo * Small update: replace return_type="array" by "sparse" * Set n_mask_voxels as a private parameter Co-authored-by: Taylor Salo <tsalo006@fiu.edu> * Apply @tsalo and @jdkent review * Move the check of mask type to _preprocess_input * Return sparse if pre-generated MA maps are found * Drop support for np.ndarray in ALE * Remove unnecessary `gc.collect()` lines * Update mkda.py * Remove gc import * Drop support for np.ndarray in mkda * Move the check of the masker to _fit * Update mkda.py * Drop support for list in mkda * Drop support for list in ale * Update ale.py * Use memmap in SCALE * Remove map reuse test in ALE for memory limit parameter * Add versionchanged directives * Update utils.py * Remove line from docstring when sparse is the only option * Add test for estimator with non-NiftiMasker * Add histogram_means description to docstring * Update nimare/meta/utils.py Co-authored-by: Taylor Salo <tsalo006@fiu.edu> Co-authored-by: Taylor Salo <tsalo006@fiu.edu>
1 parent ac57801 commit eef8a2e

9 files changed

Lines changed: 482 additions & 221 deletions

File tree

nimare/generate.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from itertools import zip_longest
33

44
import numpy as np
5+
import sparse
56

67
from nimare.dataset import Dataset
78
from nimare.io import convert_neurovault_to_dataset
@@ -266,20 +267,22 @@ def _create_foci(foci, foci_percentage, fwhm, n_studies, n_noise_foci, rng, spac
266267
# create a probability map for each peak
267268
kernel = get_ale_kernel(template_img, fwhm)[1]
268269
foci_prob_maps = {
269-
tuple(peak): compute_ale_ma(template_data.shape, np.atleast_2d(peak), kernel)
270+
tuple(peak): compute_ale_ma(template_img, np.atleast_2d(peak), kernel=kernel).reshape(
271+
template_data.shape
272+
)
270273
for peak in ground_truth_foci_ijks
271274
if peak.size
272275
}
273276

274277
# get study specific instances of each foci
275278
signal_studies = int(round(foci_percentage * n_studies))
276279
signal_ijks = {
277-
peak: np.argwhere(prob_map)[
280+
peak: sparse.argwhere(prob_map)[
278281
rng.choice(
279-
np.argwhere(prob_map).shape[0],
282+
sparse.argwhere(prob_map).shape[0],
280283
size=signal_studies,
281284
replace=True,
282-
p=prob_map[np.nonzero(prob_map)] / sum(prob_map[np.nonzero(prob_map)]),
285+
p=(prob_map[prob_map.nonzero()] / sum(prob_map[prob_map.nonzero()])).todense(),
283286
)
284287
]
285288
for peak, prob_map in foci_prob_maps.items()

nimare/meta/cbma/ale.py

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import numpy as np
55
import pandas as pd
6+
import sparse
67
from joblib import Parallel, delayed
78
from tqdm.auto import tqdm
89

@@ -33,6 +34,10 @@
3334
class ALE(CBMAEstimator):
3435
"""Activation likelihood estimation.
3536
37+
.. versionchanged:: 0.0.12
38+
39+
- Use a 4D sparse array for modeled activation maps.
40+
3641
Parameters
3742
----------
3843
kernel_transformer : :obj:`~nimare.meta.kernel.KernelTransformer`, optional
@@ -151,32 +156,43 @@ def __init__(
151156

152157
def _compute_summarystat_est(self, ma_values):
153158
stat_values = 1.0 - np.prod(1.0 - ma_values, axis=0)
159+
160+
# np.array type is used by _determine_histogram_bins to calculate max_poss_ale
161+
if isinstance(stat_values, sparse._coo.core.COO):
162+
# NOTE: This may not work correctly with a non-NiftiMasker.
163+
mask_data = self.masker.mask_img.get_fdata().astype(bool)
164+
165+
stat_values = stat_values.todense().reshape(-1) # Indexing a .reshape(-1) is faster
166+
stat_values = stat_values[mask_data.reshape(-1)]
167+
168+
# This is used by _compute_null_approximate
169+
self.__n_mask_voxels = stat_values.shape[0]
170+
154171
return stat_values
155172

156173
def _determine_histogram_bins(self, ma_maps):
157174
"""Determine histogram bins for null distribution methods.
158175
159176
Parameters
160177
----------
161-
ma_maps
178+
ma_maps : :obj:`sparse._coo.core.COO`
179+
MA maps.
162180
163181
Notes
164182
-----
165183
This method adds one entry to the null_distributions_ dict attribute: "histogram_bins".
166184
"""
167-
if isinstance(ma_maps, list):
168-
ma_values = self.masker.transform(ma_maps)
169-
elif isinstance(ma_maps, np.ndarray):
170-
ma_values = ma_maps
171-
else:
185+
if not isinstance(ma_maps, sparse._coo.core.COO):
172186
raise ValueError(f"Unsupported data type '{type(ma_maps)}'")
173187

174188
# Determine bins for null distribution histogram
175189
# Remember that numpy histogram bins are bin edges, not centers
176190
# Assuming values of 0, .001, .002, etc., bins are -.0005-.0005, .0005-.0015, etc.
177191
INV_STEP_SIZE = 100000
178192
step_size = 1 / INV_STEP_SIZE
179-
max_ma_values = np.max(ma_values, axis=1)
193+
# Need to convert to dense because np.ceil is too slow with sparse
194+
max_ma_values = ma_maps.max(axis=[1, 2, 3]).todense()
195+
180196
# round up based on resolution
181197
max_ma_values = np.ceil(max_ma_values * INV_STEP_SIZE) / INV_STEP_SIZE
182198
max_poss_ale = self._compute_summarystat(max_ma_values)
@@ -189,7 +205,7 @@ def _compute_null_approximate(self, ma_maps):
189205
190206
Parameters
191207
----------
192-
ma_maps : list of imgs or numpy.ndarray
208+
ma_maps : :obj:`sparse._coo.core.COO`
193209
MA maps.
194210
195211
Notes
@@ -199,27 +215,34 @@ def _compute_null_approximate(self, ma_maps):
199215
- "histogram_bins"
200216
- "histweights_corr-none_method-approximate"
201217
"""
202-
if isinstance(ma_maps, list):
203-
ma_values = self.masker.transform(ma_maps)
204-
elif isinstance(ma_maps, np.ndarray):
205-
ma_values = ma_maps
206-
else:
218+
if not isinstance(ma_maps, sparse._coo.core.COO):
207219
raise ValueError(f"Unsupported data type '{type(ma_maps)}'")
208220

209221
assert "histogram_bins" in self.null_distributions_.keys()
210222

211-
def just_histogram(*args, **kwargs):
212-
"""Collect the first output (weights) from numpy histogram."""
213-
return np.histogram(*args, **kwargs)[0].astype(float)
214-
215223
# Derive bin edges from histogram bin centers for numpy histogram function
216224
bin_centers = self.null_distributions_["histogram_bins"]
217225
step_size = bin_centers[1] - bin_centers[0]
218226
inv_step_size = 1 / step_size
219227
bin_edges = bin_centers - (step_size / 2)
220228
bin_edges = np.append(bin_centers, bin_centers[-1] + step_size)
221229

222-
ma_hists = np.apply_along_axis(just_histogram, 1, ma_values, bins=bin_edges, density=False)
230+
n_exp = ma_maps.shape[0]
231+
n_bins = bin_centers.shape[0]
232+
ma_hists = np.zeros((n_exp, n_bins))
233+
data = ma_maps.data
234+
coords = ma_maps.coords
235+
for exp_idx in range(n_exp):
236+
# The first column of coords is the fourth dimension of the dense array
237+
study_ma_values = data[coords[0, :] == exp_idx]
238+
239+
n_nonzero_voxels = study_ma_values.shape[0]
240+
n_zero_voxels = self.__n_mask_voxels - n_nonzero_voxels
241+
242+
ma_hists[exp_idx, :] = np.histogram(study_ma_values, bins=bin_edges, density=False)[
243+
0
244+
].astype(float)
245+
ma_hists[exp_idx, 0] += n_zero_voxels
223246

224247
# Normalize MA histograms to get probabilities
225248
ma_hists /= ma_hists.sum(1)[:, None]
@@ -258,6 +281,7 @@ class ALESubtraction(PairwiseCBMAEstimator):
258281
- Use memmapped array for null distribution and remove ``memory_limit`` parameter.
259282
- Support parallelization and add progress bar.
260283
- Add ALE-difference (stat) and -log10(p) (logp) maps to results.
284+
- Use a 4D sparse array for modeled activation maps.
261285
262286
.. versionchanged:: 0.0.8
263287
@@ -352,16 +376,17 @@ def _fit(self, dataset1, dataset2):
352376
coords_key="coordinates2",
353377
)
354378

355-
n_grp1, n_voxels = ma_maps1.shape
356-
357379
# Get ALE values for the two groups and difference scores
358380
grp1_ale_values = self._compute_summarystat_est(ma_maps1)
359381
grp2_ale_values = self._compute_summarystat_est(ma_maps2)
360382
diff_ale_values = grp1_ale_values - grp2_ale_values
361383
del grp1_ale_values, grp2_ale_values
362384

385+
n_grp1 = ma_maps1.shape[0]
386+
n_voxels = diff_ale_values.shape[0]
387+
363388
# Combine the MA maps into a single array to draw from for null distribution
364-
ma_arr = np.vstack((ma_maps1, ma_maps2))
389+
ma_arr = sparse.concatenate((ma_maps1, ma_maps2))
365390

366391
del ma_maps1, ma_maps2
367392

@@ -418,6 +443,14 @@ def _fit(self, dataset1, dataset2):
418443

419444
def _compute_summarystat_est(self, ma_values):
420445
stat_values = 1.0 - np.prod(1.0 - ma_values, axis=0)
446+
447+
if isinstance(stat_values, sparse._coo.core.COO):
448+
# NOTE: This may not work correctly with a non-NiftiMasker.
449+
mask_data = self.masker.mask_img.get_fdata().astype(bool)
450+
451+
stat_values = stat_values.todense().reshape(-1) # Indexing a .reshape(-1) is faster
452+
stat_values = stat_values[mask_data.reshape(-1)]
453+
421454
return stat_values
422455

423456
def _run_permutation(self, i_iter, n_grp1, ma_arr, iter_diff_values):
@@ -440,8 +473,8 @@ def _run_permutation(self, i_iter, n_grp1, ma_arr, iter_diff_values):
440473
gen = np.random.default_rng(seed=i_iter)
441474
id_idx = np.arange(ma_arr.shape[0])
442475
gen.shuffle(id_idx)
443-
iter_grp1_ale_values = 1.0 - np.prod(1.0 - ma_arr[id_idx[:n_grp1], :], axis=0)
444-
iter_grp2_ale_values = 1.0 - np.prod(1.0 - ma_arr[id_idx[n_grp1:], :], axis=0)
476+
iter_grp1_ale_values = self._compute_summarystat_est(ma_arr[id_idx[:n_grp1], :])
477+
iter_grp2_ale_values = self._compute_summarystat_est(ma_arr[id_idx[n_grp1:], :])
445478
iter_diff_values[i_iter, :] = iter_grp1_ale_values - iter_grp2_ale_values
446479

447480
def _alediff_to_p_voxel(self, i_voxel, stat_value, voxel_null):
@@ -480,6 +513,7 @@ class SCALE(CBMAEstimator):
480513
481514
- Remove unused parameters ``voxel_thresh`` and ``memory_limit``.
482515
- Use memmapped array for null distribution.
516+
- Use a 4D sparse array for modeled activation maps.
483517
484518
.. versionchanged:: 0.0.10
485519
@@ -584,14 +618,17 @@ def _fit(self, dataset):
584618
)
585619

586620
# Determine bins for null distribution histogram
587-
max_ma_values = np.max(ma_values, axis=1)
621+
max_ma_values = ma_values.max(axis=[1, 2, 3]).todense()
622+
588623
max_poss_ale = self._compute_summarystat_est(max_ma_values)
589624
self.null_distributions_["histogram_bins"] = np.round(
590625
np.arange(0, max_poss_ale + 0.001, 0.0001), 4
591626
)
592627

593628
stat_values = self._compute_summarystat_est(ma_values)
594629

630+
del ma_values
631+
595632
iter_df = self.inputs_["coordinates"].copy()
596633
rand_idx = np.random.choice(self.xyz.shape[0], size=(iter_df.shape[0], self.n_iters))
597634
rand_xyz = self.xyz[rand_idx, :]
@@ -627,24 +664,30 @@ def _fit(self, dataset):
627664
return images
628665

629666
def _compute_summarystat_est(self, data):
630-
"""Generate ALE-value array and null distribution from list of contrasts.
667+
"""Generate ALE-value array and null distribution from a list of contrasts.
631668
632669
For ALEs on the original dataset, computes the null distribution.
633670
For permutation ALEs and all SCALEs, just computes ALE values.
634671
Returns masked array of ALE values and 1XnBins null distribution.
635672
"""
636673
if isinstance(data, pd.DataFrame):
637674
ma_values = self.kernel_transformer.transform(
638-
data, masker=self.masker, return_type="array"
675+
data, masker=self.masker, return_type="sparse"
639676
)
640-
elif isinstance(data, list):
641-
ma_values = self.masker.transform(data)
642-
elif isinstance(data, np.ndarray):
677+
elif isinstance(data, (np.ndarray, sparse._coo.core.COO)):
643678
ma_values = data
644679
else:
645680
raise ValueError(f"Unsupported data type '{type(data)}'")
646681

647682
stat_values = 1.0 - np.prod(1.0 - ma_values, axis=0)
683+
684+
if isinstance(stat_values, sparse._coo.core.COO):
685+
# NOTE: This may not work correctly with a non-NiftiMasker.
686+
mask_data = self.masker.mask_img.get_fdata().astype(bool)
687+
688+
stat_values = stat_values.todense().reshape(-1) # Indexing a .reshape(-1) is faster
689+
stat_values = stat_values[mask_data.reshape(-1)]
690+
648691
return stat_values
649692

650693
def _scale_to_p(self, stat_values, scale_values):

0 commit comments

Comments
 (0)