Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.5.1
------------
- Fixes bug when sparse matrices contain strictly negative values, cf [this discussion](https://scverse.zulipchat.com/#narrow/channel/557094-differential-gene-expression/topic/Nonnegative.20expectation/near/590691572).

Version 0.5.0rc2
------------
- For better compatibility with `scanpy`, lower bounds on dependencies have been relaxed.
Expand Down
2 changes: 1 addition & 1 deletion illico/ovo/dense_ovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def dense_ovo_mwu_kernel(
n = n_ref + n_tgt
mu = n_ref * n_tgt / 2.0
for j in range(ncols):
ranksum, tie_sum = rank_sum_and_ties_from_sorted(sorted_ref_data[:, j], sorted_tgt_data[:, j])
ranksum, tie_sum, _ = rank_sum_and_ties_from_sorted(sorted_ref_data[:, j], sorted_tgt_data[:, j])

# Compute U-stat
U1 = ranksum - n_tgt * (n_tgt + 1) / 2.0
Expand Down
11 changes: 5 additions & 6 deletions illico/ovo/sparse_ovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,14 @@ def single_group_sparse_ovo_mwu_kernel(
lbr, ubr = sorted_ref_data.indptr[j], sorted_ref_data.indptr[j + 1]

# Compute ranksum and tie sum for non zero values
ranksum, tie_sum = rank_sum_and_ties_from_sorted(sorted_ref_data.data[lbr:ubr], sorted_tgt_data.data[lbt:ubt])

# Offset the ranks of the number of zeros in ref and perturbed
ranksum += n_zeros_combined * (ubt - lbt)
nz_ranksum, tie_sum, zpos = rank_sum_and_ties_from_sorted(
sorted_ref_data.data[lbr:ubr], sorted_tgt_data.data[lbt:ubt], zero_values_offset=n_zeros_combined
)

# Compute ranksum
n0 = n_zeros_tgt[j]
R1_nz = ranksum # Sum ranks
R1 = R1_nz + n0 * (n_zeros_ref[j] + n0 + 1) / 2.0 # Add sumranks of zeros
z_ranksum = (zpos + (n_zeros_ref[j] + n0 + 1) / 2.0) * n0 # Add sumranks of zeros
R1 = nz_ranksum + z_ranksum # Add sumranks of zeros

# Compute U-stat
U1 = R1 - n_tgt * (n_tgt + 1) / 2
Expand Down
2 changes: 1 addition & 1 deletion illico/ovr/dense_ovr.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def dense_ovr_mwu_kernel_over_contiguous_col_chunk(
ranksums = np.zeros(shape=(grpc.counts.size, chunk.shape[1]), dtype=np.float64)
for j in range(chunk.shape[1]):
idxs = np.argsort(chunk[:, j])
col_tie_sum = _accumulate_group_ranksums_from_argsort(chunk[:, j], idxs, grpc.encoded_groups, ranksums[:, j])
col_tie_sum, _ = _accumulate_group_ranksums_from_argsort(chunk[:, j], idxs, grpc.encoded_groups, ranksums[:, j])
tie_sum[j] = col_tie_sum

# Compute U stats
Expand Down
9 changes: 4 additions & 5 deletions illico/ovr/sparse_ovr.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,18 @@ def sparse_ovr_mwu_kernel(
nz_idx = X.indices[start:end]
"""Step 1: compute ranksum of non-zero elements, per group"""
_idxs = np.argsort(X.data[start:end])
tie_sum = _accumulate_group_ranksums_from_argsort(X.data[start:end], _idxs, groups[nz_idx], R1_nz[:, j])
tie_sum, zero_pos = _accumulate_group_ranksums_from_argsort(
X.data[start:end], _idxs, groups[nz_idx], R1_nz[:, j], zero_values_offset=int(n_zeros[j])
)
n0 = n_zeros[j]
"""Step 2: offset non-zero elements ranks by the number of zeros that precedes them"""
if nz_idx.size:
_add_at_scalar(nnz_per_group[:, j], groups[nz_idx], 1.0)
# Deduce number of zeros per group
nz_per_group = group_counts - nnz_per_group[:, j]
# Offset the non-zero ranks by the amount of 0 that precedes them
# All ranks must be shifted, so the sum is shifted by that many elements.
R1_nz[:, j] += n0 * nnz_per_group[:, j]
""" Step 3: Add ranksums of zero elements, per group"""
# add zero contribution: number of zeros * avg rank
R1 = R1_nz[:, j] + nz_per_group * (n0 + 1) / 2.0
R1 = R1_nz[:, j] + nz_per_group * (n0 + 1 + 2 * zero_pos) / 2.0
U[:, j] = R1 - n_tgt * (n_tgt + 1) / 2
tie_sum += n0**3 - n0

Expand Down
58 changes: 47 additions & 11 deletions illico/utils/ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

@njit(nogil=True, cache=False, fastmath=True)
def _accumulate_group_ranksums_from_argsort(
arr: np.ndarray, idx: np.ndarray, groups: np.ndarray, ranksums: np.ndarray
) -> np.ndarray:
arr: np.ndarray, idx: np.ndarray, groups: np.ndarray, ranksums: np.ndarray, zero_values_offset: int = 0
) -> tuple[np.ndarray, int]:
"""From a given array of values, indices of sorted values (result of np.argsort) and groups, accumulate group rank
sums in the placegolder `ranksums`.

Expand All @@ -16,41 +16,53 @@ def _accumulate_group_ranksums_from_argsort(
idx (np.ndarray): Array of indices of sorted values
groups (np.ndarray): Array of group indicator
ranksums (np.ndarray): Plaeholder of shape (n_groups, ) where to accumulate rank sums
zero_values_offset (int): If > 0, it means that there are zeros not present in the input arrays but that
they should be accounted for. This is only used when the input adata is sparse, and ranksum is computed
on non zero values.

Returns:
np.ndarray: tie sums
int: position of zeros in the sorted array (if zero_values_offset > 0)

Comment thread
remydubois marked this conversation as resolved.
Author: Rémy Dubois

"""
# if ranks is None:
# ranks = np.empty(arr.size, dtype=np.float64)
zero_pos = -1
n = idx.size
i = 0
rank = 0
tie_sum = 0.0

while i < n:
# find tie block
j = i + 1
if zero_values_offset > 0 and arr[idx[i]] > 0:
zero_pos = i
rank += zero_values_offset
zero_values_offset = 0
while j < n and arr[idx[j]] == arr[idx[i]]:
j += 1
# average rank for indices i..j-1 (1-based)
avg_rank = 0.5 * (i + 1 + j) # mean of [i+1, ..., j]
avg_rank = rank + 0.5 * (j - i + 1)
for k in range(i, j):
if groups is not None:
ranksums[groups[idx[k]]] += avg_rank
else:
# If no group is specified, ranks are sum reduced into one value for the whole column
ranksums += avg_rank
tie_count = j - i
# Tie sum is the same for all froups
# Tie sum is the same for all groups
tie_sum += tie_count**3 - tie_count
i = j
rank += tie_count

if zero_pos == -1 and zero_values_offset > 0:
zero_pos = n

return tie_sum
return tie_sum, zero_pos


@njit(nogil=True)
def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndarray]:
def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray, zero_values_offset: int = 0) -> tuple[np.ndarray]:
"""Compute rank sums and tie sums from two 1-d sorted arrays.

This routine is similar to the leetcode "merge two sorted arrays", except it
Expand All @@ -62,6 +74,9 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
Args:
A (np.ndarray): The first sorted array (controls)
B (np.ndarray): The second sorted array (perturbed)
zero_values_offset (int): If > 0, it means that there are zeros not present in the input arrays but that
they should be accounted for. This is only used when the input adata is sparse, and ranksum is computed
on non zero values.

Returns:
tuple[np.ndarray]: Ranks sum from the second array, and tie sums for the combined
Expand All @@ -76,8 +91,9 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
i = 0
j = 0
k = 0 # number of items processed so far (0-based)
zero_pos = -1 # Default setting

sum_ranks_B = 0.0
sum_ranks_B = 0
Comment thread
remydubois marked this conversation as resolved.
Outdated
tie_sum = 0.0

# main sweep
Expand All @@ -88,6 +104,11 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
else:
v = B[j]

if v > 0 and zero_values_offset > 0:
zero_pos = k
k += zero_values_offset
zero_values_offset = 0

# count occurrences in A
tA = 0
ii = i
Expand Down Expand Up @@ -118,6 +139,11 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
while i < nA:
v = A[i]

if v > 0 and zero_values_offset > 0:
zero_pos = k
k += zero_values_offset
zero_values_offset = 0

# count in A
tA = 0
ii = i
Expand All @@ -139,6 +165,11 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
while j < nB:
v = B[j]

if v > 0 and zero_values_offset > 0:
zero_pos = k
k += zero_values_offset
zero_values_offset = 0

# count in B
tB = 0
jj = j
Expand All @@ -156,7 +187,12 @@ def rank_sum_and_ties_from_sorted(A: np.ndarray, B: np.ndarray) -> tuple[np.ndar
k += t
j = jj

return sum_ranks_B, tie_sum
# if zero_pos is still not set, it means there are no positive values in the entire column.
# so zeros come at the end of it
if zero_pos == -1 and zero_values_offset > 0:
zero_pos = k

return sum_ranks_B, tie_sum, zero_pos


@njit(nogil=True, cache=False)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "illico"
version = "0.5.0rc2"
version = "0.5.1"
description = "Fast asymptotic mannwhitney-u test"
authors = [
{name = "remydubois",email = "remydubois14@gmail.com"}
Expand Down
2 changes: 1 addition & 1 deletion src/dense_ovo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ pub fn dense_ovo_kernel<D: SparseFloat>(

let remainder = &n_tgt * (&n_tgt + 1.) / 2.;
for j in 0..n_cols as usize {
let (rs, ts) = rank_sum_and_ties(sorted_controls.column(j), sorted_tgt.column(j));
let (rs, ts, _) = rank_sum_and_ties(sorted_controls.column(j), sorted_tgt.column(j), 0);

let u = rs - remainder;

Expand Down
3 changes: 2 additions & 1 deletion src/dense_ovr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@ pub fn dense_ovr_kernel<D: SparseFloat>(
let mut tie_sums = Array1::zeros(chunk_ub - chunk_lb);
for j in 0..chunk.dim().1 {
let sorted_indices = argsort(chunk.column(j));
accumulate_rank_and_tie_sums_from_argsort(
_ = accumulate_rank_and_tie_sums_from_argsort(
chunk.column(j),
sorted_indices,
grpc.encoded_groups,
ranksums.column_mut(j),
tie_sums.slice_mut(s![j]),
0,
)?;
Comment thread
remydubois marked this conversation as resolved.
Outdated
}

Expand Down
56 changes: 48 additions & 8 deletions src/ranking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,19 @@ pub fn sort_along_axis_0_inplace_rust(mut x: PyReadwriteArray2<f32>) -> PyResult
Ok(())
}

pub fn rank_sum_and_ties<D: SparseFloat>(ctrl: ArrayView1<D>, tgt: ArrayView1<D>) -> (f64, f64) {
pub fn rank_sum_and_ties<D: SparseFloat>(
ctrl: ArrayView1<D>,
tgt: ArrayView1<D>,
mut zero_values_offset: usize,
) -> (f64, f64, usize) {
let n_ctrl = ctrl.len();
let n_tgt = tgt.len();

// Initialize pointers
let mut i = 0;
let mut j = 0;
let mut k: usize = 0;
let mut zero_pos: isize = -1;

// Initialize accumulators
let mut rank_sum_tgt: f64 = 0.;
Expand All @@ -67,6 +72,12 @@ pub fn rank_sum_and_ties<D: SparseFloat>(ctrl: ArrayView1<D>, tgt: ArrayView1<D>
while i < n_ctrl && j < n_tgt {
let v = ctrl[i].min(tgt[j]);

if (v.to_f64() > 0.) & (zero_values_offset > 0) {
zero_pos = k as isize;
k += zero_values_offset;
zero_values_offset = 0;
}
Comment thread
remydubois marked this conversation as resolved.

// Count unique values in control values
let mut count_ctrl: usize = 0;
let mut offset_i = i;
Expand Down Expand Up @@ -119,6 +130,12 @@ pub fn rank_sum_and_ties<D: SparseFloat>(ctrl: ArrayView1<D>, tgt: ArrayView1<D>
while i < n_ctrl {
let v = ctrl[i];

if (v.to_f64() > 0.) & (zero_values_offset > 0) {
zero_pos = k as isize;
k += zero_values_offset;
zero_values_offset = 0;
}

// Count unique values in control values
let mut count_ctrl: usize = 0;
for l in i..n_ctrl {
Expand All @@ -140,6 +157,12 @@ pub fn rank_sum_and_ties<D: SparseFloat>(ctrl: ArrayView1<D>, tgt: ArrayView1<D>
while j < n_tgt {
let v = tgt[j];

if (v.to_f64() > 0.) & (zero_values_offset > 0) {
zero_pos = k as isize;
k += zero_values_offset;
zero_values_offset = 0;
}

// Count unique values in target values
let mut count_tgt: usize = 0;
for l in j..n_tgt {
Expand All @@ -161,7 +184,11 @@ pub fn rank_sum_and_ties<D: SparseFloat>(ctrl: ArrayView1<D>, tgt: ArrayView1<D>
k += count_tgt;
}

return (rank_sum_tgt, tie_sum);
if (zero_pos == -1) & (zero_values_offset > 0) {
zero_pos = k as isize;
}

return (rank_sum_tgt, tie_sum, zero_pos as usize);
}

#[pyfunction]
Expand All @@ -171,7 +198,7 @@ pub fn rank_sum_and_ties_rust(
) -> (f64, f64) {
let controls = controls.as_array();
let target = target.as_array();
let (ranksum, tiesum) = rank_sum_and_ties(controls, target);
let (ranksum, tiesum, _) = rank_sum_and_ties(controls, target, 0);
return (ranksum, tiesum);
}

Expand All @@ -181,7 +208,8 @@ pub fn accumulate_rank_and_tie_sums_from_argsort<D: SparseFloat>(
group_labels: ArrayView1<usize>,
mut ranksums: ArrayViewMut1<f64>,
mut tie_sum: ArrayViewMut0<f64>,
) -> Result<(), String> {
mut zero_values_offset: usize,
) -> Result<usize, String> {
let n_sorted_idx = sorted_indices.len();
let n_values = x.len();
let n_grp_indices = group_labels.len();
Expand Down Expand Up @@ -217,15 +245,22 @@ pub fn accumulate_rank_and_tie_sums_from_argsort<D: SparseFloat>(
}

let mut i: usize = 0;
let mut rank: usize = 0;
let mut zero_pos: isize = -1;
while i < n_values {
// Find tie block
let mut j = i + 1;
if (zero_values_offset > 0) & (x[sorted_indices[i]].to_f64() > 0.) {
zero_pos = i as isize;
rank += zero_values_offset;
zero_values_offset = 0;
}
Comment thread
remydubois marked this conversation as resolved.
Outdated
while j < n_values && x[sorted_indices[j]] == x[sorted_indices[i]] {
j += 1
}
// Now by def: j - i gives the size of the tie block
// Compute the average rank for this block
let avg_rank = 0.5 * (i + j + 1) as f64;
let avg_rank = rank as f64 + 0.5 * (j - i + 1) as f64;

// Accumulate this avg rank in each group's palceholder
for k in i..j {
Expand All @@ -237,12 +272,17 @@ pub fn accumulate_rank_and_tie_sums_from_argsort<D: SparseFloat>(
}

// Now take care of the tie sum
let tie_block_size: f64 = (j - i) as f64;
tie_sum += tie_block_size.powi(3) - tie_block_size;
let tie_block_size = j - i;
tie_sum += tie_block_size.pow(3) as f64 - tie_block_size as f64;
rank += tie_block_size;
i = j;
}

Ok(())
if (zero_pos == -1) & (zero_values_offset > 0) {
zero_pos = n_values as isize;
}

Ok(zero_pos as usize)
}

// Because only the indices are mutated in argsort, input values do not need to be a vec or a contiguous memory alloc
Expand Down
Loading
Loading