From be2ceb9010341695f67bc92716bde76af4754066 Mon Sep 17 00:00:00 2001
From: Francesco D'Antonio
Date: Mon, 18 Sep 2023 18:33:15 +0100
Subject: [PATCH 1/6] changed some descriptions and added a function to
 calculate differences between voxels in timeseries

---
 findoutlie/dvars_distribution.py | 33 +++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/findoutlie/dvars_distribution.py b/findoutlie/dvars_distribution.py
index c4b66d3..d7a222e 100644
--- a/findoutlie/dvars_distribution.py
+++ b/findoutlie/dvars_distribution.py
@@ -11,7 +11,7 @@ import numpy as np
 
 
 def distribution_mean(data):
-    """ Calculate mean of standard distribution on Nibabel image `img`
+    """ Calculate mean of standard distribution on 4D data
     used to calculate p-value for DVARS metrics
 
     Calculate the sum of variances for voxel intensities
@@ -22,7 +22,7 @@
 
     Parameters
     ----------
-    img : nibabel image
+    data: 4D timeseries
 
     Returns
     -------
@@ -32,4 +32,31 @@
     # calculate the variance of all voxel timeseries
     voxel_variances = np.var(data, axis=3)
     # sum of variances divided by sum of all intensities
-    return np.sum(voxel_variances)/np.sum(data)
\ No newline at end of file
+    return np.sum(voxel_variances)/np.sum(data)
+
+def dvars_data(data):
+    """ Calculate dvars metric on 4D data
+
+    The dvars calculation between two volumes is defined as the square root of
+    (the mean of the (voxel differences squared)).
+
+    Parameters
+    ----------
+    data: 4D timeseries
+
+    Returns
+    -------
+    dvals : 1D array
+        One-dimensional array with n-1 elements, where n is the number of
+        volumes in data.
+    """
+    # create two timeseries with n-1 volumes and find the difference between them
+    # remove the last volume
+    img_start = data[...,:-1]
+    # remove the first volume
+    img_end = data[..., 1:]
+    return np.sqrt(np.mean((img_start - img_end)**2, axis = (0,1,2)))
+
+def distribution_variance(data):
+
+    dvars_data(data)
\ No newline at end of file
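As a quick illustration of the DVARS definition used in `dvars_data` above, the same calculation can be reproduced on a toy array (the shape here is illustrative, not from the repository):

    import numpy as np

    # toy 4D timeseries: 4 x 4 x 4 voxels over 10 volumes
    rng = np.random.default_rng(0)
    data = rng.normal(size=(4, 4, 4, 10))

    # difference between each volume and the next...
    diffs = data[..., :-1] - data[..., 1:]
    # ...then the root mean square over all voxels, per volume pair
    dvals = np.sqrt(np.mean(diffs ** 2, axis=(0, 1, 2)))
    assert dvals.shape == (9,)  # n - 1 values for n volumes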
From 3c368f112ac644fbac1a270aa2ef3dc4e0428bef Mon Sep 17 00:00:00 2001
From: Francesco D'Antonio
Date: Mon, 18 Sep 2023 23:17:40 +0100
Subject: [PATCH 2/6] fixed incorrect calculation of null distribution mean by
 using the IQR of differences in voxel timeseries

---
 findoutlie/dvars_distribution.py            | 71 +++++++++++++++------
 findoutlie/tests/test_dvars_distribution.py | 48 ++++++++++----
 2 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/findoutlie/dvars_distribution.py b/findoutlie/dvars_distribution.py
index d7a222e..8a8be1f 100644
--- a/findoutlie/dvars_distribution.py
+++ b/findoutlie/dvars_distribution.py
@@ -9,15 +9,18 @@
 """
 
 import numpy as np
+import scipy
+from scipy.special import ndtri
+
 
 def distribution_mean(data):
-    """ Calculate mean of standard distribution on 4D data
+    """ Calculate mean of null distribution on 4D data
     used to calculate p-value for DVARS metrics
 
-    Calculate the sum of variances for voxel intensities
-    and divide by the sum of intensities
-    For more information see equation 12:
+    Calculate the mean variances of voxel intensities
+
+    For more information see equation 12 and 13:
     https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
 
     Parameters
@@ -27,36 +30,66 @@
 
     Returns
     -------
     mu_0 : float
-        decimal to describe to mean for the standard distribution of the timeseries
+        decimal to describe the mean for the null distribution of the timeseries
     """
     # calculate the variance of all voxel timeseries
-    voxel_variances = np.var(data, axis=3)
-    # sum of variances divided by sum of all intensities
-    return np.sum(voxel_variances)/np.sum(data)
+    voxel_variances = voxel_iqr_variance(data)
+
+    # sum of variances divided by voxels per volume (mean)
+    return np.mean(voxel_variances)
 
-def dvars_data(data):
-    """ Calculate dvars metric on 4D data
-
-    The dvars calculation between two volumes is defined as the square root of
-    (the mean of the (voxel differences squared)).
+def voxel_differences(data):
+    """ Calculate difference between voxel i at time t
+    and the same voxel at time t + 1
+
+    V(t+1)-V(t)
 
     Parameters
     ----------
-    data: 4D timeseries
+    data: normalised 4D timeseries
 
     Returns
     -------
-    dvals : 1D array
-        One-dimensional array with n-1 elements, where n is the number of
-        volumes in data.
+    dvals : 4D array with one less volume than 'data'
     """
     # create two timeseries with n-1 volumes and find the difference between them
     # remove the last volume
     img_start = data[...,:-1]
     # remove the first volume
     img_end = data[..., 1:]
-    return np.sqrt(np.mean((img_start - img_end)**2, axis = (0,1,2)))
+    return img_start - img_end
+
+def voxel_iqr_variance(data):
+    """ Calculate variance of voxel timeseries using iqr
+    used to calculate mean of null distribution
+
+    For more information see equation 13:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    data: 4D difference of adjacent voxel intensities
+
+    Returns
+    -------
+    variance: 3D array of variances for each voxel
+    """
+    # from the data calculate the differences between voxels at t and t+1
+    all_voxel_differences = voxel_differences(data)
+
+    # initiate variable for loop
+    iqr_dvars_values = np.zeros(data.shape[:-1])
+
+    # initiate loop over x,y,z
+    for x_value in range(data.shape[0]):
+        for y_value in range(data.shape[1]):
+            for z_value in range(data.shape[2]):
+
+                # calculate iqr for each voxel timeseries
+                q1, q3 = np.percentile(all_voxel_differences[x_value,y_value,z_value,:], [25, 75])
+                iqr_dvars_values[x_value,y_value,z_value] = q3 - q1
 
-def distribution_variance(data):
-
-    dvars_data(data)
\ No newline at end of file
+    # IQR os standard normal distribution
+    iqr_0 = ndtri(0.75) - ndtri(0.25)
+
+    return iqr_dvars_values/iqr_0
\ No newline at end of file
diff --git a/findoutlie/tests/test_dvars_distribution.py b/findoutlie/tests/test_dvars_distribution.py
index a413c6a..4abdabc 100644
--- a/findoutlie/tests/test_dvars_distribution.py
+++ b/findoutlie/tests/test_dvars_distribution.py
@@ -15,21 +15,47 @@
 """
 
 import numpy as np
-from findoutlie.dvars_distribution import distribution_mean
+from findoutlie.dvars_distribution import distribution_mean, voxel_differences, voxel_iqr_variance
+from scipy.special import ndtri
+from math import isclose
 
 def test_dvars_distribution():
     example_values1 = np.ones((3,3,3,6))
-    example_values2 = np.append(example_values1,example_values1+1, axis=3)
+    example_values2 = np.zeros((3,3,3,12))
+    for time_values in range(example_values2.shape[-1]):
+        example_values2[...,time_values] = time_values**2
+
+    example_answer2_array = voxel_differences(example_values2)[1,1,1,:]
+    q1, q3 = np.percentile(example_answer2_array, [25, 75])
+    answer2 = (q3 - q1)/(ndtri(0.75) - ndtri(0.25))
+
+    assert distribution_mean(example_values1) == 0
+    assert isclose(distribution_mean(example_values2), answer2)
+
+def test_voxel_differences():
+    example_values = np.ones((3,3,3,6))
+    example_values1 = np.append(example_values,example_values+1, axis=3)
+
+    # check correct shape for answer
+    assert voxel_differences(example_values).shape == (3,3,3,5)
+
+    # check correct order of operations and values
+    example_answer1 = np.zeros(example_values.shape[:-1]+(example_values.shape[-1] - 1,))
+    example_answer1 = np.append(example_answer1,example_values, axis=3)
+    assert (voxel_differences(example_values1) == example_answer1).all
+
+def test_voxel_iqr_variance():
+
+    example_values = np.zeros((3,3,3,12))
+    for time_values in range(example_values.shape[-1]):
+        example_values[...,time_values] = time_values
+
+    q1, q3 = np.percentile(example_values[1,1,1,:], [25, 75])
+    example_answer = np.zeros(example_values.shape[:-1]) + q3 - q1
+    assert example_answer.shape == voxel_iqr_variance(example_values).shape
+    assert (example_answer == voxel_iqr_variance(example_values)).all
 
-    # Values for exmple 2
-    example2_intensity_sum = 27*6 + 27*6*2
-    example2_variance = 0.25
-    example2_variance_sum = 0.25*27
-    example_distr_mean2 = example2_variance_sum/example2_intensity_sum
-
-    dvars_mean1 = distribution_mean(example_values1)
-    dvars_mean2 = distribution_mean(example_values2)
-
-    assert dvars_mean1 == 0
-    assert dvars_mean2 == example_distr_mean2
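A side note on the triple loop in `voxel_iqr_variance`: `np.percentile` accepts an `axis` argument, so the per-voxel IQR could be computed without explicit loops. A minimal sketch of an equivalent formulation (same maths, not the committed code):

    import numpy as np
    from scipy.special import ndtri

    def voxel_iqr_variance_vectorised(data):
        # adjacent-volume differences, as in voxel_differences
        diffs = data[..., :-1] - data[..., 1:]
        # 25th/75th percentiles along time for every voxel at once
        q1, q3 = np.percentile(diffs, [25, 75], axis=-1)
        # scale by the IQR of the standard normal distribution
        return (q3 - q1) / (ndtri(0.75) - ndtri(0.25))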
From 90fba03530de2d358c0f507caf545f9af8331408 Mon Sep 17 00:00:00 2001
From: Francesco D'Antonio
Date: Tue, 19 Sep 2023 13:28:31 +0100
Subject: [PATCH 3/6] Added and clarified comments of functions, fixed
 calculation in distribution_mean function to calculate the median of voxel
 IQR timeseries differences instead of the mean (look at reference paper
 beneath equation 13)

---
 findoutlie/dvars_distribution.py            | 32 ++++++++++++++-------
 findoutlie/tests/test_dvars_distribution.py |  6 +++-
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/findoutlie/dvars_distribution.py b/findoutlie/dvars_distribution.py
index 8a8be1f..a306924 100644
--- a/findoutlie/dvars_distribution.py
+++ b/findoutlie/dvars_distribution.py
@@ -15,10 +15,18 @@
 
 def distribution_mean(data):
     """ Calculate mean of null distribution on 4D data
-    used to calculate p-value for DVARS metrics
+    using robust IQR measurements of voxel intensity differences.
 
-    Calculate the mean variances of voxel intensities
+    Maths:
+
+    Step 1
+    Calculate IQR_i/IQR_0 where IQR_i is the IQR for voxel timeseries
+    in location i and IQR_0 is the IQR for the standard null distribution.
+    see function voxel_iqr_variance
+
+    Step 2
+    The median of {IQR_i/IQR_0} is an estimate for the mean of the null
+    distribution, other estimates are available
 
     For more information see equation 12 and 13:
     https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
@@ -35,22 +43,24 @@
     # calculate the variance of all voxel timeseries
     voxel_variances = voxel_iqr_variance(data)
 
-    # sum of variances divided by voxels per volume (mean)
-    return np.mean(voxel_variances)
+    # the median of the iqr
+    return np.median(voxel_variances)
 
 def voxel_differences(data):
     """ Calculate difference between voxel i at time t
     and the same voxel at time t + 1
 
-    V(t+1)-V(t)
+    Maths:
+
+    V_i(t+1)-V_i(t)
 
     Parameters
     ----------
-    data: normalised 4D timeseries
+    data: normalised 4D timeseries of shape (X,Y,Z,T)
 
     Returns
     -------
-    dvals : 4D array with one less volume than 'data'
+    dvals : 4D array with one less volume than 'data' of shape (X,Y,Z,T-1)
     """
     # create two timeseries with n-1 volumes and find the difference between them
     # remove the last volume
@@ -60,7 +70,7 @@
     return img_start - img_end
 
 def voxel_iqr_variance(data):
-    """ Calculate variance of voxel timeseries using iqr
+    """ Calculate IQR of voxel timeseries differences
     used to calculate mean of null distribution
 
     For more information see equation 13:
@@ -68,11 +78,11 @@
 
     Parameters
     ----------
-    data: 4D difference of adjacent voxel intensities
+    data: 4D timeseries
 
     Returns
     -------
-    variance: 3D array of variances for each voxel
+    IQR_values: 3D array of IQR for each voxel timeseries difference
     """
     # from the data calculate the differences between voxels at t and t+1
     all_voxel_differences = voxel_differences(data)
@@ -89,7 +99,7 @@
                 q1, q3 = np.percentile(all_voxel_differences[x_value,y_value,z_value,:], [25, 75])
                 iqr_dvars_values[x_value,y_value,z_value] = q3 - q1
 
-    # IQR os standard normal distribution
+    # IQR of standard normal distribution
     iqr_0 = ndtri(0.75) - ndtri(0.25)
 
     return iqr_dvars_values/iqr_0
\ No newline at end of file
diff --git a/findoutlie/tests/test_dvars_distribution.py b/findoutlie/tests/test_dvars_distribution.py
index 4abdabc..07b1eb3 100644
--- a/findoutlie/tests/test_dvars_distribution.py
+++ b/findoutlie/tests/test_dvars_distribution.py
@@ -19,9 +19,12 @@
 from scipy.special import ndtri
 from math import isclose
 
-def test_dvars_distribution():
+def test_distribution_mean():
+
+    # create example matrices
     example_values1 = np.ones((3,3,3,6))
     example_values2 = np.zeros((3,3,3,12))
+
     for time_values in range(example_values2.shape[-1]):
         example_values2[...,time_values] = time_values**2
 
@@ -32,6 +35,7 @@
 
     assert distribution_mean(example_values1) == 0
     assert isclose(distribution_mean(example_values2), answer2)
 
+
 def test_voxel_differences():
     example_values = np.ones((3,3,3,6))
     example_values1 = np.append(example_values,example_values+1, axis=3)
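For reference, the `iqr_0` constant in `voxel_iqr_variance` is the IQR of the standard normal distribution; `ndtri` is the standard normal quantile function, so its value is easy to check:

    from scipy.special import ndtri

    iqr_0 = ndtri(0.75) - ndtri(0.25)
    print(iqr_0)  # about 1.349, i.e. twice the 75th-percentile z of 0.6745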
From b91b1f2ddc322527f30647d055df7be7d9adf043 Mon Sep 17 00:00:00 2001
From: Francesco D'Antonio
Date: Tue, 19 Sep 2023 17:24:44 +0100
Subject: [PATCH 4/6] added estimation of variance for null distribution,
 correction of variance using power calculation and calculation of z-scores
 using the null distribution

---
 findoutlie/dvars_distribution.py            | 170 +++++++++++++++++++-
 findoutlie/tests/test_dvars_distribution.py |  30 +++-
 2 files changed, 192 insertions(+), 8 deletions(-)

diff --git a/findoutlie/dvars_distribution.py b/findoutlie/dvars_distribution.py
index a306924..4229bae 100644
--- a/findoutlie/dvars_distribution.py
+++ b/findoutlie/dvars_distribution.py
@@ -13,7 +13,7 @@
 from scipy.special import ndtri
 
 
-def distribution_mean(data):
+def null_distribution_mean(data):
     """ Calculate mean of null distribution on 4D data
     using robust IQR measurements of voxel intensity differences.
 
@@ -38,7 +38,7 @@
     Returns
     -------
     mu_0 : float
-        decimal to describe the mean for the null distribution of the timeseries
+        estimate of the mean for the null distribution of the timeseries
     """
     # calculate the variance of all voxel timeseries
     voxel_variances = voxel_iqr_variance(data)
@@ -102,4 +102,168 @@
     # IQR of standard normal distribution
     iqr_0 = ndtri(0.75) - ndtri(0.25)
 
-    return iqr_dvars_values/iqr_0
\ No newline at end of file
+    return iqr_dvars_values/iqr_0
+
+def volume_normalisation(raw_data):
+    """ Calculate normalised volume intensity values from raw
+    4D timeseries
+
+    Maths:
+
+    M_Ri = raw mean value for voxel i
+    Y_Rit = raw value for voxel i in volume t
+    m_R = mean (or median) raw value of {M_Ri}
+
+    Normalised value for voxel i at time t
+    Y_it = 100*(Y_Rit-M_Ri)/m_R
+
+    see equation 1:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    raw_data: raw 4D timeseries
+
+    Returns
+    -------
+    data : normalised 4D timeseries
+    """
+
+    # overall mean value (note this could be changed to the median)
+    m_R = np.mean(raw_data)
+
+    normalised_data = np.zeros(raw_data.shape)
+
+    # initiate loop over x,y,z
+    for x_value in range(raw_data.shape[0]):
+        for y_value in range(raw_data.shape[1]):
+            for z_value in range(raw_data.shape[2]):
+
+                # calculate mean for voxel timeseries
+                M_Ri = np.mean(raw_data[x_value,y_value,z_value,:])
+
+                # loop over volumes
+                for t_value in range(raw_data.shape[3]):
+
+                    # get individual raw intensities
+                    Y_Rit = raw_data[x_value,y_value,z_value,t_value]
+                    # calculate normalised intensity
+                    Y_it = 100*(Y_Rit-M_Ri)/m_R
+                    normalised_data[x_value,y_value,z_value,t_value] = Y_it
+
+    return normalised_data
+
+def dvars_squared(data):
+    """ Calculate dvars^2 from 4D data
+
+    The dvars calculation between two volumes is defined as the square root of
+    (the mean of the (voxel differences squared)).
+
+    Parameters
+    ----------
+    data : 4D volumes
+
+    Returns
+    -------
+    dvals^2 : 1D array
+        One-dimensional array with n-1 elements, where n is the number of
+        volumes in data.
+    """
+    # create two timeseries with n-1 volumes and find the difference between them
+    # remove the last volume
+    img_start = data[...,:-1]
+    # remove the first volume
+    img_end = data[..., 1:]
+
+    return np.mean((img_start - img_end)**2, axis = (0,1,2))
+
+def null_distribution_variance(data):
+    """ Calculate estimate of variance for null distribution on 4D data
+
+    Maths:
+
+    Step 1
+    Calculate dvars_squared (from function)
+
+    Step 2
+    Find hIQR({dvars^2_t}) the half IQR defined as the difference between
+    the median and lower quartile.
+
+    Step 3
+    Calculate the variance of the null distribution = hIQR({dvars^2_t})/hIQR_0
+    where hIQR_0 is half of the IQR for the standard normal distribution
+
+    For more information see equation 14:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    data: 4D timeseries
+
+    Returns
+    -------
+    variance_0 : float
+        estimate of the variance for the null distribution of the timeseries
+    """
+
+    # calculate dvars values
+    dvars_squared_values = dvars_squared(data)
+
+    # calculate hIQR for dvars values
+    q1, q2 = np.percentile(dvars_squared_values, [25, 50])
+    hIQR = q2 - q1
+
+    # define hIQR_0 and calculate null distribution variance
+    hIQR_0 = (ndtri(0.75) - ndtri(0.25))/2
+    return hIQR/hIQR_0
+
+def dvars_z_scores(data):
+    """ Calculate z_scores for dvars values testing if dvars values
+    belong to the null distribution
+
+    Maths:
+
+    Z(dvars_t) = (dvars^2_t - mu_0)/sqrt(variance_0)
+
+    For more information see equation 15:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    data: 4D timeseries
+
+    Returns
+    -------
+    z_scores: 1D array
+        One-dimensional array with n-1 elements, where n is the number of
+        volumes in data.
+    """
+
+    dvars_squared_values = dvars_squared(data)
+    null_mean = null_distribution_mean(data)
+    null_variance = null_distribution_variance(data)
+
+    return (dvars_squared_values-null_mean)/np.sqrt(null_variance)
+
+def null_variance_correction(null_mean,null_variance, d=3):
+    """ Correcting from possible positive skew of estimated null distribution using
+    power transformation (cube root transformation)
+
+    Maths:
+
+    Variance(dvars_t) = (1/d)*variance_0*mu_0^(2/d-2)
+
+    For more information see Appendix F.2:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    null_mean: estimated mean of the null distribution
+    null_variance: uncorrected estimate for the null variance
+    d: transformation factor (optimal at 3)
+
+    Returns
+    -------
+    corrected_variance: float
+    """
+    return (1/d)*null_variance*null_mean**(2/d-2)
\ No newline at end of file
diff --git a/findoutlie/tests/test_dvars_distribution.py b/findoutlie/tests/test_dvars_distribution.py
index 07b1eb3..cf97d8e 100644
--- a/findoutlie/tests/test_dvars_distribution.py
+++ b/findoutlie/tests/test_dvars_distribution.py
@@ -15,11 +15,13 @@
 """
 
 import numpy as np
-from findoutlie.dvars_distribution import distribution_mean, voxel_differences, voxel_iqr_variance
+from findoutlie.dvars_distribution import null_distribution_mean, voxel_differences, voxel_iqr_variance, dvars_squared
 from scipy.special import ndtri
 from math import isclose
+import nipraxis as npx
+import nibabel as nib
 
-def test_distribution_mean():
+def test_null_distribution_mean():
 
     # create example matrices
     example_values1 = np.ones((3,3,3,6))
@@ -32,8 +34,8 @@
     q1, q3 = np.percentile(example_answer2_array, [25, 75])
     answer2 = (q3 - q1)/(ndtri(0.75) - ndtri(0.25))
 
-    assert distribution_mean(example_values1) == 0
-    assert isclose(distribution_mean(example_values2), answer2)
+    assert null_distribution_mean(example_values1) == 0
+    assert isclose(null_distribution_mean(example_values2), answer2)
 
 
 def test_voxel_differences():
@@ -60,6 +62,24 @@
     assert (example_answer == voxel_iqr_variance(example_values)).all
 
-
+TEST_FNAME = npx.fetch_file('ds114_sub009_t2r1.nii')
+
+def test_dvars_squared():
+    img = nib.load(TEST_FNAME)
+    n_trs = img.shape[-1]
+    n_voxels = np.prod(img.shape[:-1])
+    data = img.get_fdata()
+    dvals = dvars_squared(data)
+    assert len(dvals) == n_trs - 1
+    # Calculate the values the long way round
+    data = img.get_fdata()
+    prev_vol = data[..., 0]
+    long_dvals = []
+    for i in range(1, n_trs):
+        this_vol = data[..., i]
+        d = this_vol - prev_vol
+        long_dvals.append(np.sum(d ** 2) / n_voxels)
+        prev_vol = this_vol
+    assert np.allclose(dvals, long_dvals)
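As with the IQR loop earlier, the nested loops in `volume_normalisation` can be expressed with broadcasting; a sketch of an equivalent implementation of equation 1 (same arithmetic, not the committed code):

    import numpy as np

    def volume_normalisation_vectorised(raw_data):
        # per-voxel mean over time, kept 4D so it broadcasts over volumes
        M_Ri = np.mean(raw_data, axis=-1, keepdims=True)
        # overall mean intensity m_R
        m_R = np.mean(raw_data)
        # Y_it = 100 * (Y_Rit - M_Ri) / m_R for all voxels and volumes at once
        return 100 * (raw_data - M_Ri) / m_R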
From 8b69c21a19bb8ce634c4a2a0c544ad3d2ca3a12b Mon Sep 17 00:00:00 2001
From: Francesco D'Antonio
Date: Tue, 19 Sep 2023 17:46:51 +0100
Subject: [PATCH 5/6] minor changes to comments

---
 findoutlie/dvars_distribution.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/findoutlie/dvars_distribution.py b/findoutlie/dvars_distribution.py
index 4229bae..4eac662 100644
--- a/findoutlie/dvars_distribution.py
+++ b/findoutlie/dvars_distribution.py
@@ -241,9 +241,10 @@
 
     dvars_squared_values = dvars_squared(data)
     null_mean = null_distribution_mean(data)
-    null_variance = null_distribution_variance(data)
-
-    return (dvars_squared_values-null_mean)/np.sqrt(null_variance)
+    null_variance_uncorrected = null_distribution_variance(data)
+    null_variance_corrected = null_variance_correction(null_mean,null_variance_uncorrected)
+
+    return (dvars_squared_values-null_mean)/np.sqrt(null_variance_corrected)
 
 def null_variance_correction(null_mean,null_variance, d=3):
     """ Correcting from possible positive skew of estimated null distribution using
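Taken together, patches 4 and 5 suggest usage along these lines (the filename and the z threshold of 3 are illustrative assumptions, not part of the repository):

    import numpy as np
    import nibabel as nib
    from findoutlie.dvars_distribution import volume_normalisation, dvars_z_scores

    img = nib.load('example_4d.nii')  # hypothetical filename
    data = volume_normalisation(img.get_fdata())
    z_scores = dvars_z_scores(data)

    # indices of volume pairs whose DVARS^2 sits far from the null distribution
    outlier_pairs = np.where(np.abs(z_scores) > 3)[0]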
- see function voxel_iqr_variance - - Step 2 - The median of {IQR_i/IQR_0} is an estimate for the mean of the null - distribution, other estimates are available - - For more information see equation 12 and 13: - https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE +def dvars_pvals(raw_data, alpha=0.05): + """ Calculate Bonferroni corrected p_values using chi2 stats and finds + rejected null hypothesys Parameters ---------- - data: 4D timeseries + raw_data: 4D timeseries + alpha: significance threshold Returns ------- - mu_0 : float - estimate of the mean for the null distribution of the timeseries + p_vals_adjusted: 1D array of adjusted p_values + reject_null: 1D array, true if null hypothesys is rejected """ - # calculate the variance of all voxel timeseries - voxel_variances = voxel_iqr_variance(data) - - # the median of the iqr - return np.median(voxel_variances) - -def voxel_differences(data): - """ Calculate difference between voxel i at time t - and the same voxel at time t + 1 + # scale the data + data = volume_scaling(raw_data) - Maths: - - V_i(t+1)-V_i(t) - - Parameters - ---------- - data: normalised 4D timeseries of shape (X,Y,Z,T) - - Returns - ------- - dvals : 4D array with one less volume than 'data' of shape (X,Y,Z,T-1) - """ - # create two timeseries with n-1 volumes and find the difference between them - # remove the last volume - img_start = data[...,:-1] - # remove the first volume - img_end = data[..., 1:] - return img_start - img_end - -def voxel_iqr_variance(data): - """ Calculate IQR of voxel timeseries differences - used to calculate mean of null distribution - - For more information see equation 13: - https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE - - Parameters - ---------- - data: 4D timeseries - - Returns - ------- - IQR_values: 3D array of IQR for each voxel timeseries difference - """ - # from the data calculate the differences between voxels at t and t+1 - all_voxel_differences = voxel_differences(data) + # calculate chi2 stats + chisquared_stats, degrees_of_freedom = dvars_chisquared_stat(data) - # initiate variable for loop - iqr_dvars_values = np.zeros(data.shape[:-1]) - - # initiate loop over x,y,z - for x_value in range(data.shape[0]): - for y_value in range(data.shape[1]): - for z_value in range(data.shape[2]): - - # calculate iqr for each voxel timeseries - q1, q3 = np.percentile(all_voxel_differences[x_value,y_value,z_value,:], [25, 75]) - iqr_dvars_values[x_value,y_value,z_value] = q3 - q1 - - # IQR of standard normal distribution - iqr_0 = ndtri(0.75) - ndtri(0.25) - - return iqr_dvars_values/iqr_0 + # find p-vals + p_vals = 1 - scipy.stats.chi2.cdf(chisquared_stats,degrees_of_freedom) + + # Bonferroni adjusted p_values + p_vals = p_vals*len(p_vals) + + # significance testing + reject_null = p_vals <= alpha + + return p_vals, reject_null -def volume_normalisation(raw_data): - """ Calculate normalised volume intensity values from raw +def volume_scaling(raw_data): + """ Rescales volume intensity values from raw 4D timeseries Maths: @@ -114,7 +53,7 @@ def volume_normalisation(raw_data): Y_Rit = raw value for voxel i in volume t m_R = mean (or median) raw value of {M_Ri} - Normalised value for voxel i at time t + Scaled value for voxel i at time t Y_it = 100*(Y_Rit-M_Ri)/m_R see equation 1: @@ -126,13 +65,13 @@ def volume_normalisation(raw_data): Returns ------- - data : normalised 4D timeseries + data : Scaled 4D timeseries """ - # overall mean value (note this could be changed 
-def volume_normalisation(raw_data):
-    """ Calculate normalised volume intensity values from raw
+def volume_scaling(raw_data):
+    """ Rescales volume intensity values from raw
     4D timeseries
 
     Maths:
 
     M_Ri = raw mean value for voxel i
     Y_Rit = raw value for voxel i in volume t
     m_R = mean (or median) raw value of {M_Ri}
 
-    Normalised value for voxel i at time t
+    Scaled value for voxel i at time t
     Y_it = 100*(Y_Rit-M_Ri)/m_R
 
     see equation 1:
     https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
 
     Parameters
     ----------
     raw_data: raw 4D timeseries
 
     Returns
     -------
-    data : normalised 4D timeseries
+    data : Scaled 4D timeseries
     """
 
-    # overall mean value (note this could be changed to the median)
+    # overall mean value
     m_R = np.mean(raw_data)
 
-    normalised_data = np.zeros(raw_data.shape)
+    scaled_data = np.zeros(raw_data.shape)
 
     # initiate loop over x,y,z
     for x_value in range(raw_data.shape[0]):
         for y_value in range(raw_data.shape[1]):
             for z_value in range(raw_data.shape[2]):
 
                 # calculate mean for voxel timeseries
                 M_Ri = np.mean(raw_data[x_value,y_value,z_value,:])
 
                 # loop over volumes
                 for t_value in range(raw_data.shape[3]):
 
                     # get individual raw intensities
                     Y_Rit = raw_data[x_value,y_value,z_value,t_value]
-                    # calculate normalised intensity
+                    # calculate scaled intensity
                     Y_it = 100*(Y_Rit-M_Ri)/m_R
-                    normalised_data[x_value,y_value,z_value,t_value] = Y_it
+                    scaled_data[x_value,y_value,z_value,t_value] = Y_it
 
-    return normalised_data
+    return scaled_data
+
+def dvars_chisquared_stat(data):
+    """ Calculate chi squared stats and degrees of freedom for 4D timeseries
+
+    Maths:
+
+    X(dvars_t) = 2*mu_0*(dvars^2_t)/(variance_0)
+
+    dof = 2*mu_0^2/(variance_0)
+
+    For more information see equation 11:
+    https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE
+
+    Parameters
+    ----------
+    data: 4D timeseries
+
+    Returns
+    -------
+    chi_squared_stats: 1D array
+        One-dimensional array with n-1 elements, where n is the number of
+        volumes in data.
+    dof: degrees of freedom
+    """
+    # calculate dvars^2 values
+    dvars_squared_values = dvars_squared(data)
+
+    # calculate mean for the null distribution
+    null_mean = null_distribution_mean(data)
+    #null_mean = np.median(dvars_squared_values)
+
+    # calculate the variance for the null distribution
+    null_variance = null_distribution_variance(data)
+
+    # calculate chi squared stats
+    return 2*null_mean*(dvars_squared_values)/(null_variance), 2*null_mean**2/(null_variance)
 
 def dvars_squared(data):
     """ Calculate dvars^2 from 4D data
 
     The dvars calculation between two volumes is defined as the square root of
     (the mean of the (voxel differences squared)).
 
     Parameters
     ----------
     data : 4D volumes
 
     Returns
     -------
     dvals^2 : 1D array
         One-dimensional array with n-1 elements, where n is the number of
         volumes in data.
     """
+
+    return np.mean((voxel_differences(data))**2, axis = (0,1,2))
+
+def voxel_differences(data):
+    """ Calculate difference between voxel i at time t
+    and the same voxel at time t + 1
+
+    Maths:
+
+    Y_i(t+1)-Y_i(t)
+
+    Parameters
+    ----------
+    data: 4D timeseries of shape (X,Y,Z,T)
+
+    Returns
+    -------
+    dvals : 4D array with one less volume than 'data' of shape (X,Y,Z,T-1)
+    """
     # create two timeseries with n-1 volumes and find the difference between them
     # remove the last volume
     img_start = data[...,:-1]
     # remove the first volume
     img_end = data[..., 1:]
+    return img_start - img_end
 
-    return np.mean((img_start - img_end)**2, axis = (0,1,2))
 
- - Step 3 - Calculate the variance of the null distribution = hIQR({dvars^2_t})/hIQR_0 - where hIQR_0 is half of the IQR for the standard normal distribution + The median of {IQR_i/IQR_0} is an estimate for the mean of the null + distribution, other estimates are available - For more information see equation 14: + For more information see equation 12 and 13: https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE Parameters @@ -202,30 +194,20 @@ def null_distribution_variance(data): Returns ------- - variance_0 : float - estimate of the variance for the null distribution of the timeseries + mu_0 : float + estimate of the mean for the null distribution of the timeseries """ + # calculate the IQR of all voxel timeseries + voxel_iqrs = voxel_iqr_variance(data) - # calculate dvars values - dvars_squared_values = dvars_squared(data) - - # calculate hIQR for dvars values - q1, q2 = np.percentile(dvars_squared_values, [25, 50]) - hIQR = q2 - q1 - - # define hIQR_0 and calculate null distribution variance - hIQR_0 = (ndtri(0.75) - ndtri(0.25))/2 - return hIQR/hIQR_0 - -def dvars_z_scores(data): - """ Calculate z_scores for dvars values testing if dvars values - belong to the null distribution - - Maths: + # the median of the iqr + return np.median(voxel_iqrs) - Z(dvars_t) = (dvars^2_t - mu_0)/sqrt(variance_0) +def voxel_iqr_variance(data): + """ Calculate IQR of voxel timeseries differences + used to calculate mean of null distribution - For more information see equation 15: + For more information see equation 13: https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE Parameters @@ -234,37 +216,65 @@ def dvars_z_scores(data): Returns ------- - z_scores: 1D array - One-dimensional array with n-1 elements, where n is the number of - volumes in data. + IQR_values: 3D array of IQR for each voxel timeseries difference """ + # from the data calculate the differences between voxels at t and t+1 + all_voxel_differences = voxel_differences(data) + # all_voxel_differences = np.sqrt(dvars_squared(data)) - dvars_squared_values = dvars_squared(data) - null_mean = null_distribution_mean(data) - null_variance_uncorrected = null_distribution_variance(data) - null_variance_corrected = null_variance_correction(null_mean,null_variance_uncorrected) + # initiate variable for loop + iqr_dvars_values = np.zeros(data.shape[:-1]) + + # initiate loop over x,y,z + for x_value in range(data.shape[0]): + for y_value in range(data.shape[1]): + for z_value in range(data.shape[2]): + + # calculate iqr for each voxel timeseries + q1, q3 = np.percentile(all_voxel_differences[x_value,y_value,z_value,:], [25, 75]) + iqr_dvars_values[x_value,y_value,z_value] = q3 - q1 + + # IQR of standard normal distribution + iqr_0 = ndtri(0.75) - ndtri(0.25) - return (dvars_squared_values-null_mean)/np.sqrt(null_variance_corrected) + return iqr_dvars_values/iqr_0 -def null_variance_correction(null_mean,null_variance, d=3): - """ Correcting from possible positive skew of estimated null distribution using - power transformation (cube root transformation) +def null_distribution_variance(data): + """ Calculate estimate of variance for null distribution on 4D data Maths: - Variance(dvars_t) = (1/d)*variance_0*mu_0^(2/d-2) + Step 1 + Calculate dvars_squared (from function) + + Step 2 + Find hIQR({dvars^2_t}) the half IQR defined as the difference between + the median and lower quartile. 
- For more information see Appendix F.2: + Step 3 + Calculate the variance of the null distribution = hIQR({dvars^2_t})/hIQR_0 + where hIQR_0 is half of the IQR for the standard normal distribution + + For more information see equation 14: https://www.sciencedirect.com/science/article/pii/S1053811917311229?via%3Dihub#appsecE Parameters ---------- - null_mean: estimated mean of the null distribution - null_variance: uncorrected estimate for the null variance - d: transformation factor (optimal at 3) + data: 4D timeseries Returns ------- - corrected_variance: float + variance_0 : float + estimate of the variance for the null distribution of the timeseries """ - return (1/d)*null_variance*null_mean**(2/d-2) \ No newline at end of file + + # calculate dvars values + dvars_squared_values = dvars_squared(data) + + # calculate hIQR for dvars values + q1, q2 = np.percentile(dvars_squared_values, [25, 50]) + hIQR = q2 - q1 + + # define hIQR_0 and calculate null distribution variance + hIQR_0 = (ndtri(0.75) - ndtri(0.25))/2 + return hIQR/hIQR_0 diff --git a/findoutlie/tests/test_dvars_distribution.py b/findoutlie/tests/test_dvars_distribution.py index cf97d8e..3ef4349 100644 --- a/findoutlie/tests/test_dvars_distribution.py +++ b/findoutlie/tests/test_dvars_distribution.py @@ -21,6 +21,10 @@ import nipraxis as npx import nibabel as nib +# create test volume timeseries filled with normally distributed random numbers +TEST_NORMALISED_VOLUMES = np.random.normal(size = (10,10,10,1000)) + + def test_null_distribution_mean(): # create example matrices @@ -36,11 +40,13 @@ def test_null_distribution_mean(): assert null_distribution_mean(example_values1) == 0 assert isclose(null_distribution_mean(example_values2), answer2) + #assert isclose(null_distribution_mean(TEST_NORMALISED_VOLUMES), 0) def test_voxel_differences(): example_values = np.ones((3,3,3,6)) example_values1 = np.append(example_values,example_values+1, axis=3) + example_answer3 = TEST_NORMALISED_VOLUMES[...,1]-TEST_NORMALISED_VOLUMES[...,0] # check correct shape for answer assert voxel_differences(example_values).shape == (3,3,3,5) @@ -49,6 +55,8 @@ def test_voxel_differences(): example_answer1 = np.zeros(example_values.shape[:-1]+(example_values.shape[-1] - 1,)) example_answer1 = np.append(example_answer1,example_values, axis=3) assert (voxel_differences(example_values1) == example_answer1).all + assert (voxel_differences(TEST_NORMALISED_VOLUMES)[...,0] == example_answer3).all + def test_voxel_iqr_variance():