Skip to content

Commit 3640120

Browse files
committed
add distribution module
1 parent 7cf9726 commit 3640120

4 files changed

Lines changed: 346 additions & 1 deletion

File tree

pyhctsa/Configurations/basic.yaml

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,93 @@ Stationarity:
466466
- lengthDependent
467467
dependencies:
468468
configs:
469+
- zscore : False
469470
hctsa_name: ST_Length
470471
ordered_args:
471472

473+
FitPolynomial:
474+
base_name: FitPolynomial
475+
labels:
476+
- trend
477+
- stationarity
478+
dependencies:
479+
configs:
480+
- k: !range [1, 4]
481+
zscore: True
482+
hctsa_name: ST_FitPolynomial
483+
ordered_args: ["k"]
484+
485+
Distribution:
486+
Withinp:
487+
base_name: Withinp
488+
labels:
489+
- distribution
490+
- spread
491+
dependencies:
492+
configs:
493+
- p: [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
494+
meanOrMedian: ['mean', 'median']
495+
zscore: True
496+
hctsa_name: DN_Withinp
497+
ordered_args: ["p", "meanOrMedian"]
498+
499+
Unique:
500+
base_name: Unique
501+
labels:
502+
- distribution
503+
- raw
504+
dependencies:
505+
configs:
506+
- zscore: False
507+
hctsa_name: DN_Unique
508+
ordered_args:
509+
510+
Spread:
511+
base_name: Spread
512+
labels:
513+
- raw
514+
- spread
515+
- distribution
516+
- spreadDependent
517+
dependencies:
518+
- scipy
519+
configs:
520+
- spreadMeasure: ['std', 'mad', 'iqr', 'mead']
521+
hctsa_name: DN_Spread
522+
ordered_args: ["spreadMeasure"]
523+
524+
Quantile:
525+
base_name: Quantile
526+
labels:
527+
- distribution
528+
dependencies:
529+
configs:
530+
- p: [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
531+
zscore: True
532+
hctsa_name: DN_Quantile
533+
ordered_args: ["p"]
534+
535+
ProportionValues:
536+
base_name: ProportionValues
537+
labels:
538+
- distribution
539+
- raw
540+
dependencies:
541+
configs:
542+
- propWhat: ['zeros', 'positive', 'geq0']
543+
zscore: False
544+
hctsa_name: DN_ProportionValues
545+
ordered_args: ["propWhat"]
546+
547+
PLeft:
548+
base_name: pleft
549+
labels:
550+
- distribution
551+
- spread
552+
dependencies:
553+
configs:
554+
- th : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
555+
zscore: True
556+
hctsa_name: DN_pleft
557+
ordered_args: ["th"]
558+
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
Distribution:
2+
Withinp:
3+
base_name: Withinp
4+
labels:
5+
- distribution
6+
- spread
7+
dependencies:
8+
configs:
9+
- p: [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
10+
meanOrMedian: ['mean', 'median']
11+
zscore: True
12+
hctsa_name: DN_Withinp
13+
ordered_args: ["p", "meanOrMedian"]
14+
15+
Unique:
16+
base_name: Unique
17+
labels:
18+
- distribution
19+
- raw
20+
dependencies:
21+
configs:
22+
- zscore: False
23+
hctsa_name: DN_Unique
24+
ordered_args:
25+
26+
Spread:
27+
base_name: Spread
28+
labels:
29+
- raw
30+
- spread
31+
- distribution
32+
- spreadDependent
33+
dependencies:
34+
- scipy
35+
configs:
36+
- spreadMeasure: ['std', 'mad', 'iqr', 'mead']
37+
hctsa_name: DN_Spread
38+
ordered_args: ["spreadMeasure"]
39+
40+
Quantile:
41+
base_name: Quantile
42+
labels:
43+
- distribution
44+
dependencies:
45+
configs:
46+
- p: [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
47+
zscore: True
48+
hctsa_name: DN_Quantile
49+
ordered_args: ["p"]
50+
51+
ProportionValues:
52+
base_name: ProportionValues
53+
labels:
54+
- distribution
55+
- raw
56+
dependencies:
57+
configs:
58+
- propWhat: ['zeros', 'positive', 'geq0']
59+
zscore: False
60+
hctsa_name: DN_ProportionValues
61+
ordered_args: ["propWhat"]
62+
63+
PLeft:
64+
base_name: pleft
65+
labels:
66+
- distribution
67+
- spread
68+
dependencies:
69+
configs:
70+
- th : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
71+
zscore: True
72+
hctsa_name: DN_pleft
73+
ordered_args: ["th"]
74+
75+
76+

pyhctsa/Configurations/stationarity.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Stationarity:
4343
- lengthDependent
4444
dependencies:
4545
configs:
46+
- zscore: False
4647
hctsa_name: ST_Length
4748
ordered_args:
4849

@@ -57,4 +58,4 @@ Stationarity:
5758
zscore: True
5859
hctsa_name: ST_FitPolynomial
5960
ordered_args: ["k"]
60-
61+

pyhctsa/Operations/Distribution.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
import numpy as np
2+
from numpy.typing import ArrayLike
3+
from typing import Dict, Union
4+
from scipy import stats
5+
from loguru import logger
6+
7+
8+
def Withinp(x : ArrayLike, p : float = 1.0, meanOrMedian : str = 'mean') -> float:
    """
    Proportion of data points within p standard deviations of the mean or median.

    Parameters:
    -----------
    x (array-like): The input data vector
    p (float): The number (proportion) of standard deviations
    meanOrMedian (str): Whether to use units of 'mean' and standard deviation,
                        or 'median' and rescaled interquartile range

    Returns:
    --------
    float: The proportion of data points within p standard deviations

    Raises:
    -------
    ValueError: If meanOrMedian is not 'mean' or 'median'
    """
    data = np.asarray(x)
    n = len(data)

    if meanOrMedian == 'mean':
        center = np.mean(data)
        scale = np.std(data, ddof=1)
    elif meanOrMedian == 'median':
        center = np.median(data)
        # Rescaled IQR serves as a robust analogue of the standard deviation
        upper_q = np.percentile(data, 75, method='hazen')
        lower_q = np.percentile(data, 25, method='hazen')
        scale = 1.35 * (upper_q - lower_q)
    else:
        raise ValueError(f"Unknown setting: '{meanOrMedian}'")

    # Count points falling inside the band [center - p*scale, center + p*scale]
    lower = center - p * scale
    upper = center + p * scale
    inside = (data >= lower) & (data <= upper)
    return np.divide(np.sum(inside), n)
41+
42+
def Unique(y : ArrayLike) -> float:
    """
    The proportion of the time series that are unique values.

    Parameters
    ----------
    y : array-like
        The input time series or data vector

    Returns
    -------
    float
        the proportion of time series that are unique values
    """
    values = np.asarray(y)
    distinct_count = len(np.unique(values))
    return np.divide(distinct_count, len(values))
58+
59+
60+
def Spread(y : ArrayLike, spreadMeasure : str = 'std') -> float:
    """
    Measure of spread of the input time series.

    Returns the spread of the raw data vector using different statistical measures.
    This is part of the Distributional operations from hctsa, implementing DN_Spread.

    Parameters
    ----------
    y : array-like
        The input time series or data vector
    spreadMeasure : str, optional
        The spread measure to use (default is 'std'):
        - 'std': standard deviation
        - 'iqr': interquartile range
        - 'mad': mean absolute deviation
        - 'mead': median absolute deviation

    Returns
    -------
    float
        The calculated spread measure
    """
    data = np.asarray(y)

    if spreadMeasure == 'std':
        # Sample standard deviation (ddof=1)
        return np.std(data, ddof=1)
    if spreadMeasure == 'iqr':
        # Interquartile range using Hazen interpolation (matches MATLAB hctsa)
        upper_q = np.quantile(data, 0.75, method='hazen')
        lower_q = np.quantile(data, 0.25, method='hazen')
        return upper_q - lower_q
    if spreadMeasure == 'mad':
        # Mean absolute deviation about the mean
        return np.mean(np.absolute(data - np.mean(data, None)), None)
    if spreadMeasure == 'mead':
        # Median absolute deviation about the median
        # ('mead' spelling is the hctsa convention, keyed in basic.yaml)
        return np.median(np.absolute(data - np.median(data, None)), None)

    raise ValueError('spreadMeasure must be one of std, iqr, mad or mead')
99+
100+
def Quantile(y : ArrayLike, p : Union[float, None] = None) -> float:
    """
    Calculates the quantile value at a specified proportion, p.

    Parameters:
        y (array-like): The input data vector
        p (float, optional): The quantile proportion. When omitted, defaults
            to 0.5 (the median) and a warning is logged.

    Returns:
        float: The calculated quantile value

    Raises:
        ValueError: If p is not a number between 0 and 1
    """
    y = np.asarray(y)

    # Sentinel default: warn only when the caller actually relied on the
    # default — previously an explicit p=0.5 also triggered the "by default"
    # message, which was misleading.
    if p is None:
        p = 0.5
        logger.warning("Using quantile p = 0.5 (median) by default")

    # Validate before computing anything with p
    if not isinstance(p, (int, float)) or p < 0 or p > 1:
        raise ValueError("p must specify a proportion, in (0,1)")

    # Hazen interpolation matches MATLAB's quantile, used throughout hctsa
    return float(np.quantile(y, p, method = 'hazen'))
122+
123+
def ProportionValues(x : ArrayLike, propWhat : str = 'positive') -> float:
    """
    Calculate the proportion of values meeting specific conditions in a time series.

    Parameters
    ----------
    x : array-like
        Input time series or data vector
    propWhat : str, optional (default is 'positive')
        Type of values to count:
        - 'zeros': values equal to zero
        - 'positive': values strictly greater than zero
        - 'geq0': values greater than or equal to zero

    Returns
    -------
    float
        Proportion of values meeting the specified condition.

    Raises
    ------
    ValueError
        If propWhat is not one of the recognized conditions.
    """
    x = np.asarray(x)
    N = len(x)

    # np.count_nonzero counts True entries at C speed; the builtin sum()
    # previously used here iterates the boolean array element-by-element.
    if propWhat == 'zeros':
        # returns the proportion of zeros in the input vector
        out = np.count_nonzero(x == 0) / N
    elif propWhat == 'positive':
        out = np.count_nonzero(x > 0) / N
    elif propWhat == 'geq0':
        out = np.count_nonzero(x >= 0) / N
    else:
        raise ValueError(f"Unknown condition to measure: {propWhat}")

    return out
156+
157+
158+
def PLeft(y : ArrayLike, th : float = 0.1) -> float:
159+
"""
160+
Distance from the mean at which a given proportion of data are more distant.
161+
162+
Measures the maximum distance from the mean at which a given fixed proportion, `th`, of the time-series data points are further.
163+
Normalizes by the standard deviation of the time series.
164+
165+
Parameters
166+
----------
167+
y : array_like
168+
The input data vector.
169+
th : float, optional
170+
The proportion of data further than `th` from the mean (default is 0.1).
171+
172+
Returns
173+
-------
174+
float
175+
The distance from the mean normalized by the standard deviation.
176+
"""
177+
y = np.asarray(y)
178+
p = np.quantile(np.abs(y - np.mean(y)), 1-th, method='hazen')
179+
# A proportion, th, of the data lie further than p from the mean
180+
out = np.divide(p, np.std(y, ddof=1))
181+
return float(out)

0 commit comments

Comments
 (0)