Skip to content

Commit 3640120

Browse files
committed
add distribution module
1 parent 7cf9726 commit 3640120

4 files changed

Lines changed: 346 additions & 1 deletion

File tree

pyhctsa/Configurations/basic.yaml

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,93 @@ Stationarity:
466466
- lengthDependent
467467
dependencies:
468468
configs:
469+
- zscore : False
469470
hctsa_name: ST_Length
470471
ordered_args:
471472

473+
FitPolynomial:
474+
base_name: FitPolynomial
475+
labels:
476+
- trend
477+
- stationarity
478+
dependencies:
479+
configs:
480+
- k: !range [1, 4]
481+
zscore: True
482+
hctsa_name: ST_FitPolynomial
483+
ordered_args: ["k"]
484+
485+
Distribution:
486+
Withinp:
487+
base_name: Withinp
488+
labels:
489+
- distribution
490+
- spread
491+
dependencies:
492+
configs:
493+
- p: [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
494+
meanOrMedian: ['mean', 'median']
495+
zscore: True
496+
hctsa_name: DN_Withinp
497+
ordered_args: ["p", "meanOrMedian"]
498+
499+
Unique:
500+
base_name: Unique
501+
labels:
502+
- distribution
503+
- raw
504+
dependencies:
505+
configs:
506+
- zscore: False
507+
hctsa_name: DN_Unique
508+
ordered_args:
509+
510+
Spread:
511+
base_name: Spread
512+
labels:
513+
- raw
514+
- spread
515+
- distribution
516+
- spreadDependent
517+
dependencies:
518+
- scipy
519+
configs:
520+
- spreadMeasure: ['std', 'mad', 'iqr', 'mead']
521+
hctsa_name: DN_Spread
522+
ordered_args: ["spreadMeasure"]
523+
524+
Quantile:
525+
base_name: Quantile
526+
labels:
527+
- distribution
528+
dependencies:
529+
configs:
530+
- p: [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
531+
zscore: True
532+
hctsa_name: DN_Quantile
533+
ordered_args: ["p"]
534+
535+
ProportionValues:
536+
base_name: ProportionValues
537+
labels:
538+
- distribution
539+
- raw
540+
dependencies:
541+
configs:
542+
- propWhat: ['zeros', 'positive', 'geq0']
543+
zscore: False
544+
hctsa_name: DN_ProportionValues
545+
ordered_args: ["propWhat"]
546+
547+
PLeft:
548+
base_name: pleft
549+
labels:
550+
- distribution
551+
- spread
552+
dependencies:
553+
configs:
554+
- th : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
555+
zscore: True
556+
hctsa_name: DN_pleft
557+
ordered_args: ["th"]
558+
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
Distribution:
2+
Withinp:
3+
base_name: Withinp
4+
labels:
5+
- distribution
6+
- spread
7+
dependencies:
8+
configs:
9+
- p: [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
10+
meanOrMedian: ['mean', 'median']
11+
zscore: True
12+
hctsa_name: DN_Withinp
13+
ordered_args: ["p", "meanOrMedian"]
14+
15+
Unique:
16+
base_name: Unique
17+
labels:
18+
- distribution
19+
- raw
20+
dependencies:
21+
configs:
22+
- zscore: False
23+
hctsa_name: DN_Unique
24+
ordered_args:
25+
26+
Spread:
27+
base_name: Spread
28+
labels:
29+
- raw
30+
- spread
31+
- distribution
32+
- spreadDependent
33+
dependencies:
34+
- scipy
35+
configs:
36+
- spreadMeasure: ['std', 'mad', 'iqr', 'mead']
37+
hctsa_name: DN_Spread
38+
ordered_args: ["spreadMeasure"]
39+
40+
Quantile:
41+
base_name: Quantile
42+
labels:
43+
- distribution
44+
dependencies:
45+
configs:
46+
- p: [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
47+
zscore: True
48+
hctsa_name: DN_Quantile
49+
ordered_args: ["p"]
50+
51+
ProportionValues:
52+
base_name: ProportionValues
53+
labels:
54+
- distribution
55+
- raw
56+
dependencies:
57+
configs:
58+
- propWhat: ['zeros', 'positive', 'geq0']
59+
zscore: False
60+
hctsa_name: DN_ProportionValues
61+
ordered_args: ["propWhat"]
62+
63+
PLeft:
64+
base_name: pleft
65+
labels:
66+
- distribution
67+
- spread
68+
dependencies:
69+
configs:
70+
- th : [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
71+
zscore: True
72+
hctsa_name: DN_pleft
73+
ordered_args: ["th"]
74+
75+
76+

pyhctsa/Configurations/stationarity.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Stationarity:
4343
- lengthDependent
4444
dependencies:
4545
configs:
46+
- zscore: False
4647
hctsa_name: ST_Length
4748
ordered_args:
4849

@@ -57,4 +58,4 @@ Stationarity:
5758
zscore: True
5859
hctsa_name: ST_FitPolynomial
5960
ordered_args: ["k"]
60-
61+

pyhctsa/Operations/Distribution.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
import numpy as np
2+
from numpy.typing import ArrayLike
3+
from typing import Dict, Union
4+
from scipy import stats
5+
from loguru import logger
6+
7+
8+
def Withinp(x : ArrayLike, p : float = 1.0, meanOrMedian : str = 'mean') -> float:
    """
    Proportion of data points within p standard deviations of the mean or median.

    Parameters:
    -----------
    x (array-like): The input data vector
    p (float): The number (proportion) of standard deviations
    meanOrMedian (str): Whether to use units of 'mean' and standard deviation,
                        or 'median' and rescaled interquartile range

    Returns:
    --------
    float: The proportion of data points within p standard deviations

    Raises:
    -------
    ValueError: If meanOrMedian is not 'mean' or 'median'
    """
    data = np.asarray(x)
    n = len(data)

    if meanOrMedian == 'mean':
        center = np.mean(data)
        scale = np.std(data, ddof=1)
    elif meanOrMedian == 'median':
        center = np.median(data)
        # Rescaled IQR serves as a robust analogue of the standard deviation
        upper_q = np.percentile(data, 75, method='hazen')
        lower_q = np.percentile(data, 25, method='hazen')
        scale = 1.35 * (upper_q - lower_q)
    else:
        raise ValueError(f"Unknown setting: '{meanOrMedian}'")

    # Count points falling inside the band [center - p*scale, center + p*scale]
    lower = center - p * scale
    upper = center + p * scale
    inside = (data >= lower) & (data <= upper)
    return np.divide(np.sum(inside), n)
41+
42+
def Unique(y : ArrayLike) -> float:
    """
    The proportion of the time series that are unique values.

    Parameters
    ----------
    y : array-like
        The input time series or data vector

    Returns
    -------
    float
        the proportion of time series that are unique values
    """
    values = np.asarray(y)
    distinct_count = len(np.unique(values))
    return np.divide(distinct_count, len(values))
58+
59+
60+
def Spread(y : ArrayLike, spreadMeasure : str = 'std') -> float:
    """
    Measure of spread of the input time series.

    Returns the spread of the raw data vector using different statistical measures.
    This is part of the Distributional operations from hctsa, implementing DN_Spread.

    Parameters
    ----------
    y : array-like
        The input time series or data vector
    spreadMeasure : str, optional
        The spread measure to use (default is 'std'):
        - 'std': standard deviation
        - 'iqr': interquartile range
        - 'mad': mean absolute deviation
        - 'mead': median absolute deviation

    Returns
    -------
    float
        The calculated spread measure
    """
    data = np.asarray(y)

    if spreadMeasure == 'std':
        # Sample standard deviation (ddof=1)
        return np.std(data, ddof=1)
    if spreadMeasure == 'iqr':
        # Interquartile range using Hazen interpolation (matches MATLAB hctsa)
        upper_q = np.quantile(data, 0.75, method='hazen')
        lower_q = np.quantile(data, 0.25, method='hazen')
        return upper_q - lower_q
    if spreadMeasure == 'mad':
        # Mean absolute deviation about the mean
        return np.mean(np.absolute(data - np.mean(data, None)), None)
    if spreadMeasure == 'mead':
        # Median absolute deviation about the median
        # ('mead' spelling is the hctsa convention, keyed in basic.yaml)
        return np.median(np.absolute(data - np.median(data, None)), None)

    raise ValueError('spreadMeasure must be one of std, iqr, mad or mead')
99+
100+
def Quantile(y : ArrayLike, p : Union[float, None] = None) -> float:
    """
    Calculates the quantile value at a specified proportion, p.

    Parameters:
        y (array-like): The input data vector
        p (float, optional): The quantile proportion. When omitted, defaults
            to 0.5 (the median) and a warning is logged.

    Returns:
        float: The calculated quantile value

    Raises:
        ValueError: If p is not a number between 0 and 1
    """
    y = np.asarray(y)

    # Sentinel default: warn only when the caller actually relied on the
    # default — previously an explicit p=0.5 also triggered the "by default"
    # message, which was misleading.
    if p is None:
        p = 0.5
        logger.warning("Using quantile p = 0.5 (median) by default")

    # Validate before computing anything with p
    if not isinstance(p, (int, float)) or p < 0 or p > 1:
        raise ValueError("p must specify a proportion, in (0,1)")

    # Hazen interpolation matches MATLAB's quantile, used throughout hctsa
    return float(np.quantile(y, p, method = 'hazen'))
122+
123+
def ProportionValues(x : ArrayLike, propWhat : str = 'positive') -> float:
    """
    Calculate the proportion of values meeting specific conditions in a time series.

    Parameters
    ----------
    x : array-like
        Input time series or data vector
    propWhat : str, optional (default is 'positive')
        Type of values to count:
        - 'zeros': values equal to zero
        - 'positive': values strictly greater than zero
        - 'geq0': values greater than or equal to zero

    Returns
    -------
    float
        Proportion of values meeting the specified condition.

    Raises
    ------
    ValueError
        If propWhat is not one of the recognized conditions.
    """
    x = np.asarray(x)
    N = len(x)

    # np.count_nonzero counts True entries at C speed; the builtin sum()
    # previously used here iterates the boolean array element-by-element.
    if propWhat == 'zeros':
        # returns the proportion of zeros in the input vector
        out = np.count_nonzero(x == 0) / N
    elif propWhat == 'positive':
        out = np.count_nonzero(x > 0) / N
    elif propWhat == 'geq0':
        out = np.count_nonzero(x >= 0) / N
    else:
        raise ValueError(f"Unknown condition to measure: {propWhat}")

    return out
156+
157+
158+
def PLeft(y : ArrayLike, th : float = 0.1) -> float:
159+
"""
160+
Distance from the mean at which a given proportion of data are more distant.
161+
162+
Measures the maximum distance from the mean at which a given fixed proportion, `th`, of the time-series data points are further.
163+
Normalizes by the standard deviation of the time series.
164+
165+
Parameters
166+
----------
167+
y : array_like
168+
The input data vector.
169+
th : float, optional
170+
The proportion of data further than `th` from the mean (default is 0.1).
171+
172+
Returns
173+
-------
174+
float
175+
The distance from the mean normalized by the standard deviation.
176+
"""
177+
y = np.asarray(y)
178+
p = np.quantile(np.abs(y - np.mean(y)), 1-th, method='hazen')
179+
# A proportion, th, of the data lie further than p from the mean
180+
out = np.divide(p, np.std(y, ddof=1))
181+
return float(out)

0 commit comments

Comments
 (0)