-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchi2test.m
109 lines (94 loc) · 4.23 KB
/
chi2test.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
function [chi2, critical] = chi2test (data, n, alpha, dist, a1, a2, a3)
%CHI2TEST Chi-square goodness-of-fit test for continuous distributions.
%   [A, B] = CHI2TEST(DATA, N, ALPHA, DIST, X, Y, Z) returns the
%   chi-square statistic A for the samples contained in the vector DATA,
%   together with the critical chi-square value B.
%
%   N specifies the number of equal-probability class intervals used for
%   the test. ALPHA is the significance level; the critical value is
%   computed as CHI2INV(1-ALPHA, DF).
%
%   DIST is a string naming the probability distribution being tested
%   against. It is passed unchanged to CDF and ICDF, so see the
%   statistics toolbox for the supported names - 'exp', 'gam', 'unif'
%   are some of them.
%
%   X, Y, and Z are the estimated parameters of the selected DIST. At
%   least X must be given; Y and Z are optional. They are forwarded, in
%   that order, to CDF and ICDF, so their order follows the cumulative
%   distribution functions UNIFCDF, GAMCDF, EXPCDF, and others.
%
%   A is the computed chi-square statistic, and B is the critical value
%   at DF degrees of freedom, where DF is the number of intervals minus
%   one, minus the number of distribution parameters supplied.
%
%   In general, if A is less than B, the H0 hypothesis that DATA follows
%   the DIST distribution is not rejected.
%
%   Errors are raised when fewer than five arguments are given, when N is
%   not an integer scalar, when DATA holds fewer than 20 samples, or when
%   N falls outside the interval range suggested for the sample size.
%
%   Example (illustrative output): an attempt to fit some data with the
%   uniform distribution on the interval from 1.5 to 2.9. The test fails,
%   since A > B:
%
%      [a, b] = chi2test (data, 10, 0.05, 'unif', 1.5, 2.9)
%      a =
%         38.7500
%      b =
%         14.0671
%
%   See also MLE, CHI2INV, CHI2STAT, HIST, CDF, ICDF, PDF

%   Copyright 2004 Leonardo Salomone, Carleton University, Ottawa, Canada

% check input
if nargin < 4, error('Not enough input arguments'); end

% at least one distribution parameter is needed to evaluate cdf/icdf;
% without this check the function would return with unassigned outputs
nparams = nargin - 4;
if nparams < 1
    error('At least one distribution parameter is required');
end

if numel(n) ~= 1 || n ~= floor(n)
    error('Number of intervals must be an integer scalar');
end

% work on a row vector so the observed and expected frequencies line up
% element by element regardless of the orientation of the input
data = data(:).';

% check if number of bins complies to suggested interval range
nsamples = numel(data);
if nsamples < 20
    error('Sample data too small, chi-square test not recommended');
elseif nsamples < 50
    if n < 5, error('Number of intervals too small'); end
    if n > 10, error('Number of intervals too large'); end
elseif nsamples < 100
    if n < 10, error('Number of intervals too small'); end
    if n > 20, error('Number of intervals too large'); end
else
    if n < sqrt(nsamples), error('Number of intervals too small'); end
    if n > nsamples/5, error('Number of intervals too large'); end
end

% collect the supplied distribution parameters; they are handed to cdf
% and icdf at full precision instead of being formatted into a string
switch nparams
    case 1
        params = {a1};
    case 2
        params = {a1, a2};
    otherwise
        params = {a1, a2, a3};
end

% find the bin edges, for equal probabilities of continuous
% distributions, using the inverse CDF; (0:n)./n starts exactly at 0 and
% ends exactly at 1, and does not shadow the built-in constant pi
levels = (0:n) ./ n;
intvls = icdf(dist, levels, params{:});

% ensure that last item is infinity for exp distribution
if strcmp(dist, 'exp')
    intvls(end) = inf;
end

% find bin counts for intervals; histc returns n+1 counts, the last one
% holding the samples that are exactly equal to the final edge
counts = histc(data, intvls);

% fold the samples sitting exactly on the final edge into the last
% interval instead of discarding them
o_freq = counts(1:n);
o_freq(n) = o_freq(n) + counts(n+1);

% find expected bin probabilities, they are all the same
p_bin = cdf(dist, intvls(2), params{:}) - cdf(dist, intvls(1), params{:});
e_freq = p_bin .* ones(1, n);

% multiply by number of samples for expected frequency
e_freq = nsamples .* e_freq;

% find the chi2 statistic for each interval and sum them up
chi2bins = ((o_freq - e_freq).^2) ./ e_freq;
chi2 = sum(chi2bins);

% find degrees of freedom: intervals minus one minus supplied parameters
df = n - 1 - nparams;

% find critical value
critical = chi2inv(1 - alpha, df);