Skip to content

Commit 5064dc1

Browse files
authored
Merge pull request #90 from PrzeChoj/dev
likelihood-ratio test
2 parents 437c7a6 + a178da9 commit 5064dc1

File tree

9 files changed

+155
-58
lines changed

9 files changed

+155
-58
lines changed

Diff for: DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,5 @@ Config/testthat/edition: 3
5050
Encoding: UTF-8
5151
Language: en-US
5252
Roxygen: list(markdown = TRUE)
53-
RoxygenNote: 7.3.1
53+
RoxygenNote: 7.3.2
5454
LazyData: true

Diff for: NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ export(validate_gips_perm)
2929
importFrom(stats,AIC)
3030
importFrom(stats,BIC)
3131
importFrom(stats,logLik)
32+
importFrom(stats,pchisq)

Diff for: NEWS.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ There was a significant improvement in the speed of calculation. Details in the
1111

1212
### Update to functions
1313

14-
- `plot.gips()` can get `type = "n0"`, which will plot the change of `n0` along the "MH" optimization. Handy for deciding of burn-in time.
15-
- `find_MAP(optimizer = "MH")` tracks the `n0` along the optimization.
14+
- `plot.gips()` can get `type = "n0"`, which will plot the change of `n0` along the "MH" optimization. Handy for deciding of burn-in time;
15+
- `find_MAP(optimizer = "MH")` tracks the `n0` along the optimization;
16+
- `summary.gips()` calculates Likelihood-Ratio test;
1617

1718

1819
# gips 1.2.2

Diff for: R/get_structure_constants.R

+19-9
Original file line numberDiff line numberDiff line change
@@ -103,21 +103,31 @@ calculate_r <- function(cycle_lengths, perm_order) {
103103
# identity function
104104
return(length(cycle_lengths))
105105
}
106-
# for a in 0,1,...,floor(perm_order/2)
107-
# r_a = #{1:C such that a*p_c is a multiple of N}
108-
# AKA a*p_c %% N == 0
109-
110-
# Corollary: N %% p_c == 0 for each p_c, cause N is LCM of all p_c
106+
# for alpha in 0,1,...,floor(perm_order/2)
107+
# r_{alpha} = #{1:C such that alpha*p_c is a multiple of N}
108+
# alpha*p_c is a multiple of N iff alpha*p_c %% N == 0
109+
# Now, N is the Least Common Multiplier of all p_c
110+
# Which means, that N %% p_c == 0 for every p_c
111+
# In other words, N / p_c is an integer
111112
multiples <- round(perm_order / cycle_lengths) # the result of division should be an integer, but floats may interfere
112113

113-
# Now we have to adjust for 2 cases:
114-
# 1) some alphas are too large
115-
# 2) some alphas are so small, that we can include their multiples
116-
# (if a*p_c %% N == 0, then for any natural k k*a*p_c %% N == 0)
114+
# Since N/p_c is an integer, alpha*p_c %% N == 0 iff alpha %% (N/p_c) == 0
115+
# In other words, alpha must be a multiple of (N/p_1, N/p_2,...,N/p_C)
116+
# (alpha = k*(N/p_1) or k*(N/p_2) or ... or k*(N/p_C) for some integer k (including 0))
117+
# However, alpha must be at most M, and a valid bound from above for integer k is max(p_1,...,p_C).
117118
max_order <- max(cycle_lengths)
119+
120+
# Here we create all possible alpha values.
121+
# The `multiples` corresponds to N/p_1,...,N/p_C, and `0:max_order` are possible integers k.
122+
# Use the outer product to get all pairwise multiplications
118123
alpha_matrix <- multiples %*% t(0:max_order)
124+
125+
# sort is in ascending order, which means smallest alphas go to start.
126+
# The end result is as if we iterated over each alpha value (in ascending order),
127+
# and then deleted entries with 0s.
119128
possible_alphas <- unique(sort(alpha_matrix[alpha_matrix <= M]))
120129

130+
# Recalculate the r_alpha vector using its definition directly.
121131
r_alfa <- sapply(possible_alphas, function(alpha) sum(alpha %% multiples == 0))
122132
as.double(r_alfa)
123133
}

Diff for: R/gips_class.R

+76-28
Original file line numberDiff line numberDiff line change
@@ -1556,26 +1556,33 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
15561556
#' It can be less than 1, meaning the identity permutation
15571557
#' is more likely. Remember that this number can big and
15581558
#' overflow to `Inf` or small and underflow to 0.
1559-
#' 5. `n0` - the minimum number of observations needed for
1559+
#' 5. `log_times_more_likely_than_id` - log of `times_more_likely_than_id`.
1560+
#' 6. `likelihood_ratio_test_statistics`, `likelihood_ratio_test_p_value` -
1561+
#' statistics and p-value of Likelihood Ratio test, where
1562+
#' the H_0 is that the data was drawn from the normal distribution
1563+
#' with Covariance matrix invariant under the given permutation.
1564+
#' The p-value is calculated from the asymptotic distribution.
1565+
#' Note that this is sensibly defined only for \eqn{n \ge p}.
1566+
#' 7. `n0` - the minimum number of observations needed for
15601567
#' the covariance matrix's maximum likelihood estimator
15611568
#' (corresponding to a MAP) to exist. See **\eqn{C\sigma} and `n0`**
15621569
#' section in `vignette("Theory", package = "gips")` or in its
15631570
#' [pkgdown page](https://przechoj.github.io/gips/articles/Theory.html).
1564-
#' 6. `S_matrix` - the underlying matrix.
1571+
#' 8. `S_matrix` - the underlying matrix.
15651572
#' This matrix will be used in calculations of
15661573
#' the posteriori value in [log_posteriori_of_gips()].
1567-
#' 7. `number_of_observations` - the number of observations that
1574+
#' 9. `number_of_observations` - the number of observations that
15681575
#' were observed for the `S_matrix` to be calculated.
15691576
#' This value will be used in calculations of
15701577
#' the posteriori value in [log_posteriori_of_gips()].
1571-
#' 8. `was_mean_estimated` - given by the user while creating the `gips` object:
1578+
#' 10. `was_mean_estimated` - given by the user while creating the `gips` object:
15721579
#' * `TRUE` means the `S` parameter was the output of [stats::cov()] function;
15731580
#' * `FALSE` means the `S` parameter was calculated with
15741581
#' `S = t(X) %*% X / number_of_observations`.
1575-
#' 9. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
1582+
#' 11. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
15761583
#' See the **Hyperparameters** section of [gips()] documentation.
1577-
#' 10. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
1578-
#' 11. `n_parameters` - number of free parameters in the covariance matrix.
1584+
#' 12. `n_parameters` - number of free parameters in the covariance matrix.
1585+
#' 13. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
15791586
#' * For optimized `gips` object:
15801587
#' 1. `optimized` - `TRUE`.
15811588
#' 2. `found_permutation` - the permutation this `gips` represents.
@@ -1590,46 +1597,56 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
15901597
#' the `found_permutation` is over the `start_permutation`.
15911598
#' It cannot be a number less than 1.
15921599
#' Remember that this number can big and overflow to `Inf`.
1593-
#' 7. `n0` - the minimal number of observations needed for the existence of
1600+
#' 7. `log_times_more_likely_than_start` - log of
1601+
#' `times_more_likely_than_start`.
1602+
#' 8. `likelihood_ratio_test_statistics`, `likelihood_ratio_test_p_value` -
1603+
#' statistics and p-value of Likelihood Ratio test, where
1604+
#' the H_0 is that the data was drawn from the normal distribution
1605+
#' with Covariance matrix invariant under `found_permutation`.
1606+
#' The p-value is calculated from the asymptotic distribution.
1607+
#' Note that this is sensibly defined only for \eqn{n \ge p}.
1608+
#' 9. `n0` - the minimal number of observations needed for the existence of
15941609
#' the maximum likelihood estimator (corresponding to a MAP) of
15951610
#' the covariance matrix (see **\eqn{C\sigma} and `n0`**
15961611
#' section in `vignette("Theory", package = "gips")` or in its
15971612
#' [pkgdown page](https://przechoj.github.io/gips/articles/Theory.html)).
1598-
#' 8. `S_matrix` - the underlying matrix.
1613+
#' 10. `S_matrix` - the underlying matrix.
15991614
#' This matrix will be used in calculations of
16001615
#' the posteriori value in [log_posteriori_of_gips()].
1601-
#' 9. `number_of_observations` - the number of observations that
1616+
#' 11. `number_of_observations` - the number of observations that
16021617
#' were observed for the `S_matrix` to be calculated.
16031618
#' This value will be used in calculations of
16041619
#' the posteriori value in [log_posteriori_of_gips()].
1605-
#' 10. `was_mean_estimated` - given by the user while creating the `gips` object:
1620+
#' 12. `was_mean_estimated` - given by the user while creating the `gips` object:
16061621
#' * `TRUE` means the `S` parameter was output of the [stats::cov()] function;
16071622
#' * `FALSE` means the `S` parameter was calculated with
16081623
#' `S = t(X) %*% X / number_of_observations`.
1609-
#' 11. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
1624+
#' 13. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
16101625
#' See the **Hyperparameters** section of [gips()] documentation.
1611-
#' 12. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
1612-
#' 13. `n_parameters` - number of free parameters in the covariance matrix.
1613-
#' 14. `optimization_algorithm_used` - all used optimization algorithms
1626+
#' 14. `n_parameters` - number of free parameters in the covariance matrix.
1627+
#' 15. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
1628+
#' 16. `optimization_algorithm_used` - all used optimization algorithms
16141629
#' in order (one could start optimization with "MH", and then
16151630
#' do an "HC").
1616-
#' 15. `did_converge` - a boolean, did the last used algorithm converge.
1617-
#' 16. `number_of_log_posteriori_calls` - how many times was
1631+
#' 17. `did_converge` - a boolean, did the last used algorithm converge.
1632+
#' 18. `number_of_log_posteriori_calls` - how many times was
16181633
#' the [log_posteriori_of_gips()] function called during
16191634
#' the optimization.
1620-
#' 17. `whole_optimization_time` - how long was the optimization process;
1635+
#' 19. `whole_optimization_time` - how long was the optimization process;
16211636
#' the sum of all optimization times (when there were multiple).
1622-
#' 18. `log_posteriori_calls_after_best` - how many times was
1637+
#' 20. `log_posteriori_calls_after_best` - how many times was
16231638
#' the [log_posteriori_of_gips()] function called after
16241639
#' the `found_permutation`; in other words, how long ago
16251640
#' could the optimization be stopped and have the same result.
16261641
#' If this value is small, consider running [find_MAP()]
16271642
#' again with `optimizer = "continue"`.
16281643
#' For `optimizer = "BF"`, it is `NULL`.
1629-
#' 19. `acceptance_rate` - only interesting for `optimizer = "MH"`.
1644+
#' 21. `acceptance_rate` - only interesting for `optimizer = "MH"`.
16301645
#' How often was the algorithm accepting the change of permutation
16311646
#' in an iteration.
16321647
#' @export
1648+
#'
1649+
#' @importFrom stats pchisq
16331650
#'
16341651
#' @seealso
16351652
#' * [find_MAP()] - Usually, the `summary.gips()`
@@ -1648,12 +1665,12 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
16481665
#' mu <- runif(6, -10, 10) # Assume we don't know the mean
16491666
#' sigma_matrix <- matrix(
16501667
#' data = c(
1651-
#' 1.0, 0.8, 0.6, 0.4, 0.6, 0.8,
1652-
#' 0.8, 1.0, 0.8, 0.6, 0.4, 0.6,
1653-
#' 0.6, 0.8, 1.0, 0.8, 0.6, 0.4,
1654-
#' 0.4, 0.6, 0.8, 1.0, 0.8, 0.6,
1655-
#' 0.6, 0.4, 0.6, 0.8, 1.0, 0.8,
1656-
#' 0.8, 0.6, 0.4, 0.6, 0.8, 1.0
1668+
#' 1.1, 0.8, 0.6, 0.4, 0.6, 0.8,
1669+
#' 0.8, 1.1, 0.8, 0.6, 0.4, 0.6,
1670+
#' 0.6, 0.8, 1.1, 0.8, 0.6, 0.4,
1671+
#' 0.4, 0.6, 0.8, 1.1, 0.8, 0.6,
1672+
#' 0.6, 0.4, 0.6, 0.8, 1.1, 0.8,
1673+
#' 0.8, 0.6, 0.4, 0.6, 0.8, 1.1
16571674
#' ),
16581675
#' nrow = perm_size, byrow = TRUE
16591676
#' ) # sigma_matrix is a matrix invariant under permutation (1,2,3,4,5,6)
@@ -1662,6 +1679,7 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
16621679
#' S <- cov(Z) # Assume we have to estimate the mean
16631680
#'
16641681
#' g <- gips(S, number_of_observations)
1682+
#' unclass(summary(g))
16651683
#'
16661684
#' g_map <- find_MAP(g, max_iter = 10, show_progress_bar = FALSE, optimizer = "Metropolis_Hastings")
16671685
#' unclass(summary(g_map))
@@ -1675,8 +1693,25 @@ summary.gips <- function(object, ...) {
16751693
tmp <- get_n0_and_edited_number_of_observations_from_gips(object)
16761694
n0 <- tmp[1]
16771695
edited_number_of_observations <- tmp[2]
1678-
1696+
16791697
n_parameters <- sum(get_structure_constants(object[[1]])[["dim_omega"]])
1698+
1699+
# Likelihood-Ratio test:
1700+
if (edited_number_of_observations < n0 || !is.positive.definite.matrix(attr(object, "S"))) {
1701+
likelihood_ratio_test_statistics <- NULL
1702+
likelihood_ratio_test_p_value <- NULL
1703+
} else {
1704+
likelihood_ratio_test_statistics <- edited_number_of_observations*(determinant(project_matrix(attr(object, "S"), object[[1]]))$modulus - determinant(attr(object, "S"))$modulus)
1705+
attributes(likelihood_ratio_test_statistics) <- NULL
1706+
p <- attr(object[[1]], "size")
1707+
df_chisq <- p*(p+1)/2 - n_parameters
1708+
if (df_chisq == 0) {
1709+
likelihood_ratio_test_p_value <- NULL
1710+
} else {
1711+
# when likelihood_ratio_test_statistics is close to 0, the H_0
1712+
likelihood_ratio_test_p_value <- 1 - pchisq(likelihood_ratio_test_statistics, df_chisq)
1713+
}
1714+
}
16801715

16811716
if (is.null(attr(object, "optimization_info"))) {
16821717
log_posteriori_id <- log_posteriori_of_perm(
@@ -1691,6 +1726,8 @@ summary.gips <- function(object, ...) {
16911726
start_permutation_log_posteriori = permutation_log_posteriori,
16921727
times_more_likely_than_id = exp(permutation_log_posteriori - log_posteriori_id),
16931728
log_times_more_likely_than_id = permutation_log_posteriori - log_posteriori_id,
1729+
likelihood_ratio_test_statistics = likelihood_ratio_test_statistics,
1730+
likelihood_ratio_test_p_value = likelihood_ratio_test_p_value,
16941731
n0 = n0,
16951732
S_matrix = attr(object, "S"),
16961733
number_of_observations = attr(object, "number_of_observations"),
@@ -1728,7 +1765,7 @@ summary.gips <- function(object, ...) {
17281765
)
17291766
start_permutation_log_posteriori <- log_posteriori_of_gips(gips_start)
17301767
}
1731-
1768+
17321769
summary_list <- list(
17331770
optimized = TRUE,
17341771
found_permutation = object[[1]],
@@ -1737,6 +1774,8 @@ summary.gips <- function(object, ...) {
17371774
start_permutation_log_posteriori = start_permutation_log_posteriori,
17381775
times_more_likely_than_start = exp(permutation_log_posteriori - start_permutation_log_posteriori),
17391776
log_times_more_likely_than_start = permutation_log_posteriori - start_permutation_log_posteriori,
1777+
likelihood_ratio_test_statistics = likelihood_ratio_test_statistics,
1778+
likelihood_ratio_test_p_value = likelihood_ratio_test_p_value,
17401779
n0 = n0,
17411780
S_matrix = attr(object, "S"),
17421781
number_of_observations = attr(object, "number_of_observations"),
@@ -1805,6 +1844,15 @@ print.summary.gips <- function(x, ...) {
18051844
)
18061845
)
18071846
),
1847+
ifelse(is.null(x[["likelihood_ratio_test_statistics"]]),
1848+
ifelse(is.positive.definite.matrix(x[["S_matrix"]]),
1849+
"\n\ndet(S) == 0, so Likelihood-Ratio test cannot be performed",
1850+
"\n\nn0 > number_of_observations, so Likelihood-Ratio test cannot be performed"
1851+
),
1852+
ifelse(is.null(x[["likelihood_ratio_test_p_value"]]),
1853+
"\n\nThe current permutation is id, so Likelihood-Ratio test cannot be performed (there is nothing to compare)",
1854+
paste0("\n\nThe p-value of Likelihood-Ratio test:\n ", format(x[["likelihood_ratio_test_p_value"]], digits = 4)))
1855+
),
18081856
"\n\nThe number of observations:\n ", x[["number_of_observations"]],
18091857
"\n\n", ifelse(x[["was_mean_estimated"]],
18101858
paste0(

Diff for: R/utils.R

+13
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,19 @@ is.positive.semi.definite.matrix <- function(matrix_of_interest, tolerance = 1e-
5959
return(all(eigenvalues >= -tolerance * abs(eigenvalues[1]))) # 1st is the biggest eigenvalue
6060
}
6161

62+
#' Same as for is.positive.semi.definite.matrix
63+
#'
64+
#' @noRd
65+
is.positive.definite.matrix <- function(matrix_of_interest, tolerance = 1e-06) {
66+
eigenvalues <- eigen(
67+
matrix_of_interest,
68+
symmetric = TRUE,
69+
only.values = TRUE
70+
)[["values"]]
71+
72+
return(all(eigenvalues >= tolerance * abs(eigenvalues[1]))) # 1st is the biggest eigenvalue
73+
}
74+
6275
wrong_argument_abort <- function(i, x = "") {
6376
rlang::abort(c("There was a problem identified with provided argument",
6477
"i" = i,

Diff for: inst/WORDLIST

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ CRAN's
88
DOI
99
Diaconis
1010
EDA
11-
EPYC
1211
Eq
1312
Graczyk
1413
HC
@@ -26,6 +25,7 @@ Optimizers
2625
Piotr
2726
Posteriori
2827
Pseudocode
28+
RStudio
2929
Schwarz's
3030
Sym
3131
Validator

Diff for: man/summary.gips.Rd

+24-8
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)