Merge pull request #90 from PrzeChoj/dev

PrzeChoj · web-flow · commit 5064dc19489a · 2024-07-14T17:23:58.000+02:00
likelihood-ratio test
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -50,5 +50,5 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 Language: en-US
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,3 +29,4 @@ export(validate_gips_perm)
 importFrom(stats,AIC)
 importFrom(stats,BIC)
 importFrom(stats,logLik)
+importFrom(stats,pchisq)
diff --git a/NEWS.md b/NEWS.md
@@ -11,8 +11,9 @@ There was a significant improvement in the speed of calculation. Details in the
 
 ### Update to functions
 
--   `plot.gips()` can get `type = "n0"`, which will plot the change of `n0` along the "MH" optimization. Handy for deciding of burn-in time.
--   `find_MAP(optimizer = "MH")` tracks the `n0` along the optimization.
+-   `plot.gips()` can get `type = "n0"`, which will plot the change of `n0` along the "MH" optimization. Handy for deciding of burn-in time;
+-   `find_MAP(optimizer = "MH")` tracks the `n0` along the optimization;
+-   `summary.gips()` calculates Likelihood-Ratio test;
 
 
 # gips 1.2.2
diff --git a/R/get_structure_constants.R b/R/get_structure_constants.R
@@ -103,21 +103,31 @@ calculate_r <- function(cycle_lengths, perm_order) {
     # identity function
     return(length(cycle_lengths))
   }
-  # for a in 0,1,...,floor(perm_order/2)
-  # r_a = #{1:C such that a*p_c is a multiple of N}
-  # AKA a*p_c %% N == 0
-  
-  # Corollary: N %% p_c == 0 for each p_c, cause N is LCM of all p_c
+  # for alpha in 0,1,...,floor(perm_order/2)
+  # r_{alpha} = #{1:C such that alpha*p_c is a multiple of N}
+  # alpha*p_c is a multiple of N iff alpha*p_c %% N == 0
+  # Now, N is the Least Common Multiplier of all p_c
+  # Which means, that N %% p_c == 0 for every p_c
+  # In other words, N / p_c is an integer
   multiples <- round(perm_order / cycle_lengths) # the result of division should be an integer, but floats may interfere
 
-  # Now we have to adjust for 2 cases:
-  # 1) some alphas are too large
-  # 2) some alphas are so small, that we can include their multiples
-  #   (if a*p_c %% N == 0, then for any natural k  k*a*p_c %% N == 0)
+  # Since N/p_c is an integer, alpha*p_c %% N == 0 iff alpha %% (N/p_c) == 0
+  # In other words, alpha must be a multiple of (N/p_1, N/p_2,...,N/p_C)
+  # (alpha = k*(N/p_1) or k*(N/p_2) or ... or k*(N/p_C) for some integer k (including 0))
+  # However, alpha must be at most M, and a valid bound from above for integer k is max(p_1,...,p_C).
   max_order <- max(cycle_lengths)
+  
+  # Here we create all possible alpha values. 
+  # The `multiples` corresponds to N/p_1,...,N/p_C, and `0:max_order` are possible integers k.
+  # Use the outer product to get all pairwise multiplications
   alpha_matrix <- multiples %*% t(0:max_order)
+
+  # sort is in ascending order, which means smallest alphas go to start.
+  # The end result is as if we iterated over each alpha value (in ascending order),
+  # and then deleted entries with 0s.
   possible_alphas <- unique(sort(alpha_matrix[alpha_matrix <= M]))
 
+  # Recalculate the r_alpha vector using its definition directly.
   r_alfa <- sapply(possible_alphas, function(alpha) sum(alpha %% multiples == 0))
   as.double(r_alfa)
 }
diff --git a/R/gips_class.R b/R/gips_class.R
@@ -1556,26 +1556,33 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
 #'       It can be less than 1, meaning the identity permutation
 #'       is more likely. Remember that this number can big and
 #'       overflow to `Inf` or small and underflow to 0.
-#'   5. `n0` - the minimum number of observations needed for
+#'   5. `log_times_more_likely_than_id` - log of `times_more_likely_than_id`.
+#'   6. `likelihood_ratio_test_statistics`, `likelihood_ratio_test_p_value` - 
+#'       statistics and p-value of Likelihood Ratio test, where
+#'       the H_0 is that the data was drawn from the normal distribution
+#'       with Covariance matrix invariant under the given permutation.
+#'       The p-value is calculated from the asymptotic distribution.
+#'       Note that this is sensibly defined only for \eqn{n \ge p}.
+#'   7. `n0` - the minimum number of observations needed for
 #'       the covariance matrix's maximum likelihood estimator
 #'       (corresponding to a MAP) to exist. See **\eqn{C\sigma} and `n0`**
 #'       section in `vignette("Theory", package = "gips")` or in its
 #'       [pkgdown page](https://przechoj.github.io/gips/articles/Theory.html).
-#'   6. `S_matrix` - the underlying matrix.
+#'   8. `S_matrix` - the underlying matrix.
 #'       This matrix will be used in calculations of
 #'       the posteriori value in [log_posteriori_of_gips()].
-#'   7. `number_of_observations` - the number of observations that
+#'   9. `number_of_observations` - the number of observations that
 #'       were observed for the `S_matrix` to be calculated.
 #'       This value will be used in calculations of
 #'       the posteriori value in [log_posteriori_of_gips()].
-#'   8. `was_mean_estimated` - given by the user while creating the `gips` object:
+#'   10. `was_mean_estimated` - given by the user while creating the `gips` object:
 #'       * `TRUE` means the `S` parameter was the output of [stats::cov()] function;
 #'       * `FALSE` means the `S` parameter was calculated with
 #'           `S = t(X) %*% X / number_of_observations`.
-#'   9. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
+#'   11. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
 #'       See the **Hyperparameters** section of [gips()] documentation.
-#'   10. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
-#'   11. `n_parameters` - number of free parameters in the covariance matrix.
+#'   12. `n_parameters` - number of free parameters in the covariance matrix.
+#'   13. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
 #' * For optimized `gips` object:
 #'   1. `optimized` - `TRUE`.
 #'   2. `found_permutation` - the permutation this `gips` represents.
@@ -1590,46 +1597,56 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
 #'       the `found_permutation` is over the `start_permutation`.
 #'       It cannot be a number less than 1.
 #'       Remember that this number can big and overflow to `Inf`.
-#'   7. `n0` - the minimal number of observations needed for the existence of
+#'   7. `log_times_more_likely_than_start` - log of
+#'       `times_more_likely_than_start`.
+#'   8. `likelihood_ratio_test_statistics`, `likelihood_ratio_test_p_value` - 
+#'       statistics and p-value of Likelihood Ratio test, where
+#'       the H_0 is that the data was drawn from the normal distribution
+#'       with Covariance matrix invariant under `found_permutation`.
+#'       The p-value is calculated from the asymptotic distribution.
+#'       Note that this is sensibly defined only for \eqn{n \ge p}.
+#'   9. `n0` - the minimal number of observations needed for the existence of
 #'       the maximum likelihood estimator (corresponding to a MAP) of
 #'       the covariance matrix (see **\eqn{C\sigma} and `n0`**
 #'       section in `vignette("Theory", package = "gips")` or in its
 #'       [pkgdown page](https://przechoj.github.io/gips/articles/Theory.html)).
-#'   8. `S_matrix` - the underlying matrix.
+#'   10. `S_matrix` - the underlying matrix.
 #'       This matrix will be used in calculations of
 #'       the posteriori value in [log_posteriori_of_gips()].
-#'   9. `number_of_observations` - the number of observations that
+#'   11. `number_of_observations` - the number of observations that
 #'       were observed for the `S_matrix` to be calculated.
 #'       This value will be used in calculations of
 #'       the posteriori value in [log_posteriori_of_gips()].
-#'   10. `was_mean_estimated` - given by the user while creating the `gips` object:
+#'   12. `was_mean_estimated` - given by the user while creating the `gips` object:
 #'       * `TRUE` means the `S` parameter was output of the [stats::cov()] function;
 #'       * `FALSE` means the `S` parameter was calculated with
 #'           `S = t(X) %*% X / number_of_observations`.
-#'   11. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
+#'   13. `delta`, `D_matrix` - the hyperparameters of the Bayesian method.
 #'       See the **Hyperparameters** section of [gips()] documentation.
-#'   12. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
-#'   13. `n_parameters` - number of free parameters in the covariance matrix.
-#'   14. `optimization_algorithm_used` - all used optimization algorithms
+#'   14. `n_parameters` - number of free parameters in the covariance matrix.
+#'   15. `AIC`, `BIC` - output of [AIC.gips()] and [BIC.gips()] functions.
+#'   16. `optimization_algorithm_used` - all used optimization algorithms
 #'       in order (one could start optimization with "MH", and then
 #'       do an "HC").
-#'   15. `did_converge` - a boolean, did the last used algorithm converge.
-#'   16. `number_of_log_posteriori_calls` - how many times was
+#'   17. `did_converge` - a boolean, did the last used algorithm converge.
+#'   18. `number_of_log_posteriori_calls` - how many times was
 #'       the [log_posteriori_of_gips()] function called during
 #'       the optimization.
-#'   17. `whole_optimization_time` - how long was the optimization process;
+#'   19. `whole_optimization_time` - how long was the optimization process;
 #'       the sum of all optimization times (when there were multiple).
-#'   18. `log_posteriori_calls_after_best` - how many times was
+#'   20. `log_posteriori_calls_after_best` - how many times was
 #'       the [log_posteriori_of_gips()] function called after
 #'       the `found_permutation`; in other words, how long ago
 #'       could the optimization be stopped and have the same result.
 #'       If this value is small, consider running [find_MAP()]
 #'       again with `optimizer = "continue"`.
 #'       For `optimizer = "BF"`, it is `NULL`.
-#'   19. `acceptance_rate` - only interesting for `optimizer = "MH"`.
+#'   21. `acceptance_rate` - only interesting for `optimizer = "MH"`.
 #'       How often was the algorithm accepting the change of permutation
 #'       in an iteration.
 #' @export
+#' 
+#' @importFrom stats pchisq
 #'
 #' @seealso
 #' * [find_MAP()] - Usually, the `summary.gips()`
@@ -1648,12 +1665,12 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
 #' mu <- runif(6, -10, 10) # Assume we don't know the mean
 #' sigma_matrix <- matrix(
 #'   data = c(
-#'     1.0, 0.8, 0.6, 0.4, 0.6, 0.8,
-#'     0.8, 1.0, 0.8, 0.6, 0.4, 0.6,
-#'     0.6, 0.8, 1.0, 0.8, 0.6, 0.4,
-#'     0.4, 0.6, 0.8, 1.0, 0.8, 0.6,
-#'     0.6, 0.4, 0.6, 0.8, 1.0, 0.8,
-#'     0.8, 0.6, 0.4, 0.6, 0.8, 1.0
+#'     1.1, 0.8, 0.6, 0.4, 0.6, 0.8,
+#'     0.8, 1.1, 0.8, 0.6, 0.4, 0.6,
+#'     0.6, 0.8, 1.1, 0.8, 0.6, 0.4,
+#'     0.4, 0.6, 0.8, 1.1, 0.8, 0.6,
+#'     0.6, 0.4, 0.6, 0.8, 1.1, 0.8,
+#'     0.8, 0.6, 0.4, 0.6, 0.8, 1.1
 #'   ),
 #'   nrow = perm_size, byrow = TRUE
 #' ) # sigma_matrix is a matrix invariant under permutation (1,2,3,4,5,6)
@@ -1662,6 +1679,7 @@ get_diagonalized_matrix_for_heatmap <- function(g) {
 #' S <- cov(Z) # Assume we have to estimate the mean
 #'
 #' g <- gips(S, number_of_observations)
+#' unclass(summary(g))
 #'
 #' g_map <- find_MAP(g, max_iter = 10, show_progress_bar = FALSE, optimizer = "Metropolis_Hastings")
 #' unclass(summary(g_map))
@@ -1675,8 +1693,25 @@ summary.gips <- function(object, ...) {
   tmp <- get_n0_and_edited_number_of_observations_from_gips(object)
   n0 <- tmp[1]
   edited_number_of_observations <- tmp[2]
-
+  
   n_parameters <- sum(get_structure_constants(object[[1]])[["dim_omega"]])
+  
+  # Likelihood-Ratio test:
+  if (edited_number_of_observations < n0 || !is.positive.definite.matrix(attr(object, "S"))) {
+    likelihood_ratio_test_statistics <- NULL
+    likelihood_ratio_test_p_value <- NULL
+  } else {
+    likelihood_ratio_test_statistics <- edited_number_of_observations*(determinant(project_matrix(attr(object, "S"), object[[1]]))$modulus - determinant(attr(object, "S"))$modulus)
+    attributes(likelihood_ratio_test_statistics) <- NULL
+    p <- attr(object[[1]], "size")
+    df_chisq <- p*(p+1)/2 - n_parameters
+    if (df_chisq == 0) {
+      likelihood_ratio_test_p_value <- NULL
+    } else {
+      # when likelihood_ratio_test_statistics is close to 0, the H_0
+      likelihood_ratio_test_p_value <- 1 - pchisq(likelihood_ratio_test_statistics, df_chisq)
+    }
+  }
 
   if (is.null(attr(object, "optimization_info"))) {
     log_posteriori_id <- log_posteriori_of_perm(
@@ -1691,6 +1726,8 @@ summary.gips <- function(object, ...) {
       start_permutation_log_posteriori = permutation_log_posteriori,
       times_more_likely_than_id = exp(permutation_log_posteriori - log_posteriori_id),
       log_times_more_likely_than_id = permutation_log_posteriori - log_posteriori_id,
+      likelihood_ratio_test_statistics = likelihood_ratio_test_statistics,
+      likelihood_ratio_test_p_value = likelihood_ratio_test_p_value,
       n0 = n0,
       S_matrix = attr(object, "S"),
       number_of_observations = attr(object, "number_of_observations"),
@@ -1728,7 +1765,7 @@ summary.gips <- function(object, ...) {
       )
       start_permutation_log_posteriori <- log_posteriori_of_gips(gips_start)
     }
-
+    
     summary_list <- list(
       optimized = TRUE,
       found_permutation = object[[1]],
@@ -1737,6 +1774,8 @@ summary.gips <- function(object, ...) {
       start_permutation_log_posteriori = start_permutation_log_posteriori,
       times_more_likely_than_start = exp(permutation_log_posteriori - start_permutation_log_posteriori),
       log_times_more_likely_than_start = permutation_log_posteriori - start_permutation_log_posteriori,
+      likelihood_ratio_test_statistics = likelihood_ratio_test_statistics,
+      likelihood_ratio_test_p_value = likelihood_ratio_test_p_value,
       n0 = n0,
       S_matrix = attr(object, "S"),
       number_of_observations = attr(object, "number_of_observations"),
@@ -1805,6 +1844,15 @@ print.summary.gips <- function(x, ...) {
         )
       )
     ),
+    ifelse(is.null(x[["likelihood_ratio_test_statistics"]]),
+      ifelse(is.positive.definite.matrix(x[["S_matrix"]]),
+        "\n\ndet(S) == 0, so Likelihood-Ratio test cannot be performed",
+        "\n\nn0 > number_of_observations, so Likelihood-Ratio test cannot be performed"
+      ),
+      ifelse(is.null(x[["likelihood_ratio_test_p_value"]]),
+        "\n\nThe current permutation is id, so Likelihood-Ratio test cannot be performed (there is nothing to compare)",
+        paste0("\n\nThe p-value of Likelihood-Ratio test:\n ", format(x[["likelihood_ratio_test_p_value"]], digits = 4)))
+      ),
     "\n\nThe number of observations:\n ", x[["number_of_observations"]],
     "\n\n", ifelse(x[["was_mean_estimated"]],
       paste0(
diff --git a/R/utils.R b/R/utils.R
@@ -59,6 +59,19 @@ is.positive.semi.definite.matrix <- function(matrix_of_interest, tolerance = 1e-
   return(all(eigenvalues >= -tolerance * abs(eigenvalues[1]))) # 1st is the biggest eigenvalue
 }
 
+#' Same as for is.positive.semi.definite.matrix
+#' 
+#' @noRd
+is.positive.definite.matrix <- function(matrix_of_interest, tolerance = 1e-06) {
+  eigenvalues <- eigen(
+    matrix_of_interest,
+    symmetric = TRUE,
+    only.values = TRUE
+  )[["values"]]
+  
+  return(all(eigenvalues >= tolerance * abs(eigenvalues[1]))) # 1st is the biggest eigenvalue
+}
+
 wrong_argument_abort <- function(i, x = "") {
   rlang::abort(c("There was a problem identified with provided argument",
     "i" = i,
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -8,7 +8,6 @@ CRAN's
 DOI
 Diaconis
 EDA
-EPYC
 Eq
 Graczyk
 HC
@@ -26,6 +25,7 @@ Optimizers
 Piotr
 Posteriori
 Pseudocode
+RStudio
 Schwarz's
 Sym
 Validator
diff --git a/man/summary.gips.Rd b/man/summary.gips.Rd
diff --git a/tests/testthat/test-gips_class.R b/tests/testthat/test-gips_class.R