tdaverse
diff --git a/‎.Rbuildignore‎
Lines changed: 2 additions & 0 deletions b/‎.Rbuildignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 16 additions & 4 deletions b/‎DESCRIPTION‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎LICENSE.md‎
Lines changed: 595 additions & 0 deletions b/‎LICENSE.md‎
Lines changed: 595 additions & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎R/data.R‎
Lines changed: 36 additions & 0 deletions b/‎R/data.R‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎R/two-sample-test.R‎
Lines changed: 83 additions & 0 deletions b/‎R/two-sample-test.R‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎R/utils.R‎
Lines changed: 68 additions & 0 deletions b/‎R/utils.R‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎data-raw/toydata.R‎
Lines changed: 27 additions & 0 deletions b/‎data-raw/toydata.R‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎data-raw/wip.R‎
Lines changed: 47 additions & 0 deletions b/‎data-raw/wip.R‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎data/archspirals.rda‎
27.1 KB b/‎data/archspirals.rda‎
27.1 KB
@@ -1,2 +1,4 @@
 ^inphr\.Rproj$
 ^\.Rproj\.user$
+^LICENSE\.md$
+^data-raw$
@@ -1,14 +1,26 @@
 Package: inphr
-Title: What the Package Does (One Line, Title Case)
+Title: Inference for Persistent Homology Data in R
 Version: 0.0.0.9000
 Authors@R: 
     person("Aymeric", "Stamm", , "aymeric.stamm@cnrs.fr", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-8725-3654"))
-Description: What the package does (one paragraph).
-License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
-    license
+Description: A set of functions for performing null hypothesis testing on
+    samples of persistence diagrams using the theory of permutations. Currently,
+    only two-sample testing is implemented. Inputs can be either samples of
+    persistence diagrams themselves or vectorizations. In the former case, they
+    are embedded in a metric space using either the Bottleneck or Wasserstein
+    distance. In the latter case, persistence data becomes functional data and
+    inference is performed using tools available in the {fdatest} package.
+License: GPL (>= 3)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
 URL: https://github.com/tdaverse/inphr
 BugReports: https://github.com/tdaverse/inphr/issues
+Imports: 
+    cli,
+    flipr,
+    phutil
+Depends: 
+    R (>= 3.5)
+LazyData: true
@@ -1,2 +1,3 @@
 # Generated by roxygen2: do not edit by hand
 
+export(two_sample_test)
@@ -0,0 +1,36 @@
+#' Persistence diagrams from trefoil knot samples (first set)
+#'
+#' @description A set of 24 persistence diagrams computed from noisy samples of
+#'   trefoil knots. Each sample consists of 120 points drawn from a trefoil knot
+#'   with Gaussian noise (sd = 0.05) using `tdaunif::sample_trefoil()`.
+#'   Vietoris-Rips persistence was computed up to dimension 2 with maximum
+#'   scale 6 using [`TDA::ripsDiag()`]. Generated with seed `28415`.
+#'
+#' @format An object of class `persistence_set` containing 24 objects of class
+#'   [`phutil::persistence`].
+"trefoils1"
+
+#' Persistence diagrams from trefoil knot samples (second set)
+#'
+#' @description A set of 24 persistence diagrams computed from noisy samples of
+#'   trefoil knots. Each sample consists of 120 points drawn from a trefoil knot
+#'   with Gaussian noise (sd = 0.05) using `tdaunif::sample_trefoil()`.
+#'   Vietoris-Rips persistence was computed up to dimension 2 with maximum
+#'   scale 6 using [`TDA::ripsDiag()`]. Generated with seed `28415`.
+#'
+#' @format An object of class `persistence_set` containing 24 objects of class
+#'   [`phutil::persistence`].
+"trefoils2"
+
+#' Persistence diagrams from Archimedean spiral samples
+#'
+#' @description A set of 24 persistence diagrams computed from noisy samples of
+#'   2-armed Archimedean spirals. Each sample consists of 120 points drawn with
+#'   `tdaunif::sample_arch_spiral()`, embedded in 3D with a zero z-coordinate,
+#'   then perturbed with Gaussian noise (sd = 0.05) via `tdaunif::add_noise()`.
+#'   Vietoris-Rips persistence was computed up to dimension 2 with maximum
+#'   scale 6 using [`TDA::ripsDiag()`]. Generated with seed `28415`.
+#'
+#' @format An object of class `persistence_set` containing 24 objects of class
+#'   [`phutil::persistence`].
+"archspirals"
@@ -0,0 +1,83 @@
+#' Two-Sample Test for Persistent Homology Data
+#'
+#' This function performs a two-sample test for persistent homology data using
+#' the theory of permutation hypothesis testing. The input data can take on
+#' various forms:
+#' - A persistence set, which is a collection of persistence diagrams.
+#' - A distance matrix, which is a pairwise distance matrix between persistence
+#' diagrams.
+#' - One of the PH vectorizations available in the {tdarec} package (not yet
+#' supported; such inputs currently raise an error).
+#'
+#' @param x An object of class `persistence_set` typically produced by
+#'   [`phutil::as_persistence_set()`] or of class `dist` typically produced by
+#'   [`phutil::bottleneck_pairwise_distances()`] or
+#'   [`phutil::wasserstein_pairwise_distances()`]. If `x` is a persistence set,
+#'   then `y` must be either a vector of two integers (sample sizes) or another
+#'   persistence set. If `x` is a distance matrix, then `y` must be a vector of
+#'   two integers (sample sizes).
+#' @param y An object of class `persistence_set` typically produced by
+#'   [`phutil::as_persistence_set()`] or a vector of two integers. If `x` is a
+#'   persistence set, then `y` must be either a vector of two integers (sample
+#'   sizes) or another persistence set. If `x` is a distance matrix, then `y`
+#'   must be a vector of two integers (sample sizes).
+#' @param dimension An integer value specifying the homology dimension to use.
+#'   Defaults to `0L`, which corresponds to the 0-dimensional homology.
+#' @param p An integer value specifying the p-norm to use for the Wasserstein
+#'   distance. Defaults to `2L`, which corresponds to the Euclidean distance. If
+#'   `p` is set to `Inf`, then the Bottleneck distance is used.
+#' @param ncores An integer value specifying the number of cores to use when
+#'   computing the pairwise distance matrix between all combined persistence
+#'   diagrams. Defaults to `1L`, which means that the computation is done
+#'   sequentially.
+#' @param B An integer value specifying the number of permutations to use for
+#'   the permutation hypothesis test. Defaults to `1000L`.
+#' @param npc A string specifying the non-parametric combination method to use.
+#'   Choices are either `"tippett"` (default) or `"fisher"`. The former
+#'   corresponds to Tippett's method, while the latter corresponds to
+#'   Fisher's method. Any other value raises an error.
+#' @param verbose A boolean value indicating whether to print some information
+#'   about the progress of the computation. Defaults to `FALSE`.
+#' @param seed A value passed as the `seed` argument of
+#'   [`flipr::PlausibilityFunction`] when the plausibility function is created.
+#'   Defaults to `1234`, the value that was previously hard-coded.
+#'
+#' @returns A p-value from the two-sample test where the null hypothesis is that
+#'   the two samples come from the same distribution.
+#'
+#' @export
+#' @examples
+#' two_sample_test(trefoils1[1:5], trefoils2[1:5], B = 100L)
+#' two_sample_test(trefoils1[1:5], archspirals[1:5], B = 100L)
+two_sample_test <- function(
+  x,
+  y,
+  dimension = 0L,
+  p = 2L,
+  ncores = 1L,
+  B = 1000L,
+  npc = "tippett",
+  verbose = FALSE,
+  seed = 1234
+) {
+  # Reject anything other than the two documented combining methods up front.
+  npc <- match.arg(npc, choices = c("tippett", "fisher"))
+
+  if (verbose) {
+    cli::cli_alert_info("Parsing inputs...")
+  }
+  # Whatever the input form, reduce it to a pairwise distance matrix over the
+  # pooled observations plus the two sample sizes.
+  inputs <- parse_inputs(
+    x = x,
+    y = y,
+    dimension = dimension,
+    p = p,
+    ncores = ncores
+  )
+
+  if (verbose) {
+    cli::cli_alert_info("Setting up the plausibility function...")
+  }
+  # We could use alternative statistics for PH vectorizations
+  # The unnamed arguments are the data: the distance matrix and the size of
+  # the first sample.
+  pf <- flipr::PlausibilityFunction$new(
+    null_spec = null_spec,
+    stat_functions = list(flipr::stat_t_ip, flipr::stat_f_ip),
+    stat_assignments = list(mean = 1, sd = 2),
+    inputs$D,
+    inputs$sample_sizes[1],
+    seed = seed
+  )
+  pf$alternative <- "right_tail"
+  pf$nperms <- B
+  pf$aggregator <- npc
+
+  if (verbose) {
+    cli::cli_alert_info("Calculating the p-value...")
+  }
+  # null_spec() returns its data argument unchanged, so the parameter values
+  # given here are presumably placeholders (TODO confirm against flipr).
+  pf$get_value(c(0, 1))
+}
@@ -0,0 +1,68 @@
+# Null specification supplied to flipr::PlausibilityFunction by
+# two_sample_test(): returns `y` unchanged and ignores `parameters`.
+null_spec <- function(y, parameters) {
+  y
+}
+
+# Pairwise distances between the persistence diagrams in `x` for the given
+# homology `dimension`, computed by phutil on `ncores` cores.
+# A finite `p` selects the Wasserstein distance of order `p`; an infinite `p`
+# selects the Bottleneck distance.
+compute_distance_matrix <- function(x, dimension = 0L, p = 2L, ncores = 1L) {
+  if (!is.infinite(p)) {
+    return(phutil::wasserstein_pairwise_distances(
+      x = x,
+      validate = TRUE,
+      dimension = dimension,
+      ncores = ncores,
+      p = p
+    ))
+  }
+
+  phutil::bottleneck_pairwise_distances(
+    x = x,
+    validate = TRUE,
+    dimension = dimension,
+    ncores = ncores
+  )
+}
+
+# Checks whether `y` can be read as a pair of sample sizes: two finite whole
+# numbers, stored either as integers or as doubles such as `c(5, 5)`.
+is_sample_size_pair <- function(y) {
+  is.numeric(y) &&
+    length(y) == 2L &&
+    all(is.finite(y)) &&
+    all(y == round(y)) &&
+    all(abs(y) <= .Machine$integer.max)
+}
+
+# Coerces a pair of sample sizes to integer and aborts unless both are positive
+# and, when the number of observations `n` is known, they add up to `n`.
+check_sample_sizes <- function(sample_sizes, n = NULL) {
+  sample_sizes <- as.integer(sample_sizes)
+  if (any(sample_sizes < 1L)) {
+    cli::cli_abort("The sample sizes in {.arg y} must be positive.")
+  }
+  if (!is.null(n) && sum(sample_sizes) != n) {
+    cli::cli_abort(
+      "The sample sizes in {.arg y} must sum to the number of observations ({n}), not {sum(sample_sizes)}."
+    )
+  }
+  sample_sizes
+}
+
+# Normalizes the inputs of two_sample_test() into a list with two elements:
+# - `D`: the pairwise distance matrix over the pooled observations;
+# - `sample_sizes`: an integer vector with the sizes of the two samples.
+#
+# Accepted combinations:
+# - `x` a persistence set (or a list convertible to one) holding both samples
+#   and `y` a pair of sample sizes;
+# - `x` and `y` both persistence sets (or lists convertible to one), in which
+#   case they are pooled and the sizes are their lengths;
+# - `x` a `dist` object and `y` a pair of sample sizes.
+# Anything else aborts with an explanatory error.
+parse_inputs <- function(x, y, dimension = 0L, p = 2L, ncores = 1L) {
+  if (is.list(x)) x <- phutil::as_persistence_set(x)
+  if (is.list(y)) y <- phutil::as_persistence_set(y)
+
+  if (inherits(x, "persistence_set")) {
+    if (is_sample_size_pair(y)) {
+      # Validate before the (potentially expensive) distance computation.
+      sample_sizes <- check_sample_sizes(y, n = length(x))
+    } else if (inherits(y, "persistence_set")) {
+      sample_sizes <- c(length(x), length(y))
+      x <- phutil::as_persistence_set(c(x, y))
+    } else {
+      cli::cli_abort(
+        "When the first argument {.arg x} is of class {.cls persistence_set}, the second argument {.arg y} must be either a vector of two integers (sample sizes) or another persistence set."
+      )
+    }
+    D <- compute_distance_matrix(
+      x,
+      dimension = dimension,
+      p = p,
+      ncores = ncores
+    )
+  } else if (inherits(x, "dist")) {
+    if (!is_sample_size_pair(y)) {
+      cli::cli_abort(
+        "When the first argument {.arg x} is of class {.cls dist}, the second argument {.arg y} must be a vector of two integers (sample sizes)."
+      )
+    }
+    # `dist` objects record the number of observations in their "Size"
+    # attribute; the check is skipped if that attribute is absent.
+    sample_sizes <- check_sample_sizes(y, n = attr(x, "Size"))
+    D <- x
+  } else {
+    # TO DO: add support for PH vectorizations
+    cli::cli_abort(
+      "The first argument {.arg x} must be either a persistence set or a distance matrix (of class {.cls dist})."
+    )
+  }
+  list(D = D, sample_sizes = sample_sizes)
+}
@@ -0,0 +1,27 @@
+library(phutil)
+
+# Number of persistence diagrams in each toy data set.
+n <- 24L
+
+# Vietoris-Rips persistence of a point cloud with the settings shared by all
+# toy data sets (homology up to dimension 2, maximum scale 6).
+rips_persistence <- function(points) {
+  as_persistence(TDA::ripsDiag(points, maxdimension = 2, maxscale = 6))
+}
+
+# One persistence set built from `n` noisy trefoil knot samples of 120 points.
+simulate_trefoils <- function(n) {
+  as_persistence_set(lapply(seq_len(n), function(i) {
+    rips_persistence(tdaunif::sample_trefoil(n = 120L, sd = .05))
+  }))
+}
+
+# Both trefoil sets are drawn inside a single seeded block. Seeding each set
+# separately with the same seed replays the same random draws, which makes
+# `trefoils2` an exact copy of `trefoils1` rather than a second sample.
+# `trefoils1` is still drawn first from seed 28415, so it is unchanged.
+withr::with_seed(28415, {
+  trefoils1 <- simulate_trefoils(n)
+  trefoils2 <- simulate_trefoils(n)
+})
+
+# Planar 2-armed Archimedean spirals, embedded in 3D by appending a zero
+# z-coordinate before noise is added.
+withr::with_seed(28415, {
+  archspirals <- as_persistence_set(lapply(seq_len(n), function(i) {
+    spiral <- cbind(tdaunif::sample_arch_spiral(n = 120L, arms = 2L), 0)
+    rips_persistence(tdaunif::add_noise(spiral, sd = .05))
+  }))
+})
+
+usethis::use_data(trefoils1, trefoils2, archspirals, overwrite = TRUE)
@@ -0,0 +1,47 @@
+# Pair each persistence diagram with a factor identifying its sample.
+# Group sizes are read from the data sets themselves, so the script does not
+# rely on an `n` defined elsewhere, and tibble is called by namespace because
+# it is not attached in this script.
+sim1 <- tibble::tibble(
+  dgms = c(trefoils1, trefoils2),
+  id = factor(rep(c(1, 2), times = c(length(trefoils1), length(trefoils2))))
+)
+sim2 <- tibble::tibble(
+  dgms = c(trefoils1, archspirals),
+  id = factor(rep(c(1, 2), times = c(length(trefoils1), length(archspirals))))
+)
+
+library(infer)
+
+# Work-in-progress variant of a "specify" step assembled from infer internals
+# (appears to mirror `infer::specify()` -- TODO confirm against infer).
+#
+# `x` must be a data frame; `response` and `explanatory` are captured unquoted,
+# `formula` and `success` are forwarded to the infer helpers. The result keeps
+# only the response and explanatory columns and carries the infer class and
+# bookkeeping attributes set below.
+my_specify <- function(
+  x,
+  formula,
+  response = NULL,
+  explanatory = NULL,
+  success = NULL
+) {
+  infer:::check_type(x, is.data.frame)
+  x <- infer:::standardize_variable_types(x)
+  response <- rlang::enquo(response)
+  explanatory <- rlang::enquo(explanatory)
+  x <- infer:::parse_variables(x, formula, response, explanatory)
+  attr(x, "success") <- success
+  attr(x, "generated") <- FALSE
+  attr(x, "hypothesized") <- FALSE
+  attr(x, "fitted") <- FALSE
+  infer:::check_success_arg(x, success)
+  # dplyr is not attached in this script, so its verbs are called by namespace.
+  kept_columns <- c(infer:::response_name(x), infer:::explanatory_name(x))
+  x <- dplyr::select(x, dplyr::any_of(kept_columns))
+  # Row filtering on missing values is currently disabled:
+  # is_complete <- stats::complete.cases(x)
+  # if (!all(is_complete)) {
+  #   x <- dplyr::filter(x, is_complete)
+  #   cli_warn("Removed {sum(!is_complete)} rows containing missing values.")
+  # }
+  infer:::append_infer_class(x)
+}
+
+# Experiment: push persistence diagrams through an infer-style permutation
+# pipeline, using my_specify() in place of the specify step.
+sim1 |>
+  my_specify(response = dgms, explanatory = id) |>
+  hypothesize(null = "independence") |>
+  generate(reps = 10, type = "permute") |>
+  calculate(stat = "diff in means", order = c("1", "2")) |>
+  visualize()
+
+# Same comparisons with the package's own test. The spiral data set is named
+# `archspirals`; `arch_spirals` does not exist.
+two_sample_test(trefoils1, trefoils2, ncores = 8L)
+two_sample_test(trefoils1, archspirals, ncores = 8L)
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`# Generated by roxygen2: do not edit by hand`
`2`	`2`
	`3`	`+export(two_sample_test)`