Skip to content

Commit 15b8cf8

Browse files
committed
A first working version with two sample test in the space of diagrams.
1 parent 155cc83 commit 15b8cf8

16 files changed

Lines changed: 1013 additions & 4 deletions

.Rbuildignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
^inphr\.Rproj$
22
^\.Rproj\.user$
3+
^LICENSE\.md$
4+
^data-raw$

DESCRIPTION

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
11
Package: inphr
2-
Title: What the Package Does (One Line, Title Case)
2+
Title: Inference for Persistence Homology Data in R
33
Version: 0.0.0.9000
44
Authors@R:
55
person("Aymeric", "Stamm", , "aymeric.stamm@cnrs.fr", role = c("aut", "cre"),
66
comment = c(ORCID = "0000-0002-8725-3654"))
7-
Description: What the package does (one paragraph).
8-
License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
9-
license
7+
Description: A set of functions for performing null hypothesis testing on
8+
samples of persistence diagrams using the theory of permutations. Currently,
9+
only two-sample testing is implemented. Inputs can be either samples of
10+
persistence diagrams themselves or vectorizations. In the former case, they
11+
are embedded in a metric space using either the Bottleneck or Wasserstein
12+
distance. In the former case, persistence data becomes functional data and
13+
inference is performed using tools available in the {fdatest} package.
14+
License: GPL (>= 3)
1015
Encoding: UTF-8
1116
Roxygen: list(markdown = TRUE)
1217
RoxygenNote: 7.3.2
1318
URL: https://github.com/tdaverse/inphr
1419
BugReports: https://github.com/tdaverse/inphr/issues
20+
Imports:
21+
cli,
22+
flipr,
23+
phutil
24+
Depends:
25+
R (>= 3.5)
26+
LazyData: true

LICENSE.md

Lines changed: 595 additions & 0 deletions
Large diffs are not rendered by default.

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export(two_sample_test)

R/data.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#' Persistence diagrams from trefoil knot samples (first set)
2+
#'
3+
#' @description A set of 24 persistence diagrams computed from noisy samples of
4+
#' trefoil knots. Each sample consists of 120 points sampled from a trefoil
5+
#' knot with Gaussian noise (sd = 0.05) added. Vietoris-Rips persistence was
6+
#' computed up to dimension 2 with maximum scale 6 using [`TDA::ripsDiag()`].
7+
#' Generated with seed `28415`.
8+
#'
9+
#' @format An object of class `persistence_set` containing 24 objects of class
10+
#' [`phutil::persistence`].
11+
"trefoils1"
12+
13+
#' Persistence diagrams from trefoil knot samples (second set)
14+
#'
15+
#' @description A set of 24 persistence diagrams computed from noisy samples of
16+
#' trefoil knots. Each sample consists of 120 points sampled from a trefoil
17+
#' knot with Gaussian noise (sd = 0.05) added. Vietoris-Rips persistence was
18+
#' computed up to dimension 2 with maximum scale 6 using [`TDA::ripsDiag()`].
19+
#' Generated with seed `28415`.
20+
#'
21+
#' @format An object of class `persistence_set` containing 24 objects of class
22+
#' [`phutil::persistence`].
23+
"trefoils2"
24+
25+
#' Persistence diagrams from Archimedean spiral samples
26+
#'
27+
#' @description A set of 24 persistence diagrams computed from noisy samples of
28+
#' 2-armed Archimedean spirals. Each sample consists of 120 points sampled
29+
#' from an Archimedean spiral, embedded in 3D with a zero z-coordinate, then
30+
#' Gaussian noise (sd = 0.05) added. Vietoris-Rips persistence was computed up
31+
#' to dimension 2 with maximum scale 6 using [`TDA::ripsDiag()`]. Generated
32+
#' with seed `28415`.
33+
#'
34+
#' @format An object of class `persistence_set` containing 24 objects of class
35+
#' [`phutil::persistence`].
36+
"archspirals"

R/two-sample-test.R

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#' Two-Sample Test for Persistence Homology Data
2+
#'
3+
#' This function performs a two-sample test for persistence homology data using
4+
#' the theory of permutation hypothesis testing. The input data can take on
5+
#' various forms:
6+
#' - A persistence set, which is a collection of persistence diagrams.
7+
#' - A distance matrix, which is a pairwise distance matrix between persistence
8+
#' diagrams.
9+
#' - One of the PH vectorizations available in the [{tdarec}]() package.
10+
#'
11+
#' @param x An object of class `persistence_set` typically produced by
12+
#' [`phutil::as_persistence_set()`] or of class `dist` typically produced by
13+
#' [`phutil::bottleneck_pairwise_distances()`] or
14+
#' [`phutil::wasserstein_pairwise_distances()`]. If `x` is a persistence set,
15+
#' then `y` must be either a vector of two integers (sample sizes) or another
16+
#' persistence set. If `x` is a distance matrix, then `y` must be a vector of
17+
#' two integers (sample sizes).
18+
#' @param y An object of class `persistence_set` typically produced by
19+
#' [`phutil::as_persistence_set()`] or a vector of two integers. If `x` is a
20+
#' persistence set, then `y` must be either a vector of two integers (sample
21+
#' sizes) or another persistence set. If `x` is a distance matrix, then `y`
22+
#' must be a vector of two integers (sample sizes).
23+
#' @param dimension An integer value specifying the homology dimension to use.
24+
#' Defaults to `0L`, which corresponds to the 0-dimensional homology.
25+
#' @param p An integer value specifying the p-norm to use for the Wasserstein
26+
#' distance. Defaults to `2L`, which corresponds to the Euclidean distance. If
27+
#' `p` is set to `Inf`, then the Bottleneck distance is used.
28+
#' @param ncores An integer value specifying the number of cores to use when
29+
#' computing the pairwise distance matrix between all combined persistence
30+
#' diagrams. Defaults to `1L`, which means that the computation is done
31+
#' sequentially.
32+
#' @param B An integer value specifying the number of permutations to use for
33+
#' the permutation hypothesis test. Defaults to `1000L`.
34+
#' @param npc A string specifying the non-parametric combination method to use.
35+
#' Choices are either `"tippett"` (default) or `"fisher"`. The former
36+
#' corresponds to the Tippet's method, while the latter corresponds to
37+
#' Fisher's method.
38+
#' @param verbose A boolean value indicating whether to print some information
39+
#' about the progress of the computation. Defaults to `FALSE`.
40+
#'
41+
#' @returns A p-value from the two-sample test where the null hypothesis is that
42+
#' the two samples come from the same distribution.
43+
#'
44+
#' @export
45+
#' @examples
46+
#' two_sample_test(trefoils1[1:5], trefoils2[1:5], B = 100L)
47+
#' two_sample_test(trefoils1[1:5], archspirals[1:5], B = 100L)
48+
two_sample_test <- function(
49+
x,
50+
y,
51+
dimension = 0L,
52+
p = 2L,
53+
ncores = 1L,
54+
B = 1000L,
55+
npc = "tippett",
56+
verbose = FALSE
57+
) {
58+
if (verbose) cli::cli_alert_info("Parsing inputs...")
59+
l <- parse_inputs(
60+
x = x,
61+
y = y,
62+
dimension = dimension,
63+
p = p,
64+
ncores = ncores
65+
)
66+
D <- l$D
67+
sample_sizes <- l$sample_sizes
68+
if (verbose) cli::cli_alert_info("Setting up the plausibility function...")
69+
# We could use alternative statistics for PH vectorizations
70+
pf <- flipr::PlausibilityFunction$new(
71+
null_spec = null_spec,
72+
stat_functions = list(flipr::stat_t_ip, flipr::stat_f_ip),
73+
stat_assignments = list(mean = 1, sd = 2),
74+
D,
75+
sample_sizes[1],
76+
seed = 1234
77+
)
78+
pf$alternative <- "right_tail"
79+
pf$nperms <- B
80+
pf$aggregator <- npc
81+
if (verbose) cli::cli_alert_info("Calculating the p-value...")
82+
pf$get_value(c(0, 1))
83+
}

R/utils.R

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
null_spec <- function(y, parameters) {
2+
return(y)
3+
}
4+
5+
compute_distance_matrix <- function(x, dimension = 0L, p = 2L, ncores = 1L) {
6+
if (is.infinite(p)) {
7+
D <- phutil::bottleneck_pairwise_distances(
8+
x = x,
9+
validate = TRUE,
10+
dimension = dimension,
11+
ncores = ncores
12+
)
13+
} else {
14+
D <- phutil::wasserstein_pairwise_distances(
15+
x = x,
16+
validate = TRUE,
17+
dimension = dimension,
18+
ncores = ncores,
19+
p = p
20+
)
21+
}
22+
D
23+
}
24+
25+
parse_inputs <- function(x, y, dimension = 0L, p = 2L, ncores = 1L) {
26+
if (is.list(x)) x <- phutil::as_persistence_set(x)
27+
if (is.list(y)) y <- phutil::as_persistence_set(y)
28+
29+
if (inherits(x, "persistence_set")) {
30+
if (is.integer(y) && length(y) == 2L) {
31+
D <- compute_distance_matrix(
32+
x,
33+
dimension = dimension,
34+
p = p,
35+
ncores = ncores
36+
)
37+
sample_sizes <- y
38+
} else if (inherits(y, "persistence_set")) {
39+
sample_sizes <- c(length(x), length(y))
40+
x <- phutil::as_persistence_set(c(x, y))
41+
D <- compute_distance_matrix(
42+
x,
43+
dimension = dimension,
44+
p = p,
45+
ncores = ncores
46+
)
47+
} else {
48+
cli::cli_abort(
49+
"When the first argument {.arg x} is of class {.cls persistence_set}, the second argument {.arg y} must be either a vector of two integers (sample sizes) or another persistence set."
50+
)
51+
}
52+
} else if (inherits(x, "dist")) {
53+
if (is.integer(y) && length(y) == 2L) {
54+
D <- x
55+
sample_sizes <- y
56+
} else {
57+
cli::cli_abort(
58+
"When the first argument {.arg x} is of class {.cls dist}, the second argument {.arg y} must be a vector of two integers (sample sizes)."
59+
)
60+
}
61+
} else {
62+
# TO DO: add support for PH vectorizations
63+
cli::cli_abort(
64+
"The first argument {.arg x} must be either a persistence set or a distance matrix (of class {.cls dist})."
65+
)
66+
}
67+
list(D = D, sample_sizes = sample_sizes)
68+
}

data-raw/toydata.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
library(phutil)
2+
3+
n <- 24L
4+
5+
withr::with_seed(28415, {
6+
trefoils1 <- as_persistence_set(lapply(seq(n), function(i) {
7+
S1 <- tdaunif::sample_trefoil(n = 120L, sd = .05)
8+
as_persistence(TDA::ripsDiag(S1, maxdimension = 2, maxscale = 6))
9+
}))
10+
})
11+
12+
withr::with_seed(28415, {
13+
trefoils2 <- as_persistence_set(lapply(seq(n), function(i) {
14+
S1 <- tdaunif::sample_trefoil(n = 120L, sd = .05)
15+
as_persistence(TDA::ripsDiag(S1, maxdimension = 2, maxscale = 6))
16+
}))
17+
})
18+
19+
withr::with_seed(28415, {
20+
archspirals <- as_persistence_set(lapply(seq(n), function(i) {
21+
S2 <- cbind(tdaunif::sample_arch_spiral(n = 120L, arms = 2L), 0)
22+
S2 <- tdaunif::add_noise(S2, sd = .05)
23+
as_persistence(TDA::ripsDiag(S2, maxdimension = 2, maxscale = 6))
24+
}))
25+
})
26+
27+
usethis::use_data(trefoils1, trefoils2, archspirals, overwrite = TRUE)

data-raw/wip.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
sim1 <- tibble(
2+
dgms = c(trefoils1, trefoils2),
3+
id = as.factor(c(rep(1, n), rep(2, n)))
4+
)
5+
sim2 <- tibble(
6+
dgms = c(trefoils1, arch_spirals),
7+
id = as.factor(c(rep(1, n), rep(2, n)))
8+
)
9+
10+
library(infer)
11+
12+
my_specify <- function(
13+
x,
14+
formula,
15+
response = NULL,
16+
explanatory = NULL,
17+
success = NULL
18+
) {
19+
infer:::check_type(x, is.data.frame)
20+
x <- infer:::standardize_variable_types(x)
21+
response <- rlang::enquo(response)
22+
explanatory <- rlang::enquo(explanatory)
23+
x <- infer:::parse_variables(x, formula, response, explanatory)
24+
attr(x, "success") <- success
25+
attr(x, "generated") <- FALSE
26+
attr(x, "hypothesized") <- FALSE
27+
attr(x, "fitted") <- FALSE
28+
infer:::check_success_arg(x, success)
29+
x <- x %>%
30+
select(any_of(c(infer:::response_name(x), infer:::explanatory_name(x))))
31+
# is_complete <- stats::complete.cases(x)
32+
# if (!all(is_complete)) {
33+
# x <- dplyr::filter(x, is_complete)
34+
# cli_warn("Removed {sum(!is_complete)} rows containing missing values.")
35+
# }
36+
infer:::append_infer_class(x)
37+
}
38+
39+
sim1 |>
40+
my_specify(response = dgms, explanatory = id) |>
41+
hypothesize(null = "independence") |>
42+
generate(reps = 10, type = "permute") |>
43+
calculate(stat = "diff in means", order = c("1", "2")) |>
44+
visualize()
45+
46+
two_sample_test(trefoils1, trefoils2, ncores = 8L)
47+
two_sample_test(trefoils1, arch_spirals, ncores = 8L)

data/archspirals.rda

27.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)