update of case study 5

snaketron · snaketron · commit 309f5468c974 · 2024-03-26T23:50:43.000+01:00
diff --git a/data/d_zibb_5.RData b/data/d_zibb_5.RData
diff --git a/inst/scripts/d_zibb_5.R b/inst/scripts/d_zibb_5.R
@@ -0,0 +1,149 @@
+require(rstan)
+
+# Stan generative model
+sim_stan <- "
+functions {
+  int zibb_rng(int y, int n, real mu, real phi, real kappa) {
+    if (bernoulli_rng(kappa) == 1) {
+      return (0);
+    } else {
+      return (beta_binomial_rng(n, mu * phi, (1 - mu) * phi));
+    }
+  }
+}
+
+data {
+  int<lower=0> N_sample;                   // number of repertoires
+  int<lower=0> N_gene;                     // gene
+  int<lower=0> N_individual;               // number of individuals
+  int<lower=0> N_condition;                // number of conditions
+  array [N_sample] int N;                  // repertoire size
+  array [N_sample] int condition_id;   // id of conditions
+  array [N_sample] int individual_id;      // id of replicate
+  real <lower=0> phi;
+  real <lower=0, upper=1> kappa;
+  array [N_condition] vector [N_gene] beta_condition;
+  vector <lower=0> [N_condition] sigma_condition;
+  real <lower=0> sigma_alpha;
+}
+
+generated quantities {
+  vector [N_gene] alpha;
+  array [N_individual] vector [N_gene] alpha_individual;
+  array [N_sample] vector [N_gene] beta_individual;
+  // generate usage
+  array [N_sample] vector <lower=0, upper=1> [N_gene] theta;
+  array [N_gene, N_sample] int Y;
+  
+  for(i in 1:N_gene) {
+    alpha[i] = normal_rng(-3, 0.5);
+  }
+  
+  for(i in 1:N_sample) {
+    for(j in 1:N_gene) {
+      
+      alpha_individual[individual_id[i]][j] = normal_rng(alpha[j], sigma_alpha);
+      
+      beta_individual[i][j] = normal_rng(beta_condition[condition_id[i]][j], sigma_condition[condition_id[i]]);
+      
+      theta[i][j] = inv_logit(alpha_individual[individual_id[i]][j] + beta_individual[i][j]);
+      Y[j, i] = zibb_rng(Y[j, i], N[i], theta[i][j], phi, kappa);
+    }
+  }
+}
+"
+
+# compile model
+model <- rstan::stan_model(model_code = sim_stan)
+
+
+
+# generate data based on the following parameters parameters
+set.seed(11132)
+N_gene <- 8
+N_individual <- 10
+N_condition <- 2
+N_sample <- N_individual*N_condition
+
+condition_id <- rep(x = 1:N_condition, each = N_individual) 
+
+N <- rep(x = 1000, times = N_sample)
+
+individual_id <- rep(x = 1:N_individual, times = N_condition)
+
+phi <- 200
+kappa <- 0.02
+
+beta_condition <- array(data = 0, dim = c(N_condition, N_gene))
+for(c in 1:N_condition) {
+  for(g in 1:N_gene) {
+    u <- runif(n = 1, min = 0, max = 1)
+    if(u <= 0.8) {
+      beta_condition[c,g] <- rnorm(n = 1, mean = 0, sd = 0.1)
+    } else {
+      beta_condition[c,g] <- rnorm(n = 1, mean = 0, sd = 2)
+    }
+  }
+}
+
+sigma_condition <- rep(x = 0.5, times = N_condition)
+sigma_alpha <- 0.25
+
+
+l <- list(N_sample = N_sample, 
+          N_gene = N_gene, 
+          N_individual = N_individual,
+          N_condition = N_condition,
+          N = N,
+          condition_id = condition_id,
+          individual_id = individual_id,
+          phi = phi,
+          kappa = kappa,
+          beta_condition = beta_condition,
+          sigma_condition = sigma_condition,
+          sigma_alpha = sigma_alpha)
+
+# simulate
+sim <- rstan::sampling(object = model,
+                       data = l, 
+                       iter = 1, 
+                       chains = 1, 
+                       algorithm="Fixed_param")
+
+# extract simulation and convert into data frame which can 
+# be used as input of IgGeneUsage
+ysim <- rstan::extract(object = sim, par = "Y")$Y
+ysim <- ysim[1,,]
+
+ysim_df <- reshape2::melt(ysim)
+colnames(ysim_df) <- c("gene_name", "sample_id", "gene_usage_count")
+
+m <- data.frame(sample_id = 1:l$N_sample,
+                individual_id = l$individual_id,
+                condition_id = l$condition_id)
+
+ysim_df <- merge(x = ysim_df, y = m, by = "sample_id", all.x = T)
+
+ysim_df$condition <- paste0("C_", ysim_df$condition_id)
+ysim_df$gene_name <- paste0("G_", ysim_df$gene_name)
+ysim_df$individual_id <- paste0("I_", ysim_df$individual_id)
+
+ysim_df$condition_id <- NULL
+ysim_df$sample_id <- NULL
+ysim_df <- ysim_df[, c("individual_id", "condition", "gene_name",
+                       "gene_usage_count")]
+
+d_zibb_5 <- ysim_df
+
+# save
+save(d_zibb_5, file = "data/d_zibb_5.RData", compress = T)
+
+
+# save(sim, file = "mytests/sim_d_zibb_4.RData", compress = T)
+ggplot(data = d_zibb_5)+
+  facet_wrap(facets = ~gene_name, scales = "free_y")+
+  geom_point(aes(x = condition, y = gene_usage_count, group = individual_id))+
+  geom_line(aes(x = condition, y = gene_usage_count, group = individual_id))+
+  theme_bw(base_size = 10)+
+  theme(legend.position = "none")
+
diff --git a/inst/stan/dgu_paired.stan b/inst/stan/dgu_paired.stan
@@ -72,7 +72,7 @@ transformed parameters {
   for(i in 1:N_individual) {
     alpha_individual[i] = alpha + sigma_alpha * z_alpha_individual[i];
     for(j in 1:N_condition) {
-      beta_individual[i,j]  = beta_condition[j] + sigma_individual[j] * z_beta_individual[i,j];
+      beta_individual[i,j] = beta_condition[j] + sigma_individual[j] * z_beta_individual[i,j];
     }
   }
   
diff --git a/man/d_zibb_5.Rd b/man/d_zibb_5.Rd
@@ -0,0 +1,39 @@
+\name{d_zibb_5}
+\alias{d_zibb_5}
+\docType{data}
+\title{Simulated Ig gene usage data}
+
+\description{
+A small example of paired-sample IRRs with these features:
+
+  \itemize{
+    \item 2 conditions
+    \item 10 individuals with one IRRs per condition
+    \item 8 Ig genes
+  }
+This dataset was simulated from zero-inflated beta-binomial (ZIBB) 
+distribution. Simulation code is available in inst/scripts/d_zibb_5.R
+}
+
+\usage{
+data("d_zibb_5", package = "IgGeneUsage")
+}
+
+\format{
+A data frame with 4 columns:  
+\itemize{
+  \item "individual_id"
+  \item "condition"
+  \item "gene_name"
+  \item "gene_name_count"
+}
+This format is accepted by IgGeneUsage.
+}
+\source{
+Simulation code is provided in inst/scripts/d_zibb_5.R
+}
+\examples{
+data("d_zibb_5", package = "IgGeneUsage")
+head(d_zibb_5)
+}
+\keyword{d_zibb_5}
diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd
@@ -548,24 +548,13 @@ g2 <- ggplot(data = stats)+
 # Case Study C: analyzing paired-IRRs
 
 ```{r}
-require(IgGeneUsage)
-require(rstan)
-require(knitr)
-require(ggplot2)
-require(ggforce)
-require(ggrepel)
-require(reshape2)
-require(patchwork)
-```
-
-```{r}
-data("d_zibb_3", package = "IgGeneUsage")
+data("d_zibb_5", package = "IgGeneUsage")
 ```
 
 
-```{r, fig.width=6, fig.height=6}
-ggplot(data = d_zibb_3)+
-  facet_wrap(facets = ~gene_name)+
+```{r, fig.width=6, fig.height=4}
+ggplot(data = d_zibb_5)+
+  facet_wrap(facets = ~gene_name, ncol = 4, scales = "free_y")+
   geom_line(aes(x = condition, y = gene_usage_count, group = individual_id))+
   geom_point(aes(x = condition, y = gene_usage_count, col = condition))+
   theme_bw(base_size = 11)+
@@ -575,12 +564,11 @@ ggplot(data = d_zibb_3)+
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
-
 ```{r}
-M <- DGU(ud = d_zibb_3, # input data
+M <- DGU(ud = d_zibb_5, # input data
          mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500)
          mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500)
-         mcmc_chains = 3, # how many MCMC chain to run (default: 4)
+         mcmc_chains = 1, # how many MCMC chain to run (default: 4)
          mcmc_cores = 3, # how many PC cores to use? (e.g. parallel chains)
          hdi_lvl = 0.95, # highest density interval level (de fault: 0.95)
          adapt_delta = 0.8, # MCMC target acceptance rate (default: 0.95)
@@ -590,16 +578,15 @@ M <- DGU(ud = d_zibb_3, # input data
 
 
 ```{r, fig.height = 3, fig.width = 6}
-rstan::stan_rhat(object = M$fit)|
-  rstan::stan_ess(object = M$fit)
+rstan::stan_rhat(object = M$fit)|rstan::stan_ess(object = M$fit)
 ```
 
 
 ```{r, fig.weight = 7, fig.height = 4}
 g1 <- ggplot(data = M$gu)+
   geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L,
                     ymax = prob_H, col = condition),
-                width = 0.1, position = position_dodge(width = 0.4))+
+                width = 0.01, position = position_dodge(width = 0.4))+
   geom_point(aes(x = gene_name, y = prob_mean, col = condition), size = 1,
              position = position_dodge(width = 0.4))+
   theme_bw(base_size = 11)+
@@ -616,7 +603,7 @@ g2 <- ggplot(data = stats)+
   facet_wrap(facets = ~contrast)+
   geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+
   geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H), 
-                col = "darkgray")+
+                col = "darkgray", width = 0.01)+
   geom_point(aes(x = pmax, y = es_mean, col = contrast))+
   geom_text_repel(data = stats[stats$pmax >= 0.9, ],
                   aes(x = pmax, y = es_mean, label = gene_fac),
@@ -635,6 +622,61 @@ g2 <- ggplot(data = stats)+
 
 
 
+<!-- ```{r} -->
+<!-- Mu <- DGU(ud = d_zibb_5, # input data -->
+<!--          mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500) -->
+<!--          mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500) -->
+<!--          mcmc_chains = 3, # how many MCMC chain to run (default: 4) -->
+<!--          mcmc_cores = 3, # how many PC cores to use? (e.g. parallel chains) -->
+<!--          hdi_lvl = 0.95, # highest density interval level (de fault: 0.95) -->
+<!--          adapt_delta = 0.8, # MCMC target acceptance rate (default: 0.95) -->
+<!--          max_treedepth = 10, # tree depth evaluated at each step (default: 12) -->
+<!--          paired = FALSE) -->
+<!-- ``` -->
+
+
+
+<!-- ```{r, fig.weight = 7, fig.height = 4} -->
+<!-- g1 <- ggplot(data = Mu$gu)+ -->
+<!--   geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L, -->
+<!--                     ymax = prob_H, col = condition), -->
+<!--                 width = 0.01, position = position_dodge(width = 0.4))+ -->
+<!--   geom_point(aes(x = gene_name, y = prob_mean, col = condition), size = 1, -->
+<!--              position = position_dodge(width = 0.4))+ -->
+<!--   theme_bw(base_size = 11)+ -->
+<!--   theme(legend.position = "top")+ -->
+<!--   ylab(label = "GU [probability]")+ -->
+<!--   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4)) -->
+
+
+<!-- stats <- Mu$dgu -->
+<!-- stats <- stats[order(abs(stats$es_mean), decreasing = FALSE), ] -->
+<!-- stats$gene_fac <- factor(x = stats$gene_name, levels = unique(stats$gene_name)) -->
+
+<!-- g2 <- ggplot(data = stats)+ -->
+<!--   facet_wrap(facets = ~contrast)+ -->
+<!--   geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+ -->
+<!--   geom_errorbar(aes(x = pmax, y = es_mean, ymin = es_L, ymax = es_H),  -->
+<!--                 col = "darkgray", width = 0.01)+ -->
+<!--   geom_point(aes(x = pmax, y = es_mean, col = contrast))+ -->
+<!--   geom_text_repel(data = stats[stats$pmax >= 0.9, ], -->
+<!--                   aes(x = pmax, y = es_mean, label = gene_fac), -->
+<!--                   min.segment.length = 0, size = 2.75)+ -->
+<!--   theme_bw(base_size = 11)+ -->
+<!--   theme(legend.position = "top")+ -->
+<!--   xlab(label = expression(pi))+ -->
+<!--   xlim(c(0, 1))+ -->
+<!--   ylab(expression(gamma)) -->
+<!-- ``` -->
+
+
+<!-- ```{r, fig.height = 6, fig.width = 7} -->
+<!-- (g1/g2) -->
+<!-- ``` -->
+
+
+
+
 # Session
 
 ```{r}

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ transformed parameters {`
`72`	`72`	`for(i in 1:N_individual) {`
`73`	`73`	`alpha_individual[i] = alpha + sigma_alpha * z_alpha_individual[i];`
`74`	`74`	`for(j in 1:N_condition) {`
`75`		`- beta_individual[i,j] = beta_condition[j] + sigma_individual[j] * z_beta_individual[i,j];`
	`75`	`+ beta_individual[i,j] = beta_condition[j] + sigma_individual[j] * z_beta_individual[i,j];`
`76`	`76`	`}`
`77`	`77`	`}`
`78`	`78`