bugfixes

snaketron · snaketron · commit 918a9838accc · 2024-04-05T16:46:14.000+02:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: IgGeneUsage
 Type: Package
 Title: Differential gene usage in immune repertoires
-Version: 1.17.23
+Version: 1.17.24
 Authors@R: 
     person(given = "Simo",  
            family = "Kitanovski",
diff --git a/R/utils_ppc.R b/R/utils_ppc.R
@@ -104,8 +104,8 @@ get_ppc_condition <- function(glm,
   }
   
   # condition map
-  condition_map <- data.frame(condition_name = ud$condition_names,
-                              condition_id = ud$condition_id)
+  condition_map <- data.frame(condition_name = ud$condition_name_of_sample,
+                              condition_id = ud$condition_id_of_sample)
   condition_map <- condition_map[duplicated(condition_map)==FALSE,]
   rownames(condition_map) <- condition_map$condition_id
   yhat$gene_name <- ud$gene_names[yhat$gene_id]
diff --git a/R/utils_usage.R b/R/utils_usage.R
@@ -34,27 +34,28 @@ get_usage <- function(u) {
   check_paired <- function(u, 
                            has_balanced_replicates, 
                            has_replicates, 
-                           has_condition) {
-    if(has_condition==FALSE) {
+                           has_conditions) {
+    if(has_conditions==FALSE) {
       return(FALSE)
     }
     if(has_balanced_replicates==FALSE) {
       return(FALSE)
     }
     
     if(has_replicates) {
-      q <- u[duplicated(u[,c("individual_id","condition","replicate")])==FALSE,]
+      q <- u[duplicated(u[,c("individual_id","condition",
+                             "replicate_id")]) == FALSE,]
       q$f <- 1
-      q <- aggregate(f~individual_id+condition+replicate, data = q, 
-                     FUN = sum, simplify = FALSE, drop = FALSE)
+      q <- aggregate(f~individual_id+condition+replicate_id, 
+                     data = q, FUN = sum, drop = FALSE)
       q$f[is.null(q$f)|is.na(q$f)] <- 0
       return(ifelse(test = any(q$f!=1), yes = FALSE, no = TRUE))
     }
     else {
       q <- u[duplicated(u[,c("individual_id", "condition")])==FALSE,]
       q$f <- 1
-      q <- aggregate(f~individual_id+condition, data = q, FUN = sum, 
-                     simplify = FALSE, drop = FALSE)
+      q <- aggregate(f~individual_id+condition, 
+                     data = q, FUN = sum, drop = FALSE)
       q$f[is.null(q$f)|is.na(q$f)] <- 0
       return(ifelse(test = any(q$f!=1), yes = FALSE, no = TRUE))
     }
@@ -158,7 +159,7 @@ get_usage <- function(u) {
     u = u, 
     has_balanced_replicates = has_balanced_replicates, 
     has_replicates = has_replicates, 
-    has_condition = has_condition)
+    has_conditions = has_conditions)
   
   return(list(Y = Y, 
               N = as.numeric(N), 
diff --git a/data/d_zibb_3.RData b/data/d_zibb_3.RData
diff --git a/inst/scripts/d_zibb_3.R b/inst/scripts/d_zibb_3.R
@@ -21,17 +21,24 @@ data {
   array [N_individual] int condition_id;       // id of conditions
   real <lower=0> phi;
   real <lower=0, upper=1> kappa;
-  array [N_condition] vector [N_gene] beta_condition;
   vector <lower=0> [N_condition] sigma_individual;
+  vector <lower=0> [N_condition] sigma_condition;
 }
 
 generated quantities {
   vector [N_gene] alpha;
   array [N_individual] vector <lower=0, upper=1> [N_gene] theta;
   array [N_individual] vector [N_gene] beta_individual;
+  array [N_condition] vector [N_gene] beta_condition;
   // generate usage
   array [N_gene, N_individual] int Y;
   
+  for(i in 1:N_condition) {
+    for(j in 1:N_gene) {
+      beta_condition[i][j] = normal_rng(0, sigma_condition[i]);
+    }
+  }
+  
   for(i in 1:N_gene) {
     alpha[i] = normal_rng(-3.0, 1.0);
     
@@ -54,10 +61,10 @@ N_condition <- 3
 N_individual <- 5
 N_gene <- 8
 N <- 10^3
-sigma_individual <- runif(n = N_condition, min = 0.1, max = 0.6)
-beta_condition <- t(replicate(n = N_condition, expr = rnorm(n = N_gene, mean = 0, sd = 1)))
+sigma_individual <- runif(n = N_condition, min = 0.1, max = 0.2)
+sigma_condition <- runif(n = N_condition, min = 0.2, max = 0.6)
 phi <- 200
-kappa <- 0.03
+kappa <- 0.015
 
 condition_id <- rep(x = 1:N_condition, each = N_individual)
 
@@ -67,7 +74,7 @@ l <- list(N_individual = N_individual*N_condition,
           N = N,
           condition_id = condition_id,
           sigma_individual = sigma_individual,
-          beta_condition = beta_condition,
+          sigma_condition = sigma_condition,
           phi = phi,
           kappa = kappa)
 
@@ -77,7 +84,7 @@ sim <- rstan::sampling(object = m,
                        iter = 1, 
                        chains = 1, 
                        algorithm = "Fixed_param",
-                       seed = 123456)
+                       seed = 12346)
 
 
 # extract simulation and convert into data frame which can 
diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd
@@ -88,19 +88,19 @@ on the posterior distribution of $\gamma$, and are thus related.
 `r Biocpkg("IgGeneUsage")` has a couple of built-in Ig gene usage datasets. 
 Some were obtained from studies and others were simulated.
 
-Lets look into the simulated dataset `d_zibb_2`. This dataset was generated 
+Lets look into the simulated dataset `d_zibb_3`. This dataset was generated 
 by a zero-inflated beta-binomial (ZIBB) model, and `r Biocpkg("IgGeneUsage")` 
 was designed to fit ZIBB-distributed data.
 
 ```{r}
-data("d_zibb_2", package = "IgGeneUsage")
-knitr::kable(head(d_zibb_2))
+data("d_zibb_3", package = "IgGeneUsage")
+knitr::kable(head(d_zibb_3))
 ```
 
-We can also visualize `d_zibb_2` with `r CRANpkg("ggplot")`:
+We can also visualize `d_zibb_3` with `r CRANpkg("ggplot")`:
 
 ```{r, fig.width=6, fig.height=3.25}
-ggplot(data = d_zibb_2)+
+ggplot(data = d_zibb_3)+
   geom_point(aes(x = gene_name, y = gene_usage_count, col = condition),
              position = position_dodge(width = .7), shape = 21)+
   theme_bw(base_size = 11)+
@@ -113,10 +113,10 @@ ggplot(data = d_zibb_2)+
 
 ## DGU analysis
 As main input `r Biocpkg("IgGeneUsage")` uses a data.frame formatted as e.g.
-`d_zibb_2`. Other input parameters allow you to configure specific settings 
+`d_zibb_3`. Other input parameters allow you to configure specific settings 
 of the `r CRANpkg("rstan")` sampler.
 
-In this example, we analyze `d_zibb_2` with 3 MCMC chains, 1500 iterations 
+In this example, we analyze `d_zibb_3` with 3 MCMC chains, 1500 iterations 
 each including 500 warm-ups using a single CPU core (Hint: for parallel 
 chain execution set parameter `mcmc_cores` = 3). We report for each model 
 parameter its mean and 95% highest density interval (HDIs).
@@ -129,8 +129,8 @@ issue with a reproducible script at the Bioconductor support site or on
 Github[^3].
 
 ```{r}
-M <- DGU(ud = d_zibb_2, # input data
-         mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500)
+M <- DGU(ud = d_zibb_3, # input data
+         mcmc_warmup = 300, # how many MCMC warm-ups per chain (default: 500)
          mcmc_steps = 1500, # how many MCMC steps per chain (default: 1,500)
          mcmc_chains = 3, # how many MCMC chain to run (default: 4)
          mcmc_cores = 1, # how many PC cores to use? (e.g. parallel chains)
@@ -182,7 +182,7 @@ summary(M)
 rstan::check_hmc_diagnostics(M$fit)
 ```
 
-  * rhat < 1.03 and n_eff > 0
+  * rhat < 1.05 and n_eff > 0
   
 
 ```{r, fig.height = 3, fig.width = 6}
@@ -197,7 +197,7 @@ Error bars show 95% HDI of mean posterior prediction. The predictions can be
 compared with the observed data (x-axis). For points near the diagonal 
 $\rightarrow$ accurate prediction.
 
-```{r, fig.height = 3.25, fig.width = 7}
+```{r, fig.height = 4, fig.width = 7}
 ggplot(data = M$ppc$ppc_rep)+
   facet_wrap(facets = ~individual_id, ncol = 5)+
   geom_abline(intercept = 0, slope = 1, linetype = "dashed", col = "darkgray")+
@@ -366,7 +366,7 @@ by evaluating their variability for a specific gene.
 This analysis can be computationally demanding.
 
 ```{r}
-L <- LOO(ud = d_zibb_2, # input data
+L <- LOO(ud = d_zibb_3, # input data
          mcmc_warmup = 500, # how many MCMC warm-ups per chain (default: 500)
          mcmc_steps = 1000, # how many MCMC steps per chain (default: 1,500)
          mcmc_chains = 1, # how many MCMC chain to run (default: 4)
@@ -376,6 +376,7 @@ L <- LOO(ud = d_zibb_2, # input data
          max_treedepth = 10) # tree depth evaluated at each step (default: 12)
 ```
 
+
 Next, we collected the results (GU and DGU) from each LOO iteration:
 
 ```{r}
@@ -388,32 +389,32 @@ L_dgu <- do.call(rbind, lapply(X = L, FUN = function(x){return(x$dgu)}))
 
 ## LOO-DGU: variability of effect size $\gamma$
 
-```{r, fig.width=6.5, fig.height=4}
+```{r, fig.width=6, fig.height=5}
 ggplot(data = L_dgu)+
+  facet_wrap(facets = ~contrast, ncol = 1)+
   geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+
   geom_errorbar(aes(x = gene_name, y = es_mean, ymin = es_L,
                     ymax = es_H, col = contrast, group = loo_id),
-                width = 0.1, position = position_dodge(width = 0.5))+
+                width = 0.1, position = position_dodge(width = 0.75))+
   geom_point(aes(x = gene_name, y = es_mean, col = contrast,
                  group = loo_id), size = 1,
-             position = position_dodge(width = 0.5))+
+             position = position_dodge(width = 0.75))+
   theme_bw(base_size = 11)+
-  theme(legend.position = "top")+
-  ylab(expression(gamma))+
-  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
+  theme(legend.position = "none")+
+  ylab(expression(gamma))
 ```
 
 ## LOO-DGU: variability of $\pi$
 
-```{r, fig.width=6.5, fig.height=4}
+```{r, fig.width=6, fig.height=5}
 ggplot(data = L_dgu)+
+  facet_wrap(facets = ~contrast, ncol = 1)+
   geom_point(aes(x = gene_name, y = pmax, col = contrast,
                  group = loo_id), size = 1,
              position = position_dodge(width = 0.5))+
   theme_bw(base_size = 11)+
-  theme(legend.position = "top")+
-  ylab(expression(pi))+
-  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
+  theme(legend.position = "none")+
+  ylab(expression(pi))
 ```
 
 
@@ -425,24 +426,16 @@ ggplot(data = L_gu)+
   geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L,
                     ymax = prob_H, col = condition, 
                     group = interaction(loo_id, condition)),
-                width = 0.1, position = position_dodge(width = 0.5))+
+                width = 0.1, position = position_dodge(width = 1))+
   geom_point(aes(x = gene_name, y = prob_mean, col = condition,
                  group = interaction(loo_id, condition)), size = 1,
-             position = position_dodge(width = 0.5))+
+             position = position_dodge(width = 1))+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
   ylab("GU [probability]")+
   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
-# Hierarchical clustering analaysis 
-
-```{r, fig.width=6, fig.height=4}
-# x <- M$theta
-x <- acast(individual_id~gene_name, data = M$theta, value.var = "theta_mean")
-
-plot(hclust(dist(x, method = "euclidean"), method = "average"))
-```
 
 
 # Case Study B: analyzing IRRs containing biological replicates