PCoA_DAA.Rmd

---
title: "QMP Differential Abundance Analysis Across Diagnosis Groups"
output: html_document
---

```{r setup, include=FALSE}
# Load libraries
library(phyloseq)
library(vegan)
library(data.table)
library(rstatix)
library(ggplot2)
library(vegan)
library(readxl)
library(dplyr)
library(ggpubr)
library(gridExtra)
library(knitr)
library(kableExtra)
```
```{r seed_and_infiles, include=FALSE}

# Set the seed for reproducibility
set.seed(531)

# URL of the Excel file
url <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-024-02963-2/MediaObjects/41591_2024_2963_MOESM3_ESM.xlsx"
temp_file <- tempfile(fileext = ".xlsx")

# Download the file
download.file(url, temp_file, mode = "wb")

# Read the Excel file from the temporary location
metaDF <- as.data.frame(read_excel(temp_file, sheet = "S1"))
sp_table <- as.data.frame(read_excel(temp_file, sheet = "S14"))


# Load infiles from supplementary tables file
#metaDF <- as.data.frame(read_excel("/Users/u0101921/Library/CloudStorage/OneDrive-KULeuven/LCPM_v/from_nm_web/41591_2024_2963_MOESM3_ESM.xlsx", sheet = "S1"))
#sp_table <- as.data.frame(read_excel("/Users/u0101921/Library/CloudStorage/OneDrive-KULeuven/LCPM_v/from_nm_web/41591_2024_2963_MOESM3_ESM.xlsx", sheet = "S14"))

# Set the first column as row names
row.names(sp_table) <- sp_table[[1]]
sp_table <- sp_table[-1]

# Set the first column as row names
row.names(metaDF) <- metaDF[[1]]
metaDF <- metaDF[-1]

# phy obj
crc_u <- merge_phyloseq(sample_data(metaDF), 
                        otu_table(sp_table, taxa_are_rows=F))
```
```{r RDA, include=FALSE}

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%          RDA      %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%     QMP n=589     %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

to_test <- crc_u
fDSGENUS <- as.data.frame(to_test@otu_table@.Data)
m <- data.frame(sample_data(to_test))
dim(m)
dim(fDSGENUS)
# Set the seed for reproducibility
set.seed(531)
# Perform analysis using capscale and anova
all <- c()
for (i in 1:ncol(m)) {
  capsc <- capscale(fDSGENUS ~ m[, i], distance = "bray", na.action = na.omit)
  an <- anova.cca(capsc)
  pval <- an["Pr(>F)"][[1]][[1]]
  Fa <- an["F"][[1]][[1]]
  r2 <- RsquareAdj(capsc)[[1]]
  adjr2 <- RsquareAdj(capsc)[[2]]
  all <- rbind(all, cbind(Fa, r2, adjr2, pval))
}

colnames(all) <- c("F", "r2", "adjr2", "p-value")
row.names(all) <- colnames(m)

qval <- p.adjust(all[,"p-value"], method = "BH")
all <- data.frame(all)
allfDSall <- cbind(all, qval)
```

```{r PCoA, include=FALSE}

# Plot PCoA with colors representing diagnosis
to_plotPCoA <- crc_u
sample_data(to_plotPCoA)$Diagnosis <- as.factor(sample_data(to_plotPCoA)$Diagnosis)
#levels(sample_data(to_plotPCoA)$Diagnosis) <- c("Polyp", "Tumor", "No_lesion")

to_plotPCoA.bc <- ordinate(to_plotPCoA, "PCoA", "bray")

BC_plot_QMP_ASV_LCPM_SLV_species <- plot_ordination(to_plotPCoA, to_plotPCoA.bc, type = "samples", color = "Diagnosis") +
  scale_color_manual(values = c("#1B9E77", "#D95F02", "#7570B3")) +
  theme(panel.background = element_blank()) + theme(legend.position = c(0.85, 0.85))
```


```{r infiles2, include=FALSE}
species_forDAA <- as.data.frame(read_excel(temp_file, sheet = "S6"))
species_forDAA_selected <- subset(species_forDAA, `Final set`=="selected")

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% QMP @ Species SLV  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Assign variables
ps_sp <- crc_u
df_names <- data.frame(t(otu_table(ps_sp)))


# Subset samples based on CRC_general_status values
sub_DF <- data.frame(sample_data(ps_sp)[, c("Diagnosis")])
OTUdf <- as.data.frame(as(otu_table(ps_sp), "matrix"))
OTUdf <- OTUdf[, !names(OTUdf)%in%taxa_names(ps_sp)[grep('.unclassified', taxa_names(ps_sp))]]
OTUdf <- OTUdf[, names(OTUdf)%in%species_forDAA_selected$`Species name`]

dataStatus <- cbind.data.frame(sub_DF, OTUdf)

dim(dataStatus)

# Perform Kruskal-Wallis tests
kw_chi <- lapply(dataStatus[2:length(dataStatus)], function(x) {kruskal.test(x ~ Diagnosis, data = dataStatus)$statistic})
kw_pvl <- lapply(dataStatus[2:length(dataStatus)], function(x) {kruskal.test(x ~ Diagnosis, data = dataStatus)$p.value})

# Create a data frame with the results of Kruskal-Wallis tests
kw_res <- as.data.frame(cbind(as.data.frame(p.adjust(kw_chi, "none")),
                              as.data.frame(p.adjust(kw_pvl, "none")),
                              as.data.frame(p.adjust(kw_pvl, "BH"))))
setnames(kw_res, c("Chi2", "P", "AdjustedP"))

# Assign variables for further analysis
QMP_138sp_SLV_sp_kw_res <- kw_res

# Subset significant results based on adjusted p-values
QMP_SLV_sp_kw_res_sig <- row.names(subset(kw_res, AdjustedP < 0.05))
QMP_SLV_sp_kw_res_sig

# Create a data frame with significant results and CRC_general_status values
QMP_SLV_sp_kw_res_sig_df <- dataStatus[, c("Diagnosis", QMP_SLV_sp_kw_res_sig)]

# Modify row names and variable names for further analysis
rownames(QMP_138sp_SLV_sp_kw_res) <- gsub("-", ".", rownames(QMP_138sp_SLV_sp_kw_res))
rownames(QMP_138sp_SLV_sp_kw_res) <- gsub(" ", ".", rownames(QMP_138sp_SLV_sp_kw_res))
names(dataStatus) <- gsub("-", ".", names(dataStatus))
names(dataStatus) <- gsub(" ", ".", names(dataStatus))

# Assign variables for further analysis
data <- dataStatus
outcome_vars <- QMP_SLV_sp_kw_res_sig ##rownames(QMP_138sp_SLV_sp_kw_res)

# Perform Kruskal-Wallis effect size analysis
kruskal_effsize_results <- list()

for (outcome_var in outcome_vars) {
  formula <- as.formula(paste(outcome_var, "~ Diagnosis"))
  result <- data %>%
    kruskal_effsize(formula = formula)
  result$Outcome <- outcome_var
  kruskal_effsize_results[[outcome_var]] <- result
}

# Combine the results into a single data frame
combined_results <- bind_rows(kruskal_effsize_results)


# Convert dataframe to HTML table using kable and kableExtra
combined_resultsH <- combined_results %>%
  kable("html") %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(5, bold = TRUE, color = "blue")  # 
```


```{r DT, include=FALSE}

# Merge Kruskal-Wallis results with effect size results
QMP_138sp_SLV_sp_kw_res_effsize <- merge(QMP_138sp_SLV_sp_kw_res, combined_results, by.x = "row.names", by.y = "Outcome", all = TRUE)

# Perform Dunn's post hoc tests
colname1 <- names(QMP_SLV_sp_kw_res_sig_df)[1]
colname2 <- names(QMP_SLV_sp_kw_res_sig_df)[2:length(QMP_SLV_sp_kw_res_sig_df)]

QMP_SLV_sp_kw_res_sig_df_dunnt <- lapply(colname2, function(x) {
  rstatix::dunn_test(QMP_SLV_sp_kw_res_sig_df, reformulate(colname1, x),
                     p.adjust.method = "fdr")
})

QMP_SLV_sp_kw_res_sig_df_dunnt_r <- Reduce(full_join, QMP_SLV_sp_kw_res_sig_df_dunnt)
QMP_SLV_sp_kw_res_sig_df_dunnt_r$compari <- paste(QMP_SLV_sp_kw_res_sig_df_dunnt_r$group1, QMP_SLV_sp_kw_res_sig_df_dunnt_r$group2, sep = ".")


QMP_SLV_sp_kw_res_sig_df_dunnt_r <- QMP_SLV_sp_kw_res_sig_df_dunnt_r %>%
  kable("html") %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "blue")  # 
```

```{r Bp, include=FALSE}

# Transform the data using logarithm
temp <- QMP_SLV_sp_kw_res_sig_df[, !names(QMP_SLV_sp_kw_res_sig_df) == "Diagnosis"]
temp <- log10(temp + 1)
df_to_plot_log <- cbind(QMP_SLV_sp_kw_res_sig_df$Diagnosis, temp)

# Modify column names
setnames(df_to_plot_log, "QMP_SLV_sp_kw_res_sig_df$Diagnosis", "Diagnosis")
df_to_plot_log <- df_to_plot_log[, c("Diagnosis", sort(names(temp)))]

# Create boxplots for each variable
uBiome_colFixed <- c("#1B9E77", "#D95F02", "#7570B3")
p_qmp_sp <- list()
for (j in colnames(df_to_plot_log)[2:length(df_to_plot_log)]) {
  p_qmp_sp[[j]] <- add_summary(ggplot(data = df_to_plot_log, aes_string(x = "Diagnosis", y = j)), "mean_sd", color = "blue", size = 0.05) +
    geom_boxplot(aes(fill = factor(Diagnosis)), outlier.size = 0.5) + # Adjusted outlier size
    guides(fill = FALSE) +
    scale_y_continuous(labels = function(x) format(x, nsmall = 2)) +
    scale_fill_manual(values = uBiome_colFixed) +
    theme_minimal() +
    theme(
      axis.title.y = element_text(face = "italic", size = 5),
      axis.text.y = element_text(size = 5),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.title = element_text(size = 5), # Adjusted
      axis.text = element_text(size = 5), # Adjusted
      axis.ticks = element_blank() # Adjusted
    ) 
}
```

### PCoA on BCD representing QMP species-level microbiota variation in the LCPM cohort (n = 589)

Each dot represents one sample, colored by assigned diagnosis group.


```{r PCoA_o, echo=FALSE}
BC_plot_QMP_ASV_LCPM_SLV_species
```

Cancer diagnosis group (CTL, ADE and CRC), as a covariate, was not associated with microbial variation (n = 589, univariate dbRDA)

```{r RDA_o, echo=FALSE}
allfDSall
```
###  Microbial biomarkers in CRC

Nine species were identified with differential absolute abundance across diagnosis groups (n = 589, KW test, adjusted P < 0.05)
```{r BP_o, echo=FALSE}
# Combine the boxplots into a grid
do.call(grid.arrange, c(p_qmp_sp, ncol = length(df_to_plot_log) - 1))
```

Kruskal-Wallis effect size analysis

```{r KW_o, echo=FALSE}
combined_resultsH
```

Dunn's post hoc tests

```{r DT_o, echo=FALSE, message=FALSE, warning=FALSE}
QMP_SLV_sp_kw_res_sig_df_dunnt_r
```