ncpi_crc/scripts/DESeq2_Interaction_Analysis_Notebook.Rmd at main · schatzlab/ncpi_crc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
---
title: "R Notebook for DESeq2 Analysis"
output: html_notebook
---

Packages for analysis

```{r}
library(DESeq2)
library(dplyr)
library(tibble)
library(data.table)
library(readr) # for reading csv
library(writexl) # for writing to excel files
library(readxl) # for reading excel files
```

Creating dataframes with tximport count data and metadata for design
```{r}
#read tximport data
countdata <- read_csv("/home/yhuan165/yhuan165/anvil_project/Gene_count_txi.csv")

#Convert Columns to Rows
df <- countdata  %>%
  column_to_rownames("Gene")

#Read in design file (replicate metadata)
design <- read_csv("/home/yhuan165/yhuan165/anvil_project/design.csv")

#Select the required coloumn & groups
df2<- (dplyr::select(df,1:172))
design2<- design[c(1:172),]
```

Remove low expressing genes with filter function
```{r}
remove_rows_with_low_values <- function(df, threshold = 10, percentage = 0.8) {
  # Create a logical matrix where TRUE indicates values less than the threshold
  low_values <- df < threshold
  # For each row, calculate the percentage of columns with values less than the threshold
  rows_to_remove <- apply(low_values, 1, function(x) mean(x) >= percentage)
  # Keep only the rows where the percentage of low values is less than the threshold
  df_cleaned <- df[!rows_to_remove, ]
  df_removed <- df[rows_to_remove, ]
  return(df_cleaned)
  }

# Apply the function to your data
result <- remove_rows_with_low_values(df2)
df2 <- round(result)
```

DESeq2 Analysis
```{r}
dds <- DESeqDataSetFromMatrix(
  countData = df2,
  colData = design2,
  design =  ~ Disease+Population + Disease:Population) #tests for main effects and interaction

#Collapse the replicates
dds <- collapseReplicates(dds, design2$Replicates, renameCols = TRUE)

#Put what the controls are
dds$Population <- relevel(dds$Population, ref = "NHW")
dds$Disease <- relevel(dds$Disease, ref = "NAT")

dds <- DESeq(dds)
resultsNames(dds) #output terminology for effect tests
```

Extract results for main effects and interaction effects
```{r}
# Main Effects
res_population <- results(dds, name = "Population_Hispanic_vs_NHW")
res_disease <- results(dds, name = "Disease_CRC_vs_NAT")

# Extract results for the interaction term
res_interaction <- results(dds, name = "DiseaseCRC.PopulationHispanic")

# Convert the results to data frames and add gene names as the first column
population <- data.frame(res_population, stringsAsFactors = FALSE)
population <- cbind("Gene" = rownames(population), population)

disease <- data.frame(res_disease, stringsAsFactors = FALSE)
disease <- cbind("Gene" = rownames(disease), disease)

interaction <- data.frame(res_interaction, stringsAsFactors = FALSE)
interaction <- cbind("Gene" = rownames(interaction), interaction)

# Write the results to Excel files for further inspection
write_xlsx(population, "Population_main_effects.xlsx")
write_xlsx(disease, "Disease_main_effects.xlsx")
write_xlsx(interaction, "DiseaseCRC.PopulationHispanic_interaction_analysis.xlsx")
```