-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCRC_prep_data.R
More file actions
116 lines (91 loc) · 5 KB
/
CRC_prep_data.R
File metadata and controls
116 lines (91 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# First part of CRC_prep_data_rev.R that prepares case-control studies
library(tidyverse)
library(haven)
library(lubridate)
# Read metadata for whole CRC case-control
# WARNING: do not use case-control status from this dataset. Use metabolomics datasets only.
# Remove duplicated Idepics (with dplyr or base). Also get follow up time and colorectal site
# Columns that are converted to factors once the metadata has been joined.
# "Smoke_Int" is commented out of this list; it is derived separately from
# Smoke_Intensity when each case-control dataset is built.
var.list <- c(
  "Country",
  "Center",
  "Sex",
  "Match_Caseset",
  "L_School",
  # "Smoke_Int",
  "Smoke_Stat",
  "Smoke_Intensity",
  "Fasting_C",
  "Menopause",
  "Phase_Mnscycle",
  "Alc_Drinker",
  "Pa_Total"
)
# Set D_Dgclrt of controls to that of corresponding cases, calculate follow-up time
# and get colorectal subsite variables
# Subject-level metadata from the full case-control file, one row per Idepic.
#   Tfollowup.days : D_Dgclrt - D_Bld_Coll
#   Tfollowup      : Tfollowup.days / 365.25
#   location       : 1 = proximal colon, 2 = distal colon, 3 = rectum, 4 = colon NOS
#                    (first matching flag wins; NA when no flag equals 1)
meta <- read_dta("clrt_caco.dta") %>%
  mutate(
    Tfollowup.days = D_Dgclrt - D_Bld_Coll,
    Tfollowup = Tfollowup.days / 365.25,
    location = case_when(
      Case_Mal_Colon_Prox == 1 ~ 1,
      Case_Mal_Colon_Dist == 1 ~ 2,
      Case_Mal_Colon_Nos == 1 ~ 4,
      Case_Mal_Rectum == 1 ~ 3
    )
  ) %>%
  # Assign controls the same pathology value as corresponding case: within each
  # case-set, missing values are filled from the neighbouring rows (down, then up).
  # Tfollowup.days is not in the fill list, so it stays NA wherever it was NA.
  group_by(Match_Caseset) %>%
  fill(c(D_Dgclrt, location, Tfollowup, Stagclrt), .direction = "downup") %>%
  ungroup() %>%
  # Case-set ID and case-control status are dropped here; the metabolomics
  # datasets supply their own (see the warning at the top of this section).
  select(-Match_Caseset, -Cncr_Caco_Clrt) %>%
  # Keep the first row for each Idepic. TRUE is spelled out because T is an
  # ordinary variable that can be reassigned.
  distinct(Idepic, .keep_all = TRUE)
# Small case-control subset (p180)
# Use Batch_MetBio to correctly subset biocrates data, join metadata.
# 496 cases, 492 controls in this dataset. Delete if only 1 in the caseset.
# Build the small (p180) case-control dataset.
# Rows without a Batch_MetBio value are discarded, then the metadata is joined on
# Idepic. Where a column exists in both tables, the metabolomics copy receives
# the "_1" suffix and the metadata copy keeps the plain name.
crc1 <- read_sas("clrt_caco_metabo.sas7bdat") %>%
  filter(!is.na(Batch_MetBio)) %>%
  left_join(meta, by = "Idepic", suffix = c("_1", "")) %>%
  # across() + all_of() replaces the superseded mutate_at(vars(...)) and makes it
  # explicit that var.list is an external character vector of column names.
  mutate(across(all_of(var.list), as.factor)) %>%
  # Smoke_Intensity levels "8", "9" and "10" are pooled into a single "Other" level.
  mutate(Smoke_Int = fct_collapse(Smoke_Intensity, Other = c("8", "9", "10"))) %>%
  # Keep only case-sets that contain exactly two rows.
  group_by(Match_Caseset) %>%
  filter(n() == 2) %>%
  ungroup() %>%
  filter(Country != 6) # Greece removed
# 490 C+C, 467 after removal of Greece
# Add categorical BMI
# Bin codes: 1 = [0, 25), 2 = [25, 30), 3 = [30, 99); anything outside is NA.
crc1$Bmi_Cat <- as.factor(
  cut(crc1$Bmi_C, breaks = c(0, 25, 30, 99), labels = FALSE, right = FALSE)
)
# Amino acids study
# Get colon cancer only, fasting status (2) only
#colon1 <- crc1 %>% filter(!location %in% 3 & Fasting_C == 2)
# Get colon cancer by Jelena's list of IDs and join to data to leave 740 subjects
# 700 after removal of Greece
# Restrict crc1 to the subjects whose Idepic appears in the ids_p180 column of
# p180_ids.csv, then apply the Country != 6 filter once more.
p180ids <- read.csv("p180_ids.csv")
colon1 <- inner_join(crc1, p180ids, by = c("Idepic" = "ids_p180")) %>%
  filter(Country != 6)
# Subsites
# Subsite subsets of crc1. A case-set is kept when the largest non-missing
# location code within it equals the target code
# (3 = rectum, 1 = proximal colon, 2 = distal colon).
# na.rm is given as TRUE rather than T, because T can be reassigned.
rectal1 <- crc1 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 3) %>%
  ungroup(Match_Caseset)
prox1 <- crc1 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 1) %>%
  ungroup(Match_Caseset)
dist1 <- crc1 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 2) %>%
  ungroup(Match_Caseset)
# Subset male, female, cases diagnosed after 2 years only
# Sex-specific subsets (Sex == 1 and Sex == 2).
# NOTE(review): presumably 1 = male and 2 = female, going by the object names;
# confirm against the codebook.
crc1m <- crc1 %>% filter(Sex == 1)
crc1f <- crc1 %>% filter(Sex == 2)
# Keep case-sets whose largest non-missing Tfollowup exceeds 2.
# NOTE(review): crc2t applies mean() instead of max() for the same cut-off;
# confirm which aggregate is intended.
# na.rm is given as TRUE rather than T, because T can be reassigned.
crc1t <- crc1 %>%
  group_by(Match_Caseset) %>%
  filter(max(Tfollowup, na.rm = TRUE) > 2) %>%
  ungroup()
# Large case-control subset (p150 from Jelena)
# 1185 C+C, 1141 C+C after removal of Greece
# Build the large (p150) case-control dataset.
# Only the case-set ID, case-control status, the Idepic column(s) and the
# compound columns matched by the regular expression are kept from the CSV;
# any column whose name contains "tdq" is excluded.
crc2 <- read_csv("biocrates_p150.csv") %>%
  select(
    Match_Caseset, Cncr_Caco_Clrt,
    ends_with("Idepic"),
    matches("(carn|oacid|genic|roph|ingo|Sugars)[_]"),
    -contains("tdq")
  ) %>%
  # inner_join: subjects without a metadata row are dropped.
  inner_join(meta, by = "Idepic") %>%
  # across() + all_of() replaces the superseded mutate_at(vars(...)) and makes it
  # explicit that var.list is an external character vector of column names.
  mutate(across(all_of(var.list), as.factor)) %>%
  # Smoke_Intensity levels "8", "9" and "10" are pooled into a single "Other" level.
  mutate(Smoke_Int = fct_collapse(Smoke_Intensity, Other = c("8", "9", "10"))) %>%
  # Keep only case-sets that contain exactly two rows.
  group_by(Match_Caseset) %>%
  filter(n() == 2) %>%
  ungroup() %>%
  filter(Country != 6)
# Categorical BMI: 1 = [0, 25), 2 = [25, 30), 3 = [30, 99); anything outside is NA.
crc2$Bmi_Cat <- as.factor(
  cut(crc2$Bmi_C, breaks = c(0, 25, 30, 99), labels = FALSE, right = FALSE)
)
# Get colon cancer for amino acids study
#colon2 <- crc2 %>% filter(!location %in% 3 & Fasting_C == 2) #%>%
#group_by(Match_Caseset) %>% filter(n() == 2) %>% ungroup()
# Get colon cancer by Jelena's list of IDs
# Restrict crc2 to the subjects whose Idepic appears in the ids_p150 column of
# p150_ids.csv, then apply the Country != 6 filter once more.
p150ids <- read.csv("p150_ids.csv")
colon2 <- inner_join(crc2, p150ids, by = c("Idepic" = "ids_p150")) %>%
  filter(Country != 6)
#Checks
#table(colon2$Cncr_Caco_Clrt)
#intersect(colon2$Idepic, as.character(p150ids$ids_p150))
#unique(droplevels(colon2$Match_Caseset))
# Subsite subsets of crc2. A case-set is kept when the largest non-missing
# location code within it equals the target code
# (3 = rectum, 1 = proximal colon, 2 = distal colon).
# na.rm is given as TRUE rather than T, because T can be reassigned.
rectal2 <- crc2 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 3) %>%
  ungroup(Match_Caseset)
prox2 <- crc2 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 1) %>%
  ungroup(Match_Caseset)
dist2 <- crc2 %>%
  group_by(Match_Caseset) %>%
  filter(max(location, na.rm = TRUE) == 2) %>%
  ungroup(Match_Caseset)
# Get cases diagnosed after 2 years only
# Keep case-sets whose mean non-missing Tfollowup exceeds 2.
# NOTE(review): crc1t applies max() instead of mean() for the same cut-off;
# confirm which aggregate is intended.
# na.rm is given as TRUE rather than T, because T can be reassigned.
crc2t <- crc2 %>%
  group_by(Match_Caseset) %>%
  filter(mean(Tfollowup, na.rm = TRUE) > 2) %>%
  ungroup()
# Subset male or female
# Sex-specific subsets of crc2 (Sex == 1 and Sex == 2).
crc2m <- filter(crc2, Sex == 1)
crc2f <- filter(crc2, Sex == 2)
# Merge crc1 and crc2 to make complete dataset, remove non-fasting
# Stack the p180 (first) and p150 (second) versions of each dataset. The `lab`
# column records which input a row came from and is stored as a factor.
# Only the full `crc` dataset is restricted to Fasting_C == 2; the conversion to
# factor happens after that filter, so the levels reflect the rows that remain.
crc <- bind_rows(crc1, crc2, .id = "lab") %>%
  filter(Fasting_C == 2) %>%
  mutate(lab = as.factor(lab))
colon <- bind_rows(colon1, colon2, .id = "lab") %>%
  mutate(lab = as.factor(lab))
rectal <- bind_rows(rectal1, rectal2, .id = "lab") %>%
  mutate(lab = as.factor(lab))
prox <- bind_rows(prox1, prox2, .id = "lab") %>%
  mutate(lab = as.factor(lab))
dist <- bind_rows(dist1, dist2, .id = "lab") %>%
  mutate(lab = as.factor(lab))