forked from HeatherARobinson/EHR-reshaping
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathPERMIT SIR reshaping and cleaning
More file actions
241 lines (195 loc) · 11.1 KB
/
PERMIT SIR reshaping and cleaning
File metadata and controls
241 lines (195 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
memory.size(100000)
load ("sir.data.rda") #Taking the long format input file
load("small.rda") #This is the demographic data from he processed sir.data.rda,
#comprising PatientID, Gender, BirthDate and LSOA
small<-unique(small)
sir.data<-merge(sir.data,small,all.x=TRUE)
save(sir.data,file="sirdata.rda")
#load("sirdata.rda")
library(zoo)
library(plyr)
library(tidyverse)
library(zoo)
library(data.table)
library(survival)
library(lubridate)
#READ CONDITION FILES- SUFFIX ALL NAMES WITH A 1 THEN YOU CAN IMPORT THEM ALL TOGETHER
temp = list.files(pattern="*1.csv")
for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#######################################################################################
#SELECT PATIENTS WHO WERE 18 OR ABOVE AT THE TIME OF HEART FAILURE DIAGNOSIS
hf<-sir.data[sir.data$ReadCode %in% HeartFailure1.csv$ReadCode,]
length(unique(as.factor(hf$PatientID))) #7254 total heart failure patients identified with confirmed Read Codes (V2)
hf$Age<-(as.numeric(year(strptime(hf$EntryDate, format="%Y%m%d"))))-hf$BirthYear
hf$hfage<-hf$Age
smalltab<-hf[,c("PatientID","hfage")]
first<-smalltab %>% group_by(PatientID) %>%
summarize(hfage = min(as.numeric(hfage)))
ungroup(first)
head(first)
sir.data<-merge(sir.data[sir.data$PatientID %in% hf$PatientID,],as.data.frame(first),all.x=TRUE)
sir.data<-sir.data[sir.data$hfage>=18,]
length(unique(as.factor(sir.data$PatientID))) #7208 hf patients who were 18 or over at first diagnosis
save(sir.data,file="sirdatahfonly.rda") #full patient records from all adult hf patients from all years
####################################################################################
#COHORT SELECTION
#REMOVE DIALYSIS PATIENTS
dialysis<-sir.data[sir.data$ReadCode %in% Dialysis1.csv$ReadCode,]
levels(as.factor(dialysis$PatientID)) #48 dialysis patients will be removed
#SELECT PATIENTS WITH CREATININE DATA
sir.data[sir.data$ReadCode=="44J3.",]->crea
crea <- droplevels(crea)
crea<-crea[!crea$PatientID %in% dialysis$PatientID,]
#CHECK FOR CASES WHERE A VIABLE VALUE IS ENTERED IN THE UNITS COLUMN BY MISTAKE
temp<-ifelse(as.numeric(as.character(crea$CodeUnits))>0 & as.numeric(as.character(crea$CodeUnits))<1000 & !is.na(crea$CodeValue),crea$CodeUnits,NA)
table(temp) # 41 records affected, but most not viable values
crea$CodeValue<-ifelse(as.numeric(as.character(crea$CodeUnits))>0
& as.numeric(as.character(crea$CodeUnits))<1000
& is.na(crea$CodeValue),crea$CodeUnits,crea$CodeValue)
crea$CodeUnits<-ifelse(!is.na(temp),paste(""),paste(crea$CodeUnits))
crea$CodeValue<-as.numeric(as.character(crea$CodeValue))
crea<-crea[!is.na(crea$CodeValue),]
summary(crea$CodeValue)
length(unique(as.factor(crea$PatientID))) #6923 hf patients over 18 at diagnosis with creatinine data
###########################################################################
#SENSITIVITY TESTS
#HOW MANY ZERO CR VALUES AND HOW MANY CR VALUES UNDER 20
lowcr<-crea[crea$CodeValue<18 & !is.na(crea$CodeValue),] #21727
length(lowcr$PatientID)
#ARE THESE FROM OLD ASSAY?
levels(as.factor(lowcr$PatientID)) #4121 patients are affected
table(lowcr$CodeValue)
#471 are zero values
crea<-crea[crea$CodeValue>0,]
lowcr<-crea[crea$CodeValue<18,]
length(lowcr$PatientID)
#ARE THESE FROM OLD ASSAY?
lowcr <- droplevels(lowcr)
head(lowcr)
table(lowcr$CodeUnits)
# % g/L g/mol L micmol/l mmol mmol/L
# 4400 1 1 1 1 6099 1 766
# umol/l umol/L h mol/L
# 5 891607 1 1
summary(lowcr$EventDate)
#MOST SEEM TO BE IN RECENT UNITS AND RECENT VALUES
#In the first iteration 891607 records were retained.
#Evidently some were omitted due to minor typographic changes
#We can assume post 2008 values unless otherwise marked were completed with the new assay as we believed the lab switched over to this in 2005
#There is only one regional lab so the tests will be consistent, just potentially not the units used for recording.
#Some units may ahve been incorrectly selected from the pulldown menu, so values over 50 should be assumed to be in umol not mmol.
#We can use a simple formula to convert values under 100 in mmol to umol
crea$CodeUnits<-as.factor(crea$CodeUnits)
levels(crea$CodeUnits)
#[1] "" "%" "g/L" "g/mol" "L" "micmol/l"
#[7] "mmol" "mmol/L" "umol/l" "None" "umol/L"
levels(crea$CodeUnits)[c(1,6,9)]<-"umol/L"
levels(crea$CodeUnits)[c(6)]<-"mmol/L"
levels(crea$CodeUnits)[c(2:5)]<-"NA"
crea<-crea[!is.na(crea$CodeUnits),]
crea$CodeValue<-ifelse(crea$CodeUnits=="mmol/L" & as.numeric(crea$CodeValue)<50,(as.numeric(crea$CodeValue)*1000),as.numeric(crea$CodeValue))
crea$CodeUnits<-"umol/L"
summary(crea$CodeValue) #Any remaining high and low values will be removed
crea<-crea[as.numeric(crea$CodeValue)>=20 & !is.na(crea$CodeValue),]
#Upper limit TBA
save(crea,file="crearecleaned.rda")
###############################################################################
#DATA CLEANING
#load("sirdatahfonly.rda") #full patient records from all adult hf patients from all years
#load("crearecleaned.rda") # all steps above completed
length(crea$CodeValue) #412394 records left
#REMOVE SAME DAY CREATININE ENTRIES IF THE SOURCE LOCATION CODE DIFFERS.
names(crea)
crea<-crea[order(crea$PatientID,crea$CodeValue, rev(crea$Source)),]
crea2<-crea[(duplicated(crea[,c(1,4,6)])&!duplicated(crea[,7])),]
length(crea2$CodeValue) #Only 24 same day duplicates are from different sources
crea3<-crea[(duplicated(crea[,c(1,4,6)])),]
length(crea3$CodeValue) #in total, 105665 of 412394 (26%) records have a same day duplicate
crea<-crea[!rownames(crea) %in% rownames(crea2),]
#REMOVE DELAYED CREATININE ENTRIES FROM SAME CALENDAR MONTH IF THE SOURCE LOCATION CODE DIFFERS.
crea$event.date<-as.Date(as.character(crea$EntryDate),format="%Y%m%d")
(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))) -> year
(as.numeric(month(strptime(crea$event.date, format="%Y-%m-%d")))) -> month
crea$EntryPeriod<-paste(month,year)
crea$Source<-ifelse(crea$Source=="salfordt",paste("2"),paste("1")) #2 for hospital, 1 for GP
crea$Source<-ifelse(is.na(crea$Source),paste("2"),crea$Source)
crea<-crea[order(crea[,1], -(crea[,4]),(crea[,7])),]
crea4<-crea[(duplicated(crea[,c(1,6,14)])&!duplicated(crea[,7])),] #DELAYED DUPLICATES (SAME PATIENT, MONTH, VALUE)
length(crea4$PatientID) #- Only 1 of the remaining potential duplicate records outside of the same day window come from different locations
crea5<-crea[duplicated(crea[,c(1,6,14)]),]
table(crea5$Source) #The vast majority (12802 of out of range duplicates are multiple tests from within hospital. Only 730 have a GP code and are repeats within GP practises)
crea<-crea[!rownames(crea) %in% rownames(crea4),]
length(crea$PatientID) # 412369 records retained
#SELECT MEAN DAILY CREATININE IF MULTIPLE ENTRIES AFTER REMOVING DELAYED DUPLICATES AND OUT OF RANGE VALUES
smalltab<-crea[,c("PatientID","CodeValue", "EntryDate")]
xcrea<-smalltab %>% group_by(PatientID, EntryDate) %>%
summarize(Creatinine = mean(as.numeric(as.character(CodeValue))))
head(xcrea)
ungroup(xcrea)
crea<-merge(crea,as.data.frame(xcrea),all.x=TRUE)
save(crea,file="crearecleaned.rda")
######################################################
#Add demographic variables from lookup tables
#LSOA and age already added in SIR
#ASSIGN AGE
crea$Age<-(as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d"))))-crea$BirthYear
#CODE ETHNICITY
ethnic.data<-read.table("ethnic.data.csv",header=TRUE,sep=",")
ethnic.data$Category<-floor(ethnic.data$Category)
crea<-merge(crea,ethnic.data,by.x="Ethnicity",by.y="ClinCode2",all.x=TRUE, all.y=FALSE)
names(crea)
crea<-subset(crea, select=-c(Ethnicity,ClinCode1,EntryPeriod,Rubric))
colnames(crea)[which(names(crea) == "Category")] <- "Ethnicity"
save(crea, file = "crea.ongoing.Rdata")
levels(unique(as.factor(crea$PatientID)))
#6918 remaining with hf as an adult, all same day duplicates removed.
#No other duplicate pathology data removed, no data time range applied.
#COHORT SELECTION
#LIMIT TO PATIENTS WITH AT LEAST 2 POST 2008 CREATININE TEST VALUES
crea<-crea[as.numeric(year(strptime(crea$event.date, format="%Y-%m-%d")))>=2008,]
table(crea$PatientID) < 2 -> rare
rownames(as.matrix(rare)) -> ids
table(rare)
crea[!(crea$PatientID %in% ids[rare]),] -> crea.rep
levels(unique(as.factor(crea.rep$PatientID))) # 6589 non-dialysis adult hf patients have 2 or more post-2008 creatinine tests
length(crea.rep$PatientID) #284233 remaining
sir.data<-sir.data[sir.data$PatientID %in% crea.rep$PatientID,]
#Breakpoint
#######################################################
save(crea.rep, file = "crea.rephf2tests.Rdata")
save(sir.data, file = "sir.datahf2tests.Rdata")
#######################################################
#ADD IN VARIOUS CONDITIONS AND FLAGS FROM MAIN EHR FILES USING R SCRIPT IN SAME FOLDER
#ADD PRESCRIPTION DATA USING R SCRIPT IN SAME FOLDER
#Narrow lookup table to the full list of codes of interest to speed up processing
#temp = list.files(pattern="*1.csv")
#for (i in 1:length(temp)) assign(temp[i], read.csv(temp[i]))
#merged <- Reduce(function(x, y) merge(x, y, all=TRUE),
#list(AF1.csv,BMI1.csv,BNP1.csv,BUN1.csv,cessation1.csv,DBP1.csv,Diabetes1.csv,Dialysis1.csv,Haemoglobin1.csv,HeartFailure1.csv, HeartRate1.csv,IHD1.csv,MCV1.csv,Nephrectomy1.csv,NTPROBNP1.csv,PVD1.csv,RM1.csv,SBP1.csv,SerumAlbumin1.csv,SerumPotassium1.csv,SerumSodium1.csv,smoking1.csv,transplant1.csv,UACR1.csv,UAlbumin1.csv,UricAcid1.csv))
#sir.data<-sir.data[which(sir.data$PatientID %in% crea.rep$PatientID & sir.data$ReadCode %in% merged$ReadCode),]
##########################################################################
#CODE MODULES NOT CURRENTLY IN USE:
#CHECK PATIENTS WHICH HAVE AT LEAST 2 TESTS, OPTION FOR TIME RANGE RESTRICTION
#(as.numeric(year(strptime(crea$EntryDate, format="%Y%m%d")))) -> year
#(as.numeric(month(strptime(crea$EntryDate, format="%Y%m%d")))) -> month
#(as.numeric(day(strptime(crea$EntryDate, format="%Y%m%d")))) -> day
#as.Date.character(paste(year,month,day,sep="-")) -> crea$event.date
#aggregate(as.numeric(year(strptime(crea.rep$event.date, format="%Y-%m-%d"))), list(crea.rep$PatientID), range) -> ranges
#ranges$x[,2] - ranges$x[,1] -> ranges$range
#ranges[(which(ranges$range<1)),1] -> range_short_ids # define exclusion range if time range restriction needed
#crea.rep[-which(crea.rep$PatientID %in% range_short_ids),]->crea.rep
#IF DELETING DELAYED PATHOLOGY DUPLICATES
#names(sir.data)
#d<-sir.data[duplicated(sir.data[,c(1,2,6,15)])&sir.data$ReadCode %in% Pathology1.csv$ReadCode,c(1,2,6,7,15)]
#d<-unique(d[order(as.Date(d$EntryDate,format="%Y%m%d")),,drop=FALSE,fromLast=FALSE]) #REMAINING DUPLICATES BASED ON SAME VALUE AND CALENDAR MONTH BUT DIFFERENT SOURCE LOCATION
#THE EARLIEST OF THE DUPLICATES IS KEPT.
#Then use strptime (strip time) to convert dates into POSIX format
#date_vec <-strptime(paste(crea.rep$entry.date), "%Y/%m/%d")
#compare observation 1 and 2, 2 and 3, 3 and 4...
#first_date <- date_vec[1:(length(date_vec)-1)]
#second_date <- date_vec[2:length(date_vec)]
#second_gap <- difftime(second_date, first_date, units="days")
#Determine the gaps that are less than 30 days apart.Leave TRUE in to keep 1st instance
#dup_index <- second_gap>10
#dup_index <- c(TRUE, dup_index)
#dat[dup_index, ]