-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNHS_Master_Postcode_Script.r
132 lines (108 loc) · 5.26 KB
/
NHS_Master_Postcode_Script.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Extract Postcode Information from NHSPD dataset (on GOV UK)
# Download location Office for National Statistics data - NHS Digital
# https://digital.nhs.uk/services/organisation-data-service/data-downloads/office-for-national-statistics-data
# V1.0 - 140521 - Sarah Eaglesfield
# **** INSTRUCTIONS ****
#There are 41 columns of data in the dataset. These are detailed on P27-P40 the User Guide PDF.
#To Add An Additional Column
#1. Download the corresponding lookup table from the above URL
#2. Change the col_type from col_skip to its actual type given in comments on the sam line
#3. Add the file location to the fnames list
# **** TROUBLESHOOTING ****
#1. Make Sure The Path of the Filename is Set Correctly Relative to the Base Directory for R
#2. Some NHS Lookup Tables have 3 columns - drop the unneeded column on an adhoc basis
# **** MASTER COLUMN LIST - LOAD COLUMS THAT WILL BE EXPORTED AND SKIP OTHERS ****
col_types = {cols(
"PCD2" = col_character(),
"PCDS" = col_skip(), #col_character()
"DOINTR" = col_skip(), #col_double()
"DOTERM" = col_double(),
"OSEAST100M" = col_skip(), #col_double()
"OSNRTH100M" = col_skip(), #col_double()
"OSCTY" = col_character(),
"ODSLAUA" = col_skip(), #col_character()
"OSLAUA" = col_character(),
"OSWARD" = col_character(),
"USERTYPE" = col_skip(), #col_double()
"OSGRDIND" = col_skip(), #col_double()
"CTRY" = col_character(),
"OSHLTHAU" = col_skip(), #col_character()
"RGN" = col_skip(), #col_character()
"OLDHA" = col_skip(), #col_character()
"NHSER" = col_character(),
"CCG" = col_character(),
"PSED" = col_skip(), #col_character()
"CENED" = col_skip(), #col_character()
"EDIND" = col_skip(), #col_character()
"WARD98" = col_skip(), #col_character()
"OA01" = col_skip(), #col_character()
"NHSRLO" = col_skip(), #col_character()
"HRO" = col_skip(), #col_character()
"LSOA01" = col_skip(), #col_character()
"UR01IND" = col_skip(), #col_character()
"MS0A01" = col_skip(), #col_character()
"CANNET" = col_skip(), #col_character()
"SCN" = col_character(),
"OSHAPREV" = col_skip(), #col_character()
"OLDPCT" = col_skip(), #col_character()
"OLDHRO" = col_skip(), #col_character()
"PCON" = col_skip(), #col_character()
"CANREG" = col_skip(), #col_character()
"PCT" = col_skip(), #col_character()
"OSEAST1M" = col_skip(), #col_double()
"OSNRTH1M" = col_skip(), #col_double()
"OA11" = col_skip(), #col_character()
"LSOA11" = col_skip(), #col_character()
"MSOA11" = col_character(),
"CALNCV" = col_skip(), #col_character()
"STP" = col_skip() #col_character()
)}
#Start The Timer
tictoc::tic()
#Read The Postcode Data from the GRIDALL CSV in the Data Folder
postcodes <- read_csv("Data/gridall.csv",
col_names = names(col_types$cols),
col_types = col_types)
#Drop Postcodes That Have Expired then Drop the Column As It Is No Longer Needed
postcodes[is.na(postcodes$DOTERM), ]
postcodes$DOTERM <- NULL
#List of lookup table CSVs for each element used in final Csv
fnames <- c("Documents/Names and Codes/County names and codes EN as at 04_20.csv", #OSCTY
"Documents/Names and Codes/LA_UA names and codes UK as at 04_21.csv", #OOSLAUA
"Documents/Names and Codes/Ward names and codes UK as at 12_19.csv", #OSWARD
"Documents/Names and Codes/Country names and codes UK as at 08_12.csv", #CTRY
"Documents/Names and Codes/NHSER names and codes EN as at 04_19.csv", #NHSER
"Documents/Names and Codes/CCG names and codes EN as at 04_20.csv", #CCG
"Documents/Names and Codes/SCN names and codes EN as at 04_13.csv", #SCN
"Documents/Names and Codes/MSOA (2011) names and codes UK as at 12_12.csv") #MSOA11
#Columnames from Postcodes
cnames <- colnames(postcodes)
#A list of all Dataframes
dframes <- paste("df",1:length(fnames),sep="")
dframes <- lapply(strsplit(as.character(dframes),split=','),trimws)
#Columns B and J contain the matches
match_columns <- c("b", "j")
#Loop over each filename in the list
for(i in 1:length(fnames)) {
p=i+1
q=p-1
#Load lookup data
lookup <- read_csv(fnames[i])
#Push a column name to merge
names(lookup) <- c(cnames[p])
#Merge Postcodes with the Lookup Table
d <- merge(postcodes, lookup, by.x = cnames[p], by.y =cnames[p], all = TRUE, suffixes = c("by"))
colnames(d) <- letters[1:length(d)]
d <- d[match_columns]
#Create a dataframe for each file name (later delted)
assign(paste("df", i, sep = ""), d)
}
#Need to change this to a loop
final <- df1 %>% inner_join(df2, by = "b") %>% inner_join(df3, by = "b") %>% inner_join(df4, by = "b") %>% inner_join(df5, by = "b") %>% inner_join(df6, by = "b") %>% inner_join(df7, by = "b") %>% inner_join(df8, by = "b")
#Friendly Column Names
names(final) <- c("Postcode","County", "Local Authority District", "Ward", "Country", "NHS England Region", "Clinical Commissioning Group", "Strategic Clinical Network", "MSOA")
#Write Final File
write_csv(final, "Output/NHS_UK_Postcodes_Master.csv")
tictoc::toc()
#Clean Up Memory - Comment Out To See Underlying Datasets
rm(list=setdiff(ls(), "final"))