-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.R
More file actions
75 lines (69 loc) · 3.55 KB
/
load_data.R
File metadata and controls
75 lines (69 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# load data from external sources
# Get ECDC COVID-19 data for the given vector of geoId codes (countries).
#
# The ECDC published a daily .xlsx whose URL embeds the publication date,
# so this probes backwards one day at a time (starting from tomorrow, so
# the first attempt is today's file) until a URL responds with non-404.
#
# countries     - character vector of geoId codes to keep (e.g. c("DE"))
# max_days_back - give up after this many 404s instead of looping forever
#
# Returns a list:
#   data             - rows of the ECDC sheet filtered to `countries`
#   retrieved_date   - Date header of the successful HTTP response
#   latest_data_date - the date embedded in the URL that was found
EcdcData <- function(countries, max_days_back = 30) {
  library(readxl)
  library(httr)
  library(dplyr)  # filter() below requires dplyr; it was never loaded before
  retrieved_date <- Sys.time()
  latest_data_date <- Sys.Date() + 1
  # reuse a single temp file rather than allocating one per attempt
  tf <- tempfile(fileext = ".xlsx")
  status <- 404L
  tries <- 0L
  while (status == 404L) {
    tries <- tries + 1L
    if (tries > max_days_back) {
      stop("no ECDC data file found within ", max_days_back, " days",
           call. = FALSE)
    }
    latest_data_date <- latest_data_date - 1
    # NOTE: "disbtribution" is a typo in the real ECDC URL - do not "fix" it
    url <- paste0(
      "https://www.ecdc.europa.eu/sites/default/files/documents/",
      "COVID-19-geographic-disbtribution-worldwide-",
      format(latest_data_date, "%Y-%m-%d"), ".xlsx"
    )
    resp <- GET(url, authenticate(":", ":", type = "ntlm"),
                write_disk(tf, overwrite = TRUE))
    status <- resp[["status_code"]]
    retrieved_date <- resp[["date"]]
  }
  # read the Dataset sheet into R
  full_ecdc <- read_excel(tf)
  return(list("data" = filter(full_ecdc, geoId %in% countries),
              "retrieved_date" = retrieved_date,
              "latest_data_date" = latest_data_date))
}
# Read Berlin COVID-19 numbers from a CSV (URL or local file) and return a
# data frame whose columns match the ECDC layout.
#
# The Berlin file holds cumulative counts, so this delegates to CSVData()
# (which this function previously duplicated line-for-line) with Berlin's
# identifiers. Note the Berlin feed uses the national code "DE", not
# CSVData's default "BER".
#
# filename - path or URL of a CSV with columns: date, cases, deaths
BerlinData <- function(filename) {
  return(CSVData(filename,
                 cumulativeCounts = TRUE,
                 countriesAndTerritories = "Berlin",
                 geoId = "DE-BER",
                 countryterritoryCode = "DE",
                 popData2019 = 3769495,
                 continentExp = "Europe"))
}
# Read a CSV of COVID-19 counts and reshape it to match the ECDC format,
# so local data can sit alongside EcdcData() output. Defaults describe
# Berlin (population from 2019).
#
# filename         - path or URL of a CSV with columns: date, cases, deaths
# cumulativeCounts - TRUE if the file holds running totals; they are then
#                    differenced into per-day counts (first row becomes NA)
# countriesAndTerritories, geoId, countryterritoryCode, popData2019,
# continentExp     - constant identifier columns stamped onto every row
#
# Returns a data frame with the ECDC column set; the 14-day-incidence
# column is filled with a placeholder value of 1.0.
CSVData <- function(filename, cumulativeCounts = TRUE, countriesAndTerritories = "Berlin", geoId = "DE-BER", countryterritoryCode = "BER", popData2019 = 3769495, continentExp = "Europe" ) {
  library(dplyr)
  df <- read.csv(filename, header = TRUE)
  # file may carry cumulative numbers per day - difference into dailies
  if (cumulativeCounts) {
    df <- mutate(df, cases = cases - lag(cases))
    df <- mutate(df, deaths = deaths - lag(deaths))
  }
  # parse the date and stamp on the constant ECDC identifier columns
  df <- mutate(df,
               dateRep = as.Date(date, '%Y-%m-%d'),
               day = as.double(format.Date(dateRep, "%e")),
               month = as.double(format.Date(dateRep, "%m")),
               year = as.double(format.Date(dateRep, "%Y")),
               countriesAndTerritories = countriesAndTerritories,
               geoId = geoId,
               countryterritoryCode = countryterritoryCode,
               popData2019 = popData2019,
               continentExp = continentExp,
               "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000" = 1.0
  )
  # backtick-quote the non-syntactic column name instead of get(): get()
  # only worked via subset()'s internal name->index environment (fragile)
  return(subset(df, select = c(dateRep, day, month, year, cases, deaths,
                               countriesAndTerritories, geoId,
                               countryterritoryCode, popData2019, continentExp,
                               `Cumulative_number_for_14_days_of_COVID-19_cases_per_100000`)))
}