From 788ebf18939c5f628216e40931e7beafb5689bd0 Mon Sep 17 00:00:00 2001
From: jebyrnes
Date: Wed, 13 May 2020 21:54:16 -0400
Subject: [PATCH 1/2] Create covid19R compliant JHU coronavirus dataset

---
 data_raw/data_covid19R.R  |  53 ++++++++++++++++++++
 data_raw/get_code_table.R | 103 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 data_raw/data_covid19R.R
 create mode 100644 data_raw/get_code_table.R

diff --git a/data_raw/data_covid19R.R b/data_raw/data_covid19R.R
new file mode 100644
index 0000000..f3d46d8
--- /dev/null
+++ b/data_raw/data_covid19R.R
@@ -0,0 +1,53 @@
+#----------------------------------------------------
+# Creating a covid19R compliant JHU coronavirus data set
+# using coronavirus data
+# https://github.com/CSSEGISandData/COVID-19
+
+`%>%` <- magrittr::`%>%`
+setwd(here::here())
+source("data_raw/get_code_table.R")
+
+
+# the initial data - reuse git_df when the calling script already created it
+if (!exists("git_df")) git_df <- read.csv("https://raw.githubusercontent.com/RamiKrispin/coronavirus/master/csv/coronavirus.csv",
+                                          stringsAsFactors = FALSE)
+
+# create valid locations
+git_df_long_location <- git_df %>%
+  dplyr::mutate(country = ifelse(country=="Korea, South", "South Korea", country),
+         province = ifelse(province=="Bonaire, Sint Eustatius and Saba",
+                           "Bonaire and Sint Eustatius and Saba", province)
+  ) %>%
+  tidyr::unite(location, province, country, sep = ", ") %>%
+  dplyr::rename(data_type = type,
+         value = cases) %>%
+
+  # fix some bad location names
+  dplyr::mutate(
+    location = gsub("^\\, ", "", location),
+    location_type = ifelse(grepl("\\,", location), "state", "country"))
+
+code_table <- get_code_table()
+
+# add codes (location is the only column the two tables share)
+coronavirus_covid19 <- dplyr::left_join(git_df_long_location, code_table, by = "location")
+
+# fix data types
+coronavirus_covid19 <- coronavirus_covid19 %>%
+  dplyr::mutate(data_type = dplyr::case_when(
+    data_type == "confirmed" ~ "cases_new",
+    data_type == "recovered" ~ "recovered_new",
+    data_type == "death" ~ "deaths_new",
+
+  ))
+
+
+# data checks
+sum(is.na(coronavirus_covid19$location_code)) #make sure codes combine a-ok - will be >0 due to cruise ships
+nrow(coronavirus_covid19)- nrow(git_df_long_location) #should be 0, or there was a one to many match
+
+# write out
+#write.csv(coronavirus_covid19, "csv/coronavirus_covid19format.csv", row.names = FALSE)
+print("covid19R compliant data done...")
+
+
diff --git a/data_raw/get_code_table.R b/data_raw/get_code_table.R
new file mode 100644
index 0000000..bb71ca6
--- /dev/null
+++ b/data_raw/get_code_table.R
@@ -0,0 +1,103 @@
+# Build the lookup table that maps JHU location names to ISO 3166-2 codes.
+# Returns a data frame with columns location, location_code, location_code_type.
+get_code_table <- function() {
+  # bind the pipe locally so the function does not rely on the caller's globals
+  `%>%` <- magrittr::`%>%`
+
+  # get iso 3166 2 codes
+  iso_codes <-
+    read.csv(
+      "https://github.com/olahol/iso-3166-2.js/raw/master/data.csv",
+      col.names = c(
+        "Country",
+        "iso_3166_2",
+        "name",
+        "type",
+        "Country_iso_3166_2"
+      ),
+      na.strings = ".", # presumably keeps Namibia's "NA" code from being read as missing
+      stringsAsFactors = FALSE # keep names as character for ifelse()/paste() on R < 4.0
+    ) %>%
+    dplyr::mutate(
+      name = ifelse(
+        iso_3166_2 == "NL-BQ1",
+        "Bonaire and Sint Eustatius and Saba",
+        name
+      ),
+      Country = ifelse(iso_3166_2 == "VG-VG", "British Virgin Islands", Country),
+      Country = ifelse(
+        Country_iso_3166_2 == "CD",
+        "The Democratic Republic Of The Congo",
+        Country
+      ),
+      Country = ifelse(Country_iso_3166_2 == "CZ", "Czechia", Country),
+    )
+
+  country_code <- iso_codes %>%
+    dplyr::mutate(location = Country) %>%
+    dplyr::group_by(location) %>%
+    dplyr::summarize(location_code = Country_iso_3166_2[1]) %>%
+    dplyr::bind_rows(
+      data.frame(location = "Cabo Verde", location_code = "CV"),
+      data.frame(location = "Greenland, Denmark", location_code = "GL"),
+      data.frame(location = "Cote d'Ivoire", location_code = "CI")
+    ) # do not know why these were missing
+
+  province_code <- iso_codes %>%
+    dplyr::mutate(type = ifelse(iso_3166_2 == "NL-AW", "Province", type)) %>% # problem with Aruba
+    dplyr::filter(type != "Country") %>%
+    dplyr::mutate(location = paste(name, Country, sep = ", ")) %>%
+    dplyr::group_by(location) %>%
+    dplyr::summarize(location_code = iso_3166_2[1]) %>%
+    dplyr::bind_rows(
+      data.frame(location = "Channel Islands, United Kingdom", location_code = "GB-CHA"),
+      data.frame(location = "Tibet, China", location_code = "CN-XZ"),
+      data.frame(location = "Inner Mongolia, China", location_code = "CN-NM")
+    )
+
+  code_table <- dplyr::bind_rows(country_code, province_code) %>%
+    dplyr::mutate(
+      location_code_type = "iso_3166_2",
+      location = dplyr::case_when(
+        location == "Cayman Islands" ~ "Cayman Islands, United Kingdom",
+        location == "Anguilla" ~ "Anguilla, United Kingdom",
+        location == "Kinshasa, The Democratic Republic Of The Congo" ~ "Congo (Kinshasa)",
+        location == "Brazzaville, Congo" ~ "Congo (Brazzaville)",
+        location == "Brunei Darussalam" ~ "Brunei",
+        location == "Myanmar" ~ "Burma",
+        location == "Falkland Islands" ~ "Falkland Islands (Malvinas), United Kingdom",
+        location == "Swaziland" ~ "Eswatini",
+        location == "Bermuda" ~ "Bermuda, United Kingdom",
+        location == "Curaçao" ~ "Curacao, Netherlands",
+        location == "French Polynesia" ~ "French Polynesia, France",
+        location == "British Virgin Islands" ~ "British Virgin Islands, United Kingdom",
+        location == "Faroe Islands" ~ "Faroe Islands, Denmark",
+        location == "French Guiana" ~ "French Guiana, France",
+        location == "Gibraltar" ~ "Gibraltar, United Kingdom",
+        location == "Vatican City" ~ "Holy See",
+        location == "Isle of Man" ~ "Isle of Man, United Kingdom",
+        location == "Kosovo-Metohija, Serbia" ~ "Kosovo",
+        location == "Macau" ~ "Macau, China",
+        location == "Montserrat" ~ "Montserrat, United Kingdom",
+        location == "New Caledonia" ~ "New Caledonia, France",
+        location == "Macedonia, the Former Yugoslav Republic Of" ~ "North Macedonia",
+        location == "Reunion" ~ "Reunion, France",
+        location == "Saint-Barthélemy, France" ~ "Saint Barthelemy, France",
+        location == "Saint Kitts And Nevis" ~ "Saint Kitts and Nevis",
+        location == "Saint-Pierre-et-Miquelon, France" ~ "Saint Pierre and Miquelon, France",
+        location == "Saint Vincent And The Grenadines" ~ "Saint Vincent and the Grenadines",
+        location == "St. Maarten" ~ "Sint Maarten, Netherlands",
+        location == "Korea, Republic of" ~ "South Korea",
+        location == "Saint-Martin, France" ~ "St Martin, France",
+        location == "Taiwan" ~ "Taiwan*",
+        location == "East Timor" ~ "Timor-Leste",
+        location == "Turks & Caicos Islands" ~ "Turks and Caicos Islands, United Kingdom",
+        location == "United States" ~ "US",
+        location == "Viet Nam" ~ "Vietnam",
+        location == "Gaza, Palestine" ~ "West Bank and Gaza",
+        TRUE ~ location
+      )
+    )
+
+  code_table
+}
From 858cebeeaf44d7c2dd5abd7688f21dffc46b84e8 Mon Sep 17 00:00:00 2001
From: jebyrnes
Date: Wed, 13 May 2020 22:08:18 -0400
Subject: [PATCH 2/2] Adding covid19R format

---
 R/refresh_coronavirus.R  | 56 ++++++++++++++++++++++++++++++++++++++++
 data_raw/data_covid19R.R | 34 ++++++++++++++----------
 2 files changed, 77 insertions(+), 13 deletions(-)
 create mode 100644 R/refresh_coronavirus.R

diff --git a/R/refresh_coronavirus.R b/R/refresh_coronavirus.R
new file mode 100644
index 0000000..83e5fec
--- /dev/null
+++ b/R/refresh_coronavirus.R
@@ -0,0 +1,56 @@
+#' Refresh the 2019 Novel Coronavirus COVID-19 (2019-nCoV) Dataset in the Covid19R Project Format
+#'
+#' Daily summary of the Coronavirus (COVID-19) cases by state/province.
+#' @return A data.frame object with the following columns:
+#' * date - The date in YYYY-MM-DD form
+#' * location - The name of the location as provided by the data source.
+#' * location_type - The type of location using the covid19R controlled vocabulary.
+#' * location_code - A standardized location code using a national or international standard. Drawn from \href{https://github.com/olahol/iso-3166-2.js/}{iso-3166-2.js}'s version
+#' * location_code_type - The type of standardized location code being used according to the covid19R controlled vocabulary. Here we use `iso_3166_2`
+#' * data_type - the type of data in that given row using the covid19R controlled vocabulary. Includes cases_new, deaths_new, recovered_new.
+#' * value - number of cases of each data type
+#' @export refresh_coronavirus_jhu
+#' @seealso \code{\link{get_info_coronavirus}}
+#' @source coronavirus - Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE) Coronavirus \href{https://systems.jhu.edu/research/public-health/ncov/}{website}
+#'
+#' @examples
+#' \dontrun{
+#' # update the data
+#' jhu_covid19_dat <- refresh_coronavirus_jhu()
+#' }
+#'
+refresh_coronavirus_jhu <- function(){
+  utils::read.csv("https://raw.githubusercontent.com/RamiKrispin/coronavirus/master/csv/coronavirus_covid19format.csv",
+                  stringsAsFactors = FALSE)
+}
+
+
+
+#' Get information about the datasets provided by the coronavirus package
+#'
+#' @description Returns information about the datasets in this package for covid19R harvesting
+#'
+#' @return a data.frame of information about the datasets in this package
+#' @export get_info_coronavirus
+#'
+#' @examples
+#' \dontrun{
+#'
+#' # get the dataset info from this package
+#' get_info_coronavirus()
+#' }
+#'
+get_info_coronavirus <- function(){
+  data.frame(
+    data_set_name = "coronavirus_jhu",
+    package_name = "coronavirus",
+    function_to_get_data = "refresh_coronavirus_jhu*",
+    data_details = "The 2019 Novel Coronavirus COVID-19 (2019-nCoV) Dataset from the Johns Hopkins University Center for Systems Science and Engineering",
+    data_url = "https://systems.jhu.edu/research/public-health/ncov/",
+    license_url = "https://github.com/CSSEGISandData/COVID-19/",
+    data_types = "cases_new, recovered_new, deaths_new",
+    location_types = "country, state",
+    spatial_extent = "global",
+    has_geospatial_info = TRUE
+  )
+}
diff --git a/data_raw/data_covid19R.R b/data_raw/data_covid19R.R
index f3d46d8..70f2132 100644
--- a/data_raw/data_covid19R.R
+++ b/data_raw/data_covid19R.R
@@ -14,18 +14,23 @@
 
 # create valid locations
 git_df_long_location <- git_df %>%
-  dplyr::mutate(country = ifelse(country=="Korea, South", "South Korea", country),
-         province = ifelse(province=="Bonaire, Sint Eustatius and Saba",
-                           "Bonaire and Sint Eustatius and Saba", province)
-  ) %>%
+  dplyr::mutate(
+    country = ifelse(country == "Korea, South", "South Korea", country),
+    province = ifelse(province == "Bonaire, Sint Eustatius and Saba",
+      "Bonaire and Sint Eustatius and Saba", province
+    )
+  ) %>%
   tidyr::unite(location, province, country, sep = ", ") %>%
-  dplyr::rename(data_type = type,
-         value = cases) %>%
+  dplyr::rename(
+    data_type = type,
+    value = cases
+  ) %>%
 
   # fix some bad location names
   dplyr::mutate(
     location = gsub("^\\, ", "", location),
-    location_type = ifelse(grepl("\\,", location), "state", "country"))
+    location_type = ifelse(grepl("\\,", location), "state", "country")
+  )
 
 code_table <- get_code_table()
 
@@ -38,16 +43,19 @@ coronavirus_covid19 <- coronavirus_covid19 %>%
     data_type == "confirmed" ~ "cases_new",
     data_type == "recovered" ~ "recovered_new",
     data_type == "death" ~ "deaths_new",
-
   ))
 
+coronavirus_covid19 <- coronavirus_covid19 %>%
+  dplyr::select(
+    date, location, location_type,
+    location_code, location_code_type,
+    data_type, value, lat, long
+  )
 
 # data checks
-sum(is.na(coronavirus_covid19$location_code)) #make sure codes combine a-ok - will be >0 due to cruise ships
-nrow(coronavirus_covid19)- nrow(git_df_long_location) #should be 0, or there was a one to many match
+sum(is.na(coronavirus_covid19$location_code)) # make sure codes combine a-ok - will be >0 due to cruise ships
+nrow(coronavirus_covid19) - nrow(git_df_long_location) # should be 0, or there was a one to many match
 
 # write out
-#write.csv(coronavirus_covid19, "csv/coronavirus_covid19format.csv", row.names = FALSE)
+# write.csv(coronavirus_covid19, "csv/coronavirus_covid19format.csv", row.names = FALSE)
 print("covid19R compliant data done...")
-
-