covid19/deaths-and-cases-per-day-histogram-faceted-fixed-scale.R at master · lawrencewalters/covid19 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# Deaths per day histogram ECDC data
number_of_days <- 20
regression_days <- 10
#countries <- c('DE-BER','DE')
#countries <- c('DE','IT','UK','SE')
countries <- c('US','CA','MX')
#countries <- c('NORAM')
#countries <- c('DE-BER', 'DE','US','CA','MX','IT','ES','UK')
#countries <- c('ES')

Sys.setenv(TZ="Europe/Berlin")

library(reshape2)
library(ggplot2)
library(dplyr)
library(broom)
library(scales)
library(ggpmisc)

source("load_data.R")
source("utilities.R")

ecdc_list <- EcdcData(countries)
ecdc <- ecdc_list$data
latest_data_date <- ecdc_list$latest_data_date
retrieved_date <- ecdc_list$retrieved_date

#berlin <- BerlinData("data\\daily.csv")
berlin <- BerlinData("https://raw.githubusercontent.com/jakubvalenta/covid-berlin-data/master/covid_berlin_data_incl_hospitalized.csv")

ecdc <- rbind(berlin, ecdc)
ecdc <- filter(ecdc, geoId %in% countries)

# deaths per day graphs
ecdc <- mutate(ecdc,
               date = as.Date(dateRep, '%Y-%m-%d',tz='Europe/Berlin'),
               wday = as.POSIXlt(date, tz='Europe/Berlin')$wday,
               wkday = ifelse(wday == 0 | wday == 6,0.9,1))

ecdc <- filter(ecdc, date > latest_data_date - number_of_days)
dflong <- melt(ecdc, id.vars = c("dateRep","day","month","year","countriesAndTerritories","geoId","countryterritoryCode","popData2019","date","wday","wkday"),
               measure.vars = c("cases","deaths"))

# calculate linear regression so that we can display p-value on graphs as labels
regressions <- dflong %>%
  filter(date > latest_data_date - regression_days) %>%
  group_by(geoId,variable) %>%
  do(model = lm(value ~ date, data = .)) %>%
  tidy(model)

labels <- ecdc[,c('geoId','countriesAndTerritories','deaths')] %>%
  group_by(geoId) %>%
  top_n(1,deaths)

# get just one row from regressions (date is arbitrary)
labels <- regressions %>%
  filter(term=="date") %>%
  inner_join(labels)

labels <- mutate(labels,
                 labelText = paste('p =',
                                   format(`p.value`,
                                          digits=2,
                                          nsmall = 1)))

fixed_scales <- data.frame(variable=c("cases","cases","deaths","deaths"), x = c(latest_data_date - number_of_days, latest_data_date), y = c(0,max(subset(dflong, variable == "cases")$value),0,max(subset(dflong, variable == "deaths")$value)))

ggplot(dflong,
       aes(x = date,
           y = value,
           fill = variable,
           alpha=wkday
           )) +
  geom_col(position="dodge2") +
  # linear regression line, only for specific recent timeframe
  geom_smooth(method='lm',formula=y ~ x, se = TRUE,
              data = subset(dflong,
                            (date > latest_data_date - regression_days))) +
  # x-axis as dates
  scale_x_date(date_labels = "%m.%d",
               breaks = "60 days",
               minor_breaks = "30 day",
               name = "Date. Weekends shaded lighter"
  ) +
  # x-axis written vertically
  theme(axis.text.x = element_text(angle = -90, vjust = 0.3),
        legend.position = "none") +
  # show the p-value in the upper right of each graph
  geom_text_npc(data=labels,aes(npcx = 1, npcy = 1, label = labelText)) +
  # graph per country, for both cases and deaths, let y axis be dynamic
  facet_wrap(vars(variable, geoId),
             scales = "free_y",
             ncol = length(countries),
             labeller = LabellerForGeoIDs()) +
  # zero bound y scale (regressions sometimes go negative), use commas in y scale labels
  scale_y_continuous(NULL, label=comma, limits = c(0,NA)) +
  # fixed y-axis per facet (cases, deaths)
  geom_blank(data = fixed_scales, aes(x = x, y = y, alpha=1)) +
  scale_fill_brewer(palette = "Dark2") +
  # weekdays shaded lighter
  scale_alpha(range = c(0.4, 1)) +
  # legend
  labs(title = paste("Trends by country in cases and deaths per day over time as of"
                ,latest_data_date),
          subtitle = paste("p value from linear regression on most recent",
                           regression_days,"days."),
          caption= paste("Source: Berlin - berlin.de, all others ECDC data",
                           latest_data_date,
                           "retrieved",
                           format.Date(retrieved_date, "%Y-%m-%d %H:%M:%S %Z")))