Let's Plot 8: (Animated) US State Covid-19 Case Count

Load packages, pull data

2020 03 30 Update

CSSE changed their data structure, so I’ve updated the document.

I was using their “time series” data, but they dropped the US-specific (with state by state info) documents.

So we’ll have change the import process to use their “daily reports”

I’ve added one more plot - the deaths by state with overlays showing the future projections if deaths double every 2 or 4 days.

library(tidyverse)
library(gganimate)
library(ggrepel)
library(pals)
library(purrr)

start = as.Date('2020-01-22')
end = as.Date('2020-03-29')
files = paste0('https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_daily_reports/', format(seq(start,end, by = 1), format = '%m-%d-%Y'), '.csv')

names(files) <- format(seq(start,end, by = 1))

# use purrr to read_csv each individual file and make a list of data frames
covid_data_list <- files %>% 
  map(read_csv)

# loop through each data frame and clean column names
# add date to each individual data frame
# only keep state, country, confirmed, deaths, recovered
for (set in names(covid_data_list)){
  colnames(covid_data_list[[set]]) <- gsub('/','_', colnames(covid_data_list[[set]]))
  covid_data_list[[set]] <- covid_data_list[[set]] %>% 
    select(Province_State, Country_Region, Confirmed, Deaths, Recovered) %>% 
    mutate(Date = as.Date(set))
}

# collapse list of data frames to one data frame
covid_data <- covid_data_list %>% 
  bind_rows()

# make state_table to convert two letter states (eg IA) to full name (eg Iowa)
state_table <- cbind(state.abb, state.name) %>% as_tibble()
# make USA data and clean up state names and remove the cruise ship data (and a few "Wuhan Evacuee")
covid_usa <- covid_data %>% 
  filter(Country_Region == 'US') %>%
  filter(!grepl('Diamond|Princess|Wuhan', Province_State)) %>% 
  mutate(State = case_when(grepl(',', `Province_State`) ~ str_extract(`Province_State`, ', \\w\\w') %>% 
                             gsub('\\.| |,','',.) %>% 
                             toupper(),
                           TRUE ~ `Province_State`)) %>% 
  left_join(., state_table, by = c('State' = 'state.abb')) %>% 
  # swap in full name into State column
  mutate(State = case_when(is.na(state.name) ~ State,
                           TRUE ~ state.name)) %>% 
  # fix a few more inconsistent state / region names
  mutate(State = case_when(State == 'DC' ~ 'District of Columbia',
                           State == 'United States Virgin Islands' ~ 'Virgin Islands',
                           State == 'Chicago' ~ 'Illinois',
                           TRUE ~ State)) %>% 
  filter(!State %in% c('US','Recovered')) %>% 
  group_by(State, Date) %>% 
  # now summarise "redundant" state entries into
  summarise(Cases = sum(Confirmed),
            Deaths = sum(Deaths),
            Recovered = sum(Recovered))

Plotter function

This is a fairly complex function as it:

  • overlays color lines by top_states, given by user, on top of all data (in gray)
  • labels the color lines with ggrepel (but only at the end!)
  • uses a cowplot theme, with tweaked grid lines
  • custom x and y labels
  • expand x lim to fit the state names
covid19_plotter <- function(clean_data, 
                            type = Cases, 
                            top_states = c('Washington', 'New York', 'California'), 
                            xlab = '', 
                            ylab = '', 
                            title = ''){
  plot <- clean_data %>% 
    filter(State %in% top_states) %>% 
    ggplot(aes(x=Date,y= {{ type }}, 
               group = State, 
               color = State)) + 
    geom_line(data = clean_data, aes(x=Date, y = {{ type }}, group = State), color = 'gray') +
    geom_line(size = 2) +
    ggrepel::geom_text_repel(data = clean_data %>% 
                               filter(State %in% top_states) %>% 
                               filter(Date == max(Date)),
                             aes(x=Date, y = {{ type }}, label = State),
                             direction = 'y',
                             nudge_x = 3,
                             hjust = 0,
                             segment.size = 0.5,
                             force = 2,
                             size = 5) +
    cowplot::theme_cowplot() +
    guides(color=FALSE) +
    scale_color_manual(values = as.vector(pals::glasbey(26))) +
    theme(panel.grid.major.y = element_line(linetype = 'dotted')) +
    scale_y_continuous(breaks = c(10, 100,1000,5000,10000, 20000, 60000)) +
    scale_x_date(date_labels = "%b %d",
                 breaks = c(as.Date('2020-02-01'),
                            as.Date('2020-02-15'),
                            as.Date('2020-03-01'),
                            as.Date('2020-03-15'),
                            max(clean_data$Date)
                 )) +
    expand_limits(x = c(min(clean_data$Date),max(clean_data$Date) + 7 )) +
    ylab(ylab) + xlab(xlab) + ggtitle(title)
  plot
}

Cases by state

Important: We don’t know the denominator (number of tests given). So New York may not have far more cases than the rest of the other states, but rather they are testing far more.

I’ve overwritten the function-specified y scale (hence the ggplot warning).

# find top n (by sum cases) states
top_n <- covid_usa %>% filter(Date == max(Date)) %>% arrange(-Cases) %>% head(9) %>% pull(State)

covid19_plotter(covid_usa, 
                type = Cases,
                top_states = top_n,
                ylab = 'Cases',
                xlab = paste0('\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                title = 'USA COVID-19 Cases by State') +
  scale_y_continuous(breaks = c(100,1000,5000,10000, 20000, 40000, 60000))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Cases, with log10 scaling

Log10 scaling helps see the trends between states better

covid19_plotter(covid_usa %>% filter(Cases > 0), 
                type = Cases,
                top_states = top_n,
                ylab = 'log10 Cases',
                xlab = paste0('\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                title = 'USA COVID-19 Cases by State') +
  coord_trans(y = 'log10')

Deaths by state (log10 scaled)

Deaths may be a more useful metric as states may not be testing at comparable rates

# find top n (by sum cases) states
top_n <- covid_usa %>% filter(Date == max(Date)) %>% arrange(-Deaths) %>% head(9) %>% pull(State)
covid19_plotter(covid_usa %>% filter(Deaths > 0), 
                type = Deaths,
                top_states = top_n,
                ylab = 'Deaths',
                xlab = paste0('\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                title = 'COVID-19 Deaths by US State') +
  coord_trans(y = 'log10') +
  scale_y_continuous(breaks = c(10,25, 50, 100, 250, 500, 1000))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Deaths by state, animated

This is extremely easy to do, thanks to the gganimate package (written by Thomas Lin Pedersen)

animation <- covid19_plotter(covid_usa %>% filter(Deaths > 0), 
                             type = Deaths,
                             top_states = top_n,
                             ylab = 'Deaths',
                             xlab = paste0('\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                             title = 'COVID-19 Deaths by US State') + 
  coord_trans(y = 'log10') +
  scale_y_continuous(breaks = c(5, 10,25, 50, 100, 250, 500, 1000)) +
  transition_reveal(Date) + 
  view_follow(fixed_y = TRUE, 
              fixed_x = TRUE)

animate(animation, 
        height = 400, 
        width = 600, 
        end_pause = 30)

Shift plot

Many visualizations (e.g. NYTimes, Financial Times) are plotting (on the x-axis) Days Past n Deaths so you can better compare the infection rate across different locations.

Let’s give that a try.

We need to find, for each state, the first Date where Deaths >= 5.

We see the state of Washington was about two weeks ahead of anyone else

day0 <- covid_usa %>% 
  filter(Deaths >= 5) %>% 
  group_by(State) %>% 
  summarise(Day0 = min(Date))
day0 %>% arrange(Day0)
## # A tibble: 37 x 2
##    State      Day0      
##    <chr>      <date>    
##  1 Washington 2020-03-02
##  2 California 2020-03-14
##  3 Florida    2020-03-16
##  4 New York   2020-03-16
##  5 Georgia    2020-03-19
##  6 Louisiana  2020-03-19
##  7 New Jersey 2020-03-19
##  8 Texas      2020-03-19
##  9 Illinois   2020-03-20
## 10 Michigan   2020-03-21
## # … with 27 more rows

Transform Data and plot

We filter the data to only retain States with >= 5 deaths

Then we left join the day0 table we made above

We do math (yes, R can do math on Dates), creating a new Date where Date = Date - Day0 (Day0 is first Date in state where COVID19 deaths >= 5).

As our new Date is a time, we swap out the x axis scale for scale_x_continuous

We see that New York, Michigan, Lousiana, and New Jersey are on similar trajectories.

covid19_plotter(covid_usa %>% 
                  filter(State %in% day0$State,
                         Deaths >= 5) %>% 
                  left_join(., day0, by = 'State') %>% 
                  mutate(Date = Date - Day0), 
                type = Deaths,
                top_states = top_n,
                ylab = 'Deaths',
                xlab = paste0('Days Past 5 Deaths\n\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                title = 'COVID-19 Deaths by US State') +
  scale_x_continuous() + 
  coord_trans(y = 'log10') +
  scale_y_continuous(breaks = c(10,25, 50, 100, 250, 500, 1000))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Add exponential lines

For deaths doubling every 2,and 4 days

# function to calculate cases/etc over user given double times (in days) and start values
growth_maker = function(start_day = 0, start_val = 5, doubles_every_n_days = 2, end_val = 5000){
  out <- cbind(start_day, start_val)
  vals = start_val
  days = start_day
  while(vals < end_val){
    vals = vals * 2
    days = days + doubles_every_n_days
    out <- rbind(out, 
                 cbind(days, vals))
    
  }
  out <- as_tibble(out)
  colnames(out) <- c('Date', 'Deaths')
  out <- out %>% mutate(Date = as.difftime(Date, units = 'days'), 
                        'State' = paste0('doubles ', doubles_every_n_days, ' days'))
  out
}

doubling_data = bind_rows(growth_maker(start_val = 10, end_val = 2000, doubles_every_n_days = 2),
                          growth_maker(start_val = 5, end_val = 2000, doubles_every_n_days = 4))

covid19_plotter(covid_usa %>% 
                  filter(State %in% day0$State,
                         Deaths >= 5) %>% 
                  left_join(., day0, by = 'State') %>% 
                  mutate(Date = Date - Day0), 
                type = Deaths,
                top_states = top_n,
                ylab = 'Deaths',
                xlab = paste0('Days Past 5 Deaths\n\nData Pulled ', Sys.Date(), ' from Johns Hopkins CSSE'),
                title = 'COVID-19 Deaths by US State') +
  scale_x_continuous() + 
  scale_y_log10(breaks = c(10,30,100,300,1000)) +
  geom_line(data = doubling_data,             
            aes(x=Date, y=Deaths, 
                group = State, 
                color = NULL,
                State = NULL), 
            alpha = 0.2) +
  geom_text_repel(data = doubling_data %>% 
                    group_by(State) %>% 
                    top_n(n = 1, wt = Deaths),
                  aes(x = Date, 
                      y = Deaths, 
                      label = State), color = 'gray')
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Related

comments powered by Disqus