2020.03.04 CoV seasonality - for Git - v01.R

### 1. PREP: Download packages --------------------------------------------------------

if(!require('tidyverse')) install.packages('tidyverse'); library(tidyverse)
if(!require('readxl')) install.packages('readxl'); library(readxl)
if(!require('lubridate')) install.packages('lubridate'); library(lubridate)
if(!require('ggpubr')) install.packages('ggpubr'); library(ggpubr)
if(!require('gridExtra')) install.packages('gridExtra'); library(gridExtra)
if(!require('splines')) install.packages('splines'); library(splines)
if(!require('EnvStats')) install.packages('EnvStats'); library(EnvStats) #for 'geoMean' function
if(!require('lmtest')) install.packages('lmtest'); library(lmtest) #to calculate robust standard errors
if(!require('sandwich')) install.packages('sandwich'); library(sandwich) #to calculate robust standard errors

### 1. PREP: Read in data --------------------------------------------------------

# Data below shared by NREVSS team
df.us_cov_national <- read.csv("Corona4PP_Nat.csv") #2018-03-10 through 2020-02-29

# Data below from FluView
df.us_ili <- read.csv("ILINet.csv", header=TRUE, stringsAsFactors=FALSE) #1997 week 40 through 2020 week 7

### 1. PREP: Initialize key variables ----------------------------------------------

list.CoVs <- c("CoVHKU1", "CoVNL63", "CoVOC43", "CoV229E")

### 2. CLEANING: Create key linking week start date and epidemiological week/year ----
# CDC epidemiological weeks start on Sunday and end on Saturday
# Create list of Sundays from 1996-2020, then use lubridate to pull CDC week and year

key.epi_dates <- data.frame(Week_start=(as.Date("1996-12-29") + seq(0, 52*(2020-1996)*7, by=7))) %>% # Create 'Week_start' dates as every Sunday through 2020
  mutate(epi_year=epiyear(Week_start), # Use lubridate function to get CDC week and year
         epi_week=epiweek (Week_start)) %>%
  mutate(season=case_when(
    Week_start < "2018-07-01" ~ 0, # First season (and only complete season) in this dataset is 2018-19
    (Week_start >= "2018-07-01") & (Week_start < "2019-07-01") ~ 1,
    (Week_start >= "2019-07-01") & (Week_start < "2020-07-01") ~ 2))  # 2018-2019 is the last season in our data

### 2. CLEANING: National flu dataset -----------------------------------------------

df.flu_national <- df.us_ili %>%
  select(-c(starts_with("REGION"), starts_with("AGE"), X.UNWEIGHTED.ILI, NUM..OF.PROVIDERS)) %>% #Remove unnecessary columns
  rename("epi_year"="YEAR", "epi_week"="WEEK", "ili.wt_pct"="X..WEIGHTED.ILI", "ili_visit.count"="ILITOTAL", "all_visit.count"="TOTAL.PATIENTS") %>% 
  left_join(key.epi_dates, by=c("epi_year", "epi_week")) #Pull in 'Week_start' dates

### 2. CLEANING: National CoV dataset -----------------------------------------------

# Create ntests column in national dataframe by summing over all tests in regional dataframe
df.us_cov_national <- df.us_cov_national %>% 
  mutate(RepWeekDate=as.Date(RepWeekDate, "%m/%d/%y")) %>% 
  mutate(Week_start = as.Date(RepWeekDate)-6) %>% #Date given is week end date (Sat); create new variable for week start date (Sun)
  select(-c("RepWeekDate")) %>% 
  rename("CoVNL63_pos.pct"="CoVNL63",
         "CoVOC43_pos.pct"="CoVOC43",
         "CoV229E_pos.pct"="CoV229E",
         "CoVHKU1_pos.pct"="CoVHKU1") #Create columns for weekly percent positive by strain
  
### 2. CLEANING: Merge national CoV and flu datasets ------------------------------------

df.us_all_national <- df.flu_national %>%
  full_join(df.us_cov_national, by=c("Week_start")) %>%
  mutate(CoVNL63_ili_x_pos_pct=CoVNL63_pos.pct*ili.wt_pct,
         CoVOC43_ili_x_pos_pct=CoVOC43_pos.pct*ili.wt_pct,
         CoV229E_ili_x_pos_pct=CoV229E_pos.pct*ili.wt_pct,
         CoVHKU1_ili_x_pos_pct=CoVHKU1_pos.pct*ili.wt_pct) %>%  #Create ili weighted percent * proportion testing positive for CoV (incidence proxy based on Ed Goldstein's work)
  filter(season!=0) %>% 
  arrange(Week_start) 

### 3. CALCULATE R: Initialize serial intervals ------------------------------------------

## SARS serial interval
# Assume SARS generation interval distribution
# Based on data from Singapore, Weibull with mean 8.4 days and sd 3.8 days
# This mean and sd imply the following shape and scale parameters:
SARS_shape <- 2.35
SARS_scale <- 9.48
SARS_stop <- ceiling(qweibull(0.99, shape=SARS_shape, scale=SARS_scale)) # Set max generation interval as first day that captures >99% of the density

## nCoV serial interval from Li, et al. (NEJM, 2020)
# Generated by fitting a gamma distribution to data from cluster investigations
# Used the serial interval for SARS as an informative prior
# nCoV data available from 6 cases in 5 clusters
# Mean = shape*scale; variance = shape*scale^2
# Reported mean of 7.5 days, reported SD of 3.4 days
nCoV_Li_shape <- 4.87
nCoV_Li_scale <- 1.54
nCoV_Li_stop <- ceiling(qgamma(0.99, shape=nCoV_Li_shape, scale=nCoV_Li_scale)) # Set max generation interval as first day that captures >99% of the density

## nCoV serial interval from Nishiura, et al. (medRxiv, 2020)
# Generated by fitting a model to 18 "certain" pairs from published research articles and case investigation reports
# Accounted for right truncation
# Reported median of 4.6 days, mean of 4.8 days and SD of 2.3 days under a Weibull distribution
nCoV_Nishiura_shape <- 2.203391
nCoV_Nishiura_scale <- 5.419875
nCoV_Nishiura_stop <- ceiling(qweibull(0.99, shape=nCoV_Nishiura_shape, scale=nCoV_Nishiura_scale)) # Set max generation interval as first day that captures >99% of the density

## Write function to pull value of serial interval distribution
func.SI_pull <- function(value, serial_int){
  if(serial_int == "SARS"){
    return(dweibull(value, shape=SARS_shape, scale=SARS_scale))
  } else if(serial_int == "nCoV_Li"){
    return(dgamma(value, shape=nCoV_Li_shape, scale=nCoV_Li_scale))
  } else if(serial_int == "nCoV_Nishiura"){
    return(dweibull(value, shape=nCoV_Nishiura_shape, scale=nCoV_Nishiura_scale))
  }
}

### 3. CALCULATE R: Write function to do Wallinga-Teunis -------------------------------
# Adapted from code from te Beest, et al.

func.WT <- function(week_list, daily_inc, proxy_name, org_list, serial_int){
  
  df.Reff_results <- data.frame(Week_start=week_list) # Initialize dataframe to hold results
  stop <- get(paste0(serial_int, "_stop"))
  
  # Estimate daily R effective
  for(v in org_list){ #For each strain
    RDaily <- numeric()
    for(u in 1:(length(week_list)*7)){ #Estimate for each day
      sumt <- 0
      for(t in u:(u+stop)){ #Look ahead starting at day u through (u+max SI)
        suma <- 0 
        for(a in 0:(stop)){ #Calc denominator, from day t back through (t-max SI)
          suma = daily_inc[t-a,v]*func.SI_pull(a, serial_int) + suma
        } 
        sumt = (daily_inc[t,v]*func.SI_pull(t-u, serial_int))/suma + sumt
      }
      RDaily[u] = sumt
    }
    
    # Take geometric mean over daily values for current week, and one week before + after (for smoothing), to get weekly estimates
    RWeekly <- numeric()
    for(i in 1:length(week_list)) RWeekly[i] <- geoMean(as.numeric(RDaily[max(1,7*(i-2)+1):min(length(RDaily),7*(i+1))]), na.rm=TRUE)
    RWeekly[1:3] <- NA #Set results for first 3 weeks to NA (unreliable using WT)
    RWeekly[(length(RWeekly)-2):length(RWeekly)] <- NA #Set results for last 3 weeks to NA (unreliable using WT)
    df.Reff_results[,paste0(v,"_R_",proxy_name)] <- RWeekly
  }
  
  return(df.Reff_results)
}

### 3. CALCULATE R: Write function to fit spline to estimate daily incidence -----------------

# Implement spline method described in te Beest, et al.
# Turns weekly incidence proxy into estimated daily incidence values
func.daily_inc_spline <- function(week_list, weekly_inc){
  temp.week_count <- length(week_list)
  
  temp.cum_weekly_inc <- c(0,cumsum(weekly_inc)) #Calculate cumulative weekly sums
  temp.spline <- spline(c(0:temp.week_count)*7, temp.cum_weekly_inc, n=temp.week_count*7+1) # Fit a spline to the cumulative incidences by week (# of data points = # of weeks) and then interpolate by day
  temp.daily_inc <- diff((temp.spline$y)) # Using interpolation for cumulative incidence by day, use 'diff' to back-calc daily incidence
  temp.daily_inc[temp.daily_inc<0] <- 0.0000001
  
  return(temp.daily_inc)
}

### 3. CALCULATE R: CoV, national, %ILI * %-positive for each strain ----------------------------

CoV_ili_x_pos_pct_daily <- data.frame(CoVNL63=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVNL63_ili_x_pos_pct),
                                      CoVOC43=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVOC43_ili_x_pos_pct),
                                      CoV229E=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoV229E_ili_x_pos_pct),
                                      CoVHKU1=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVHKU1_ili_x_pos_pct))  

Reff.CoV_ili_x_pos_pct_SARS <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_SARS", list.CoVs, "SARS")
Reff.CoV_ili_x_pos_pct_nCoV_Li <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_nCoV_Li", list.CoVs, "nCoV_Li")
Reff.CoV_ili_x_pos_pct_nCoV_Nishiura <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_nCoV_Nishiura", list.CoVs, "nCoV_Nishiura")

### 3. CALCULATE R: Merge R estimates with national dataset -----------------------------

df.us_all_national_withR <- df.us_all_national %>% 
  left_join(Reff.CoV_ili_x_pos_pct_SARS, by="Week_start") %>% 
  left_join(Reff.CoV_ili_x_pos_pct_nCoV_Li, by="Week_start") %>%
  left_join(Reff.CoV_ili_x_pos_pct_nCoV_Nishiura, by="Week_start")

### 4. REGRESSION: INITIALIZE parameters for regression --------------------------------

serial_int.for_model <- "SARS"
inc_proxy.for_model <- "ili_x_pos_pct"
R_inc_proxy.for_model <- "ili_x_pos_pct"
season_start <- 40 #epi_week of season start
season_end <- 20 #epi_week of season end (in the following year)

### 4. REGRESSION: Prepare dataset for regression --------------------------------------

# Always start counting weeks in the season at the same time
# Ending week may differ depending on whether years have 53 weeks
season_week.count <- (52-season_start) + season_end + 1 #Number of weeks in season

# Renumber weeks based on start of season
# Also remove weeks that are not "in season" for regression models
df.us_all_national_withR.in_seas <- data.frame()
for(s in 1:2){
  temp.df <- df.us_all_national_withR %>% filter(season==s, epi_week>=season_start | epi_week<=season_end) 
  temp.df <- bind_cols(temp.df, season_week=c(1:min(season_week.count,nrow(temp.df)), rep(NA,max(nrow(temp.df)-season_week.count,0)))) %>% drop_na(season_week)
  df.us_all_national_withR.in_seas <- bind_rows(df.us_all_national_withR.in_seas, temp.df)
}

### 4. REGRESSION: Create long dataframe for specified incidence proxy -----------

df.inc_proxy.for_model <- df.us_all_national_withR.in_seas %>%
  select(Week_start, epi_year, epi_week, season, season_week, ends_with(inc_proxy.for_model)) %>% 
  select(-c(ends_with(paste0("R_", inc_proxy.for_model)))) %>% #Remove R for flu
  pivot_longer(
    cols=contains(inc_proxy.for_model), 
    names_to="strain",
    values_to="inc_proxy") %>%
  rowwise() %>% 
  mutate(strain=substr(strain,1,unlist(gregexpr("_",strain))[1]-1)) #Change name to just be the name of the strain

### 4. REGRESSION: Create long dataframe for R for specified incidence proxy -----------

df.R_inc_proxy.for_model <- df.us_all_national_withR.in_seas %>%
  select(Week_start, ends_with(paste0("R_",R_inc_proxy.for_model)), ends_with(paste0("R_",R_inc_proxy.for_model,"_",serial_int.for_model))) %>% 
  pivot_longer(
    cols=contains(paste0("_R_", R_inc_proxy.for_model)), 
    names_to="strain",
    values_to="R_inc_proxy") %>%
  rowwise() %>% 
  mutate(strain=substr(strain,1,unlist(gregexpr("_",strain))[1]-1)) #Change name to just be the name of the strain

### 4. REGRESSION: Calculate depletion of susceptibles -----------------------------------

## Calculate depletion of susceptibles by strain
df.depletion.strain <- df.inc_proxy.for_model %>%
  group_by(season, strain) %>%
  mutate(depletion=cumsum(inc_proxy)) %>%
  ungroup() %>% 
  select(Week_start, strain, depletion) %>% 
  mutate(opp_strain=case_when( 
    strain == "CoVHKU1" ~ "CoVOC43",
    strain == "CoVNL63" ~ "CoV229E",
    strain == "CoV229E" ~ "CoVNL63",
    strain == "CoVOC43" ~ "CoVHKU1")) %>%  # Assign "opposite" strain
  mutate(subgroup=case_when(
    strain == "CoVHKU1" ~ "beta",
    strain == "CoVNL63" ~ "alpha",
    strain == "CoV229E" ~ "alpha",
    strain == "CoVOC43" ~ "beta")) # Assign subgroup

## Calculate depletion of susceptibles by subgroup
df.depletion.subgroup <- df.depletion.strain %>%
  group_by(Week_start, subgroup) %>% 
  summarise(depletion=sum(depletion)) %>% 
  mutate(opp_subgroup=case_when(
    subgroup == "alpha" ~ "beta",
    subgroup == "beta" ~ "alpha")) # Assign "opposite" subgroup

### 4. REGRESSION: Create combined and strain-level datasets ----------------------------------------------------------

df.for_model <- df.inc_proxy.for_model %>% 
  left_join(df.R_inc_proxy.for_model, by=c("Week_start", "strain")) %>% 
  left_join(df.depletion.strain %>% rename("depletion.same_strain"="depletion"), by=c("Week_start", "strain")) %>% 
  left_join(df.depletion.strain %>% select(-c(strain, subgroup)) %>% rename("depletion.opp_strain"="depletion"), by=c("Week_start", "strain"="opp_strain")) %>%
  left_join(df.depletion.subgroup %>% rename("depletion.same_subgroup"="depletion"), by=c("Week_start", "subgroup")) %>% 
  left_join(df.depletion.subgroup %>% select(-c(subgroup)) %>% rename("depletion.opp_subgroup"="depletion"), by=c("Week_start", "subgroup"="opp_subgroup")) %>% 
  drop_na(R_inc_proxy) %>% 
  mutate(season_name=case_when(
    season==1 ~ "2018-19",
    season==2 ~ "2019-20")) %>% 
  mutate(season=factor(season, levels=c("1", "2"))) #Set season 1 as reference group in regression
# Note: with this limited dataset, season 2 is incomplete. Full dataset has 5 complete seasons.

## Create separate dataframes by strain and subgroup
df.for_model.beta <- df.for_model %>% filter(subgroup=="beta")

### 4. REGRESSION: Run model ----------------------------------------------------------

# Estimate model using limited data
df.for_model.beta$strain <- factor(df.for_model.beta$strain, levels=c("CoVHKU1", "CoVOC43")) #Set reference strain (first strain listed)
model.beta <- lm(log(R_inc_proxy) ~ season*strain + strain*depletion.same_strain + strain*depletion.opp_strain + bs(season_week, degree=3, knots=(c(1:7)*4+1), Boundary.knots=c(1,33)), data=df.for_model.beta)
model.beta.robust.vcov <- vcovHC(model.beta, type="HC3") #Calculate robust variance-covariance matrix
model.beta.robust <- coeftest(model.beta, vcov=model.beta.robust.vcov)