-
Notifications
You must be signed in to change notification settings - Fork 6
/
2020.03.04 CoV seasonality - for Git - v01.R
270 lines (219 loc) · 14.5 KB
/
2020.03.04 CoV seasonality - for Git - v01.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
### 1. PREP: Download packages --------------------------------------------------------
if(!require('tidyverse')) install.packages('tidyverse'); library(tidyverse)
if(!require('readxl')) install.packages('readxl'); library(readxl)
if(!require('lubridate')) install.packages('lubridate'); library(lubridate)
if(!require('ggpubr')) install.packages('ggpubr'); library(ggpubr)
if(!require('gridExtra')) install.packages('gridExtra'); library(gridExtra)
if(!require('splines')) install.packages('splines'); library(splines)
if(!require('EnvStats')) install.packages('EnvStats'); library(EnvStats) #for 'geoMean' function
if(!require('lmtest')) install.packages('lmtest'); library(lmtest) #to calculate robust standard errors
if(!require('sandwich')) install.packages('sandwich'); library(sandwich) #to calculate robust standard errors
### 1. PREP: Read in data --------------------------------------------------------
# Data below shared by NREVSS team
df.us_cov_national <- read.csv("Corona4PP_Nat.csv") #2018-03-10 through 2020-02-29
# Data below from FluView
df.us_ili <- read.csv("ILINet.csv", header=TRUE, stringsAsFactors=FALSE) #1997 week 40 through 2020 week 7
### 1. PREP: Initialize key variables ----------------------------------------------
list.CoVs <- c("CoVHKU1", "CoVNL63", "CoVOC43", "CoV229E")
### 2. CLEANING: Create key linking week start date and epidemiological week/year ----
# CDC epidemiological weeks start on Sunday and end on Saturday
# Create list of Sundays from 1996-2020, then use lubridate to pull CDC week and year
key.epi_dates <- data.frame(Week_start=(as.Date("1996-12-29") + seq(0, 52*(2020-1996)*7, by=7))) %>% # Create 'Week_start' dates as every Sunday through 2020
mutate(epi_year=epiyear(Week_start), # Use lubridate function to get CDC week and year
epi_week=epiweek (Week_start)) %>%
mutate(season=case_when(
Week_start < "2018-07-01" ~ 0, # First season (and only complete season) in this dataset is 2018-19
(Week_start >= "2018-07-01") & (Week_start < "2019-07-01") ~ 1,
(Week_start >= "2019-07-01") & (Week_start < "2020-07-01") ~ 2)) # 2018-2019 is the last season in our data
### 2. CLEANING: National flu dataset -----------------------------------------------
df.flu_national <- df.us_ili %>%
select(-c(starts_with("REGION"), starts_with("AGE"), X.UNWEIGHTED.ILI, NUM..OF.PROVIDERS)) %>% #Remove unnecessary columns
rename("epi_year"="YEAR", "epi_week"="WEEK", "ili.wt_pct"="X..WEIGHTED.ILI", "ili_visit.count"="ILITOTAL", "all_visit.count"="TOTAL.PATIENTS") %>%
left_join(key.epi_dates, by=c("epi_year", "epi_week")) #Pull in 'Week_start' dates
### 2. CLEANING: National CoV dataset -----------------------------------------------
# Create ntests column in national dataframe by summing over all tests in regional dataframe
df.us_cov_national <- df.us_cov_national %>%
mutate(RepWeekDate=as.Date(RepWeekDate, "%m/%d/%y")) %>%
mutate(Week_start = as.Date(RepWeekDate)-6) %>% #Date given is week end date (Sat); create new variable for week start date (Sun)
select(-c("RepWeekDate")) %>%
rename("CoVNL63_pos.pct"="CoVNL63",
"CoVOC43_pos.pct"="CoVOC43",
"CoV229E_pos.pct"="CoV229E",
"CoVHKU1_pos.pct"="CoVHKU1") #Create columns for weekly percent positive by strain
### 2. CLEANING: Merge national CoV and flu datasets ------------------------------------
df.us_all_national <- df.flu_national %>%
full_join(df.us_cov_national, by=c("Week_start")) %>%
mutate(CoVNL63_ili_x_pos_pct=CoVNL63_pos.pct*ili.wt_pct,
CoVOC43_ili_x_pos_pct=CoVOC43_pos.pct*ili.wt_pct,
CoV229E_ili_x_pos_pct=CoV229E_pos.pct*ili.wt_pct,
CoVHKU1_ili_x_pos_pct=CoVHKU1_pos.pct*ili.wt_pct) %>% #Create ili weighted percent * proportion testing positive for CoV (incidence proxy based on Ed Goldstein's work)
filter(season!=0) %>%
arrange(Week_start)
### 3. CALCULATE R: Initialize serial intervals ------------------------------------------
## SARS serial interval
# Assume SARS generation interval distribution
# Based on data from Singapore, Weibull with mean 8.4 days and sd 3.8 days
# This mean and sd imply the following shape and scale parameters:
SARS_shape <- 2.35
SARS_scale <- 9.48
SARS_stop <- ceiling(qweibull(0.99, shape=SARS_shape, scale=SARS_scale)) # Set max generation interval as first day that captures >99% of the density
## nCoV serial interval from Li, et al. (NEJM, 2020)
# Generated by fitting a gamma distribution to data from cluster investigations
# Used the serial interval for SARS as an informative prior
# nCoV data available from 6 cases in 5 clusters
# Mean = shape*scale; variance = shape*scale^2
# Reported mean of 7.5 days, reported SD of 3.4 days
nCoV_Li_shape <- 4.87
nCoV_Li_scale <- 1.54
nCoV_Li_stop <- ceiling(qgamma(0.99, shape=nCoV_Li_shape, scale=nCoV_Li_scale)) # Set max generation interval as first day that captures >99% of the density
## nCoV serial interval from Nishiura, et al. (medRxiv, 2020)
# Generated by fitting a model to 18 "certain" pairs from published research articles and case investigation reports
# Accounted for right truncation
# Reported median of 4.6 days, mean of 4.8 days and SD of 2.3 days under a Weibull distribution
nCoV_Nishiura_shape <- 2.203391
nCoV_Nishiura_scale <- 5.419875
nCoV_Nishiura_stop <- ceiling(qweibull(0.99, shape=nCoV_Nishiura_shape, scale=nCoV_Nishiura_scale)) # Set max generation interval as first day that captures >99% of the density
## Write function to pull value of serial interval distribution
func.SI_pull <- function(value, serial_int){
if(serial_int == "SARS"){
return(dweibull(value, shape=SARS_shape, scale=SARS_scale))
} else if(serial_int == "nCoV_Li"){
return(dgamma(value, shape=nCoV_Li_shape, scale=nCoV_Li_scale))
} else if(serial_int == "nCoV_Nishiura"){
return(dweibull(value, shape=nCoV_Nishiura_shape, scale=nCoV_Nishiura_scale))
}
}
### 3. CALCULATE R: Write function to do Wallinga-Teunis -------------------------------
# Adapted from code from te Beest, et al.
func.WT <- function(week_list, daily_inc, proxy_name, org_list, serial_int){
df.Reff_results <- data.frame(Week_start=week_list) # Initialize dataframe to hold results
stop <- get(paste0(serial_int, "_stop"))
# Estimate daily R effective
for(v in org_list){ #For each strain
RDaily <- numeric()
for(u in 1:(length(week_list)*7)){ #Estimate for each day
sumt <- 0
for(t in u:(u+stop)){ #Look ahead starting at day u through (u+max SI)
suma <- 0
for(a in 0:(stop)){ #Calc denominator, from day t back through (t-max SI)
suma = daily_inc[t-a,v]*func.SI_pull(a, serial_int) + suma
}
sumt = (daily_inc[t,v]*func.SI_pull(t-u, serial_int))/suma + sumt
}
RDaily[u] = sumt
}
# Take geometric mean over daily values for current week, and one week before + after (for smoothing), to get weekly estimates
RWeekly <- numeric()
for(i in 1:length(week_list)) RWeekly[i] <- geoMean(as.numeric(RDaily[max(1,7*(i-2)+1):min(length(RDaily),7*(i+1))]), na.rm=TRUE)
RWeekly[1:3] <- NA #Set results for first 3 weeks to NA (unreliable using WT)
RWeekly[(length(RWeekly)-2):length(RWeekly)] <- NA #Set results for last 3 weeks to NA (unreliable using WT)
df.Reff_results[,paste0(v,"_R_",proxy_name)] <- RWeekly
}
return(df.Reff_results)
}
### 3. CALCULATE R: Write function to fit spline to estimate daily incidence -----------------
# Implement spline method described in te Beest, et al.
# Turns weekly incidence proxy into estimated daily incidence values
func.daily_inc_spline <- function(week_list, weekly_inc){
temp.week_count <- length(week_list)
temp.cum_weekly_inc <- c(0,cumsum(weekly_inc)) #Calculate cumulative weekly sums
temp.spline <- spline(c(0:temp.week_count)*7, temp.cum_weekly_inc, n=temp.week_count*7+1) # Fit a spline to the cumulative incidences by week (# of data points = # of weeks) and then interpolate by day
temp.daily_inc <- diff((temp.spline$y)) # Using interpolation for cumulative incidence by day, use 'diff' to back-calc daily incidence
temp.daily_inc[temp.daily_inc<0] <- 0.0000001
return(temp.daily_inc)
}
### 3. CALCULATE R: CoV, national, %ILI * %-positive for each strain ----------------------------
CoV_ili_x_pos_pct_daily <- data.frame(CoVNL63=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVNL63_ili_x_pos_pct),
CoVOC43=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVOC43_ili_x_pos_pct),
CoV229E=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoV229E_ili_x_pos_pct),
CoVHKU1=func.daily_inc_spline(df.us_all_national$Week_start, df.us_all_national$CoVHKU1_ili_x_pos_pct))
Reff.CoV_ili_x_pos_pct_SARS <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_SARS", list.CoVs, "SARS")
Reff.CoV_ili_x_pos_pct_nCoV_Li <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_nCoV_Li", list.CoVs, "nCoV_Li")
Reff.CoV_ili_x_pos_pct_nCoV_Nishiura <- func.WT(df.us_all_national$Week_start, CoV_ili_x_pos_pct_daily, "ili_x_pos_pct_nCoV_Nishiura", list.CoVs, "nCoV_Nishiura")
### 3. CALCULATE R: Merge R estimates with national dataset -----------------------------
df.us_all_national_withR <- df.us_all_national %>%
left_join(Reff.CoV_ili_x_pos_pct_SARS, by="Week_start") %>%
left_join(Reff.CoV_ili_x_pos_pct_nCoV_Li, by="Week_start") %>%
left_join(Reff.CoV_ili_x_pos_pct_nCoV_Nishiura, by="Week_start")
### 4. REGRESSION: INITIALIZE parameters for regression --------------------------------
serial_int.for_model <- "SARS"
inc_proxy.for_model <- "ili_x_pos_pct"
R_inc_proxy.for_model <- "ili_x_pos_pct"
season_start <- 40 #epi_week of season start
season_end <- 20 #epi_week of season end (in the following year)
### 4. REGRESSION: Prepare dataset for regression --------------------------------------
# Always start counting weeks in the season at the same time
# Ending week may differ depending on whether years have 53 weeks
season_week.count <- (52-season_start) + season_end + 1 #Number of weeks in season
# Renumber weeks based on start of season
# Also remove weeks that are not "in season" for regression models
df.us_all_national_withR.in_seas <- data.frame()
for(s in 1:2){
temp.df <- df.us_all_national_withR %>% filter(season==s, epi_week>=season_start | epi_week<=season_end)
temp.df <- bind_cols(temp.df, season_week=c(1:min(season_week.count,nrow(temp.df)), rep(NA,max(nrow(temp.df)-season_week.count,0)))) %>% drop_na(season_week)
df.us_all_national_withR.in_seas <- bind_rows(df.us_all_national_withR.in_seas, temp.df)
}
### 4. REGRESSION: Create long dataframe for specified incidence proxy -----------
df.inc_proxy.for_model <- df.us_all_national_withR.in_seas %>%
select(Week_start, epi_year, epi_week, season, season_week, ends_with(inc_proxy.for_model)) %>%
select(-c(ends_with(paste0("R_", inc_proxy.for_model)))) %>% #Remove R for flu
pivot_longer(
cols=contains(inc_proxy.for_model),
names_to="strain",
values_to="inc_proxy") %>%
rowwise() %>%
mutate(strain=substr(strain,1,unlist(gregexpr("_",strain))[1]-1)) #Change name to just be the name of the strain
### 4. REGRESSION: Create long dataframe for R for specified incidence proxy -----------
df.R_inc_proxy.for_model <- df.us_all_national_withR.in_seas %>%
select(Week_start, ends_with(paste0("R_",R_inc_proxy.for_model)), ends_with(paste0("R_",R_inc_proxy.for_model,"_",serial_int.for_model))) %>%
pivot_longer(
cols=contains(paste0("_R_", R_inc_proxy.for_model)),
names_to="strain",
values_to="R_inc_proxy") %>%
rowwise() %>%
mutate(strain=substr(strain,1,unlist(gregexpr("_",strain))[1]-1)) #Change name to just be the name of the strain
### 4. REGRESSION: Calculate depletion of susceptibles -----------------------------------
## Calculate depletion of susceptibles by strain
df.depletion.strain <- df.inc_proxy.for_model %>%
group_by(season, strain) %>%
mutate(depletion=cumsum(inc_proxy)) %>%
ungroup() %>%
select(Week_start, strain, depletion) %>%
mutate(opp_strain=case_when(
strain == "CoVHKU1" ~ "CoVOC43",
strain == "CoVNL63" ~ "CoV229E",
strain == "CoV229E" ~ "CoVNL63",
strain == "CoVOC43" ~ "CoVHKU1")) %>% # Assign "opposite" strain
mutate(subgroup=case_when(
strain == "CoVHKU1" ~ "beta",
strain == "CoVNL63" ~ "alpha",
strain == "CoV229E" ~ "alpha",
strain == "CoVOC43" ~ "beta")) # Assign subgroup
## Calculate depletion of susceptibles by subgroup
df.depletion.subgroup <- df.depletion.strain %>%
group_by(Week_start, subgroup) %>%
summarise(depletion=sum(depletion)) %>%
mutate(opp_subgroup=case_when(
subgroup == "alpha" ~ "beta",
subgroup == "beta" ~ "alpha")) # Assign "opposite" subgroup
### 4. REGRESSION: Create combined and strain-level datasets ----------------------------------------------------------
df.for_model <- df.inc_proxy.for_model %>%
left_join(df.R_inc_proxy.for_model, by=c("Week_start", "strain")) %>%
left_join(df.depletion.strain %>% rename("depletion.same_strain"="depletion"), by=c("Week_start", "strain")) %>%
left_join(df.depletion.strain %>% select(-c(strain, subgroup)) %>% rename("depletion.opp_strain"="depletion"), by=c("Week_start", "strain"="opp_strain")) %>%
left_join(df.depletion.subgroup %>% rename("depletion.same_subgroup"="depletion"), by=c("Week_start", "subgroup")) %>%
left_join(df.depletion.subgroup %>% select(-c(subgroup)) %>% rename("depletion.opp_subgroup"="depletion"), by=c("Week_start", "subgroup"="opp_subgroup")) %>%
drop_na(R_inc_proxy) %>%
mutate(season_name=case_when(
season==1 ~ "2018-19",
season==2 ~ "2019-20")) %>%
mutate(season=factor(season, levels=c("1", "2"))) #Set season 1 as reference group in regression
# Note: with this limited dataset, season 2 is incomplete. Full dataset has 5 complete seasons.
## Create separate dataframes by strain and subgroup
df.for_model.beta <- df.for_model %>% filter(subgroup=="beta")
### 4. REGRESSION: Run model ----------------------------------------------------------
# Estimate model using limited data
df.for_model.beta$strain <- factor(df.for_model.beta$strain, levels=c("CoVHKU1", "CoVOC43")) #Set reference strain (first strain listed)
model.beta <- lm(log(R_inc_proxy) ~ season*strain + strain*depletion.same_strain + strain*depletion.opp_strain + bs(season_week, degree=3, knots=(c(1:7)*4+1), Boundary.knots=c(1,33)), data=df.for_model.beta)
model.beta.robust.vcov <- vcovHC(model.beta, type="HC3") #Calculate robust variance-covariance matrix
model.beta.robust <- coeftest(model.beta, vcov=model.beta.robust.vcov)