How to use the package

James Hay

2017-12-07

Introduction

This vignette describes the work flow for analysis using [zikaInfer] in detail. Users should be able to follow each section to understand how the cod relates to the methods described in the accompanying methods vignette.

Input parameters

The first step in using this package is to define the set of model parameters. An example table of parameters is provided in the package and can be viewed using the following:

data(exampleParTab)

## Extract parameter values to solve model. Note that 
## these parameters MUST be named
pars <- exampleParTab$values
names(pars) <- exampleParTab$names

Refer to the documentation of ?exampleParTab for a description of the parameter table columns and their uses. Similarly, users can use the parTabSetup function to generate a new parameter table from scratch.

Transmission model

The transmission component of the model is an SEIR model with mosquito vectors, described here. The model can be solved as follows:

## Generate starting population sizes based on human population size
## and mosquito density
y0s <- generate_y0s(pars["N_H"], pars["density"],iniI=10)
## Times to solve model over. Note that the unit of time is in days
ts <- seq(0,3003,by=1)
y <- solveSEIRModel_rlsoda(ts, y0s, pars, TRUE)
plot(y[,"I_H"],type='l',col="red",
     xlab="Time (days)", ylab="ZIKV incidence in humans")

This SEIR model is then used to generate a per-capita risk of infection at any given time within the generate_probM function. Note that there is a suite of accompanying functions, for example:

## Calculate R0
print(r0.calc(pars))
## [1] 2.098408
## Find the mosquito density required for a desired R0
print(density.calc(pars, R0=3))
## [1] 4.288966
print(calculate_AR(r0=3))
## [1] 0.9404798

The gestational risk model

There are four implemented risk models that describe the risk of developing a congenital abnormality given infection in a particular gestational week. These are numbered 1-4 (print_version_names()). The generate_micro_curve function will automatically detect which version of the risk curve the user requires based on the parameter names:

risk <- generate_micro_curve(pars)
plot(risk,type='l',col="blue",
     xlab="Gestational time at infection (days)",ylab="Prob of congenital abnormality")

The main version used in the accompanying analyses is model version 1, the gamma risk model.

Combined model

Taken together, the per capita incidence model and gestational risk model are combined to give the expected proportion of microcephaly affected births observed in a given day.

probM <- generate_probM(y[,"I_M"],pars["N_H"],risk,pars["b"],pars["p_MH"],pars["baselineProb"],1)
plot(probM,type='l',col="green",
     xlab="Time (days)",ylab="Proportion of microcephaly affected births")

Simulate data

Incidence and microcephaly incidence data can be simulated using the generate_multiple_data function. The idea is to generate fake data with known parameters that represent actual observed data:

library(ggplot2)

## Generate simulated data with known parameters
simDat <- generate_multiple_data(ts, parTab=exampleParTab, weeks=FALSE, 
                                  dataRangeMicro=c(600,2000),dataRangeInc=c(600,1500), 
                                  noise=FALSE,peakTimeRange=60)
microDat <- simDat[[1]]
incDat <- simDat[[2]]
peakTimes <- simDat[[3]]

## Save simulated data
write.table(microDat,"sim_microDat.csv",sep=",",row.names=FALSE)
write.table(incDat,"sim_incDat.csv",sep=",",row.names=FALSE)

## Check the generated data
ggplot(incDat) + 
  geom_line(aes(x=startDay,y=inc/N_H), col="red") +
  geom_line(data=microDat,aes(x=startDay,y=microCeph*0.001/births),col="blue") + 
  facet_wrap(~local) +
  ylab("Per capita ZIKV incidence (red)") +
  xlab("Time (days)") +
  scale_y_continuous(sec.axis=sec_axis(~.*1000,name="Per birth microcephaly\n incidence (blue)")) +
  theme_bw()

Re-estimate model parameters

Parameter estimation is done using the lazymcmc package as follows:

library(lazymcmc)
## Generate random starting points for MCMC chain
## Note that we have to constrain the starting points for R0 and the epidemic seed time
## such that trajectory is near expected peak time
startTab <- generateStartingParTab(exampleParTab, peakTimes, restrictedR0=TRUE,"")

## MCMC control parameters
mcmcPars <- c("adaptive_period"=10000,"iterations"=20000,"opt_freq"=1000,"thin"=10,"save_block"=100,"popt"=0.44)

## Run MCMC chain using univariate sampler
result <- lazymcmc::run_MCMC(parTab=startTab, data=microDat, mcmcPars=mcmcPars,filename="test_univariate",
                    CREATE_POSTERIOR_FUNC = create_posterior, mvrPars=NULL,PRIOR_FUNC=NULL,
                    OPT_TUNING=0.2, ts=ts,incDat=incDat,peakTimes=NULL)
## Use these results to get covariance matrix to run multivariate sampler
chain <- read.csv(result$file)
chain <- chain[chain$sampno >= mcmcPars["adaptive_period"],]
covMat <- cov(chain[,2:(ncol(chain)-1)])
startTab$values <- get_best_pars(chain)
mvrPars <- list(covMat,2.38/sqrt(nrow(startTab[startTab$fixed==0,])),w=0.8)
mcmcPars <- c("adaptive_period"=10000,"iterations"=20000,"opt_freq"=1000,"thin"=10,"save_block"=100,"popt"=0.234)

## Run MCMC chain using multivariate sampler
## Run two chains to calculate gelman diagnostics
final <- lazymcmc::run_MCMC(parTab=startTab, data=microDat, mcmcPars=mcmcPars,filename="test_2_multivariate",
                    CREATE_POSTERIOR_FUNC = create_posterior, mvrPars=mvrPars,PRIOR_FUNC=NULL,
                    OPT_TUNING=0.2, ts=ts,incDat=incDat,peakTimes=NULL)
finalB <- lazymcmc::run_MCMC(parTab=startTab, data=microDat, mcmcPars=mcmcPars,filename="test_1_multivariate",
                    CREATE_POSTERIOR_FUNC = create_posterior, mvrPars=mvrPars,PRIOR_FUNC=NULL,
                    OPT_TUNING=0.2, ts=ts,incDat=incDat,peakTimes=NULL)
chain <- read.csv(final$file)
chain <- chain[chain$sampno >= mcmcPars["adaptive_period"],]

Assess chain convergence

The MCMC chain should then be checked for correct convergence using the functions available in [lazymcmc], which uses the [coda] package.

library(data.table)
library(coda)
# Function to read in MCMC chains if possible and then calculate convergence diagnostics
read_and_check <- function(wd, parTab, burnin){
  chain <- lazymcmc::load_mcmc_chains(wd, parTab, TRUE,1,burnin, TRUE,FALSE,FALSE)
  if(length(chain[[1]]) > 1){
    ess <- ess_diagnostics(chain[[1]],200)
    gelman <- gelman_diagnostics(chain[[1]], 1.1)
    minESS <- min(ess$ESS)
    whichMinESS <- names(which.min(ess$ESS))
    maxGelman <- gelman$WorstGelman[1]
    maxGelmanName <- gelman$WorstGelman[2]
    mpsrf <- gelman$WorstGelman[3]
    rerun <- gelman$Rerun | minESS < 200
  } else {
    minESS <- NA
    whichMinESS <- NA
    maxGelman <- NA
    maxGelmanName <- NA
    mpsrf <- NA
    rerun <- TRUE
  }
  return(list(minESS,whichMinESS,maxGelman,maxGelmanName,mpsrf,rerun))
}

dir <- getwd() ## The working directory that the MCMC chain was run
## The parameter table used in this run will 
## be saved in the working directory
res <- read_and_check(dir, startTab, mcmcPars["adaptive_period"])
## [1] "/home/james/Documents/Zika/zikaInfer/vignettes/test_1_multivariate_chain.csv"
## [2] "/home/james/Documents/Zika/zikaInfer/vignettes/test_2_multivariate_chain.csv"
## [[1]]
## [1] 2001
## 
## [[2]]
## [1] 2001
print(res)
## [[1]]
## [1] 968.7675
## 
## [[2]]
## [1] "lnlike"
## 
## [[3]]
##        Worst_PSRF 
## "1.1021291950287" 
## 
## [[4]]
## Which_worst 
##  "incPropn" 
## 
## [[5]]
##              MPSRF 
## "1.02155723716588" 
## 
## [[6]]
## [1] TRUE

Plot inferred microcephaly risk curve

plot_random_microceph_curves(chain,100)

indiv_model_fit("sim_microDat.csv","sim_incDat.csv", "bahia","Simulated data",0.001,200,0.03, 500, FALSE, TRUE, startTab, chain, FALSE, FALSE)
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 17 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 17 rows containing missing values (geom_point).