Using multiple imputation with regmedint

Kazuki Yoshida

2024-01-06

Missing data is the norm in real-life data analysis. Multiple imputation via the mice package is a popular option in R. Here we introduce simple missingness and demonstrate use of regmedint along with mice.

Missing data generation

For demonstration purpose, missing data is introduced here.

set.seed(138087069)
library(regmedint)
library(tidyverse)
## Prepare dataset
data(vv2015)
vv2015 <- vv2015 %>%
    select(-cens) %>%
    ## Generate exposure-dependent missing of mediator
    mutate(logit_p_m_miss = -1 + 0.5 * x,
           p_m_miss = exp(logit_p_m_miss) / (1 + exp(logit_p_m_miss)),
           ## Indicator draw
           ind_m_miss = rbinom(n = length(p_m_miss), size = 1, prob = p_m_miss),
           m_true = m,
           m = if_else(ind_m_miss == 1L, as.numeric(NA), m))

Truth fit

Taking the advantage of the simulated setting, the true model is fit here.

regmedint_true <-
    regmedint(data = vv2015,
              ## Variables
              yvar = "y",
              avar = "x",
              mvar = "m_true",
              cvar = c("c"),
              eventvar = "event",
              ## Values at which effects are evaluated
              a0 = 0,
              a1 = 1,
              m_cde = 1,
              c_cond = 0.5,
              ## Model types
              mreg = "logistic",
              yreg = "survAFT_weibull",
              ## Additional specification
              interaction = TRUE,
              casecontrol = FALSE)
summary(regmedint_true)
## ### Mediator model
## 
## Call:
## glm(formula = m_true ~ x + c, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.3545     0.3252  -1.090    0.276
## x             0.3842     0.4165   0.922    0.356
## c             0.2694     0.2058   1.309    0.191
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.59  on 99  degrees of freedom
## Residual deviance: 136.08  on 97  degrees of freedom
## AIC: 142.08
## 
## Number of Fisher Scoring iterations: 4
## 
## ### Outcome model
## 
## Call:
## survival::survreg(formula = Surv(y, event) ~ x + m_true + x:m_true + 
##     c, data = data, dist = "weibull")
##               Value Std. Error     z       p
## (Intercept) -1.0424     0.1903 -5.48 4.3e-08
## x            0.4408     0.3008  1.47    0.14
## m_true       0.0905     0.2683  0.34    0.74
## c           -0.0669     0.0915 -0.73    0.46
## x:m_true     0.1003     0.4207  0.24    0.81
## Log(scale)  -0.0347     0.0810 -0.43    0.67
## 
## Scale= 0.966 
## 
## Weibull distribution
## Loglik(model)= -11.4   Loglik(intercept only)= -14.5
##  Chisq= 6.31 on 4 degrees of freedom, p= 0.18 
## Number of Newton-Raphson Iterations: 5 
## n= 100 
## 
## ### Mediation analysis 
##              est         se         Z          p       lower      upper
## cde  0.541070807 0.29422958 1.8389409 0.06592388 -0.03560858 1.11775019
## pnde 0.488930417 0.21049248 2.3227928 0.02019028  0.07637274 0.90148809
## tnie 0.018240025 0.03706111 0.4921608 0.62260566 -0.05439841 0.09087846
## tnde 0.498503455 0.21209540 2.3503737 0.01875457  0.08280410 0.91420281
## pnie 0.008666987 0.02730994 0.3173565 0.75097309 -0.04485951 0.06219348
## te   0.507170442 0.21090051 2.4047853 0.01618197  0.09381303 0.92052785
## pm   0.045436278 0.09119614 0.4982259 0.61832484 -0.13330488 0.22417743
## 
## Evaluated at:
## avar: x
##  a1 (intervened value of avar) = 1
##  a0 (reference value of avar)  = 0
## mvar: m_true
##  m_cde (intervend value of mvar for cde) = 1
## cvar: c
##  c_cond (covariate vector value) = 0.5
## 
## Note that effect estimates can vary over m_cde and c_cond values when interaction = TRUE.

Naive complete case analysis

regmedint_cca <- vv2015 %>%
    filter(!is.na(m)) %>%
    regmedint(data = .,
              ## Variables
              yvar = "y",
              avar = "x",
              mvar = "m",
              cvar = c("c"),
              eventvar = "event",
              ## Values at which effects are evaluated
              a0 = 0,
              a1 = 1,
              m_cde = 1,
              c_cond = 0.5,
              ## Model types
              mreg = "logistic",
              yreg = "survAFT_weibull",
              ## Additional specification
              interaction = TRUE,
              casecontrol = FALSE)
summary(regmedint_cca)
## ### Mediator model
## 
## Call:
## glm(formula = m ~ x + c, family = binomial(link = "logit"), data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -0.2500     0.3880  -0.644    0.519
## x             0.1278     0.4883   0.262    0.794
## c             0.1587     0.2415   0.657    0.511
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 99.758  on 71  degrees of freedom
## Residual deviance: 99.287  on 69  degrees of freedom
## AIC: 105.29
## 
## Number of Fisher Scoring iterations: 3
## 
## ### Outcome model
## 
## Call:
## survival::survreg(formula = Surv(y, event) ~ x + m + x:m + c, 
##     data = data, dist = "weibull")
##               Value Std. Error     z       p
## (Intercept) -1.2689     0.2229 -5.69 1.2e-08
## x            0.7213     0.3315  2.18    0.03
## m            0.4517     0.2985  1.51    0.13
## c           -0.0652     0.1110 -0.59    0.56
## x:m         -0.2579     0.4750 -0.54    0.59
## Log(scale)  -0.0581     0.0958 -0.61    0.54
## 
## Scale= 0.944 
## 
## Weibull distribution
## Loglik(model)= -6.1   Loglik(intercept only)= -10.5
##  Chisq= 8.79 on 4 degrees of freedom, p= 0.066 
## Number of Newton-Raphson Iterations: 5 
## n= 72 
## 
## ### Mediation analysis 
##              est         se         Z          p       lower      upper
## cde  0.463323084 0.34636957 1.3376553 0.18100884 -0.21554880 1.14219497
## pnde 0.582515084 0.24557485 2.3720470 0.01768984  0.10119722 1.06383295
## tnie 0.006182822 0.02643657 0.2338738 0.81508294 -0.04563191 0.05799755
## tnde 0.574385442 0.24784035 2.3175623 0.02047312  0.08862728 1.06014360
## pnie 0.014312464 0.05541066 0.2582980 0.79617690 -0.09429043 0.12291536
## te   0.588697906 0.24809627 2.3728608 0.01765092  0.10243816 1.07495766
## pm   0.013852661 0.05856686 0.2365273 0.81302354 -0.10093628 0.12864160
## 
## Evaluated at:
## avar: x
##  a1 (intervened value of avar) = 1
##  a0 (reference value of avar)  = 0
## mvar: m
##  m_cde (intervend value of mvar for cde) = 1
## cvar: c
##  c_cond (covariate vector value) = 0.5
## 
## Note that effect estimates can vary over m_cde and c_cond values when interaction = TRUE.

Multiple imputation

This specific data setting is a little tricky in that the outcome variable is a censored survival time variable. Here we will use a log transformed survival time.

library(mice)
## Error in library(mice): there is no package called 'mice'
vv2015_mod <- vv2015 %>%
    mutate(log_y = log(y)) %>%
    select(x,m,c,log_y,event)
## Run mice
vv2015_mice <- mice(data = vv2015_mod, m = 50, printFlag = FALSE)
## Error in mice(data = vv2015_mod, m = 50, printFlag = FALSE): could not find function "mice"
## Object containig 50 imputed dataset
vv2015_mice
## Error in eval(expr, envir, enclos): object 'vv2015_mice' not found

After creating such MI datasets, mediation analysis can be performed in each dataset.

## Fit in each MI dataset
vv2015_mice_regmedint <-
    vv2015_mice %>%
    ## Stacked up dataset
    mice::complete("long") %>%
    as_tibble() %>%
    mutate(y = exp(log_y)) %>%
    group_by(.imp) %>%
    ## Nested data frame
    nest() %>%
    mutate(fit = map(data, function(data) {
        regmedint(data = data,
                  ## Variables
                  yvar = "y",
                  avar = "x",
                  mvar = "m",
                  cvar = c("c"),
                  eventvar = "event",
                  ## Values at which effects are evaluated
                  a0 = 0,
                  a1 = 1,
                  m_cde = 1,
                  c_cond = 0.5,
                  ## Model types
                  mreg = "logistic",
                  yreg = "survAFT_weibull",
                  ## Additional specification
                  interaction = TRUE,
                  casecontrol = FALSE)
    })) %>%
    ## Extract point estimates and variance estimates
    mutate(coef_fit = map(fit, coef),
           vcov_fit = map(fit, vcov))
## Error in loadNamespace(x): there is no package called 'mice'
vv2015_mice_regmedint
## Error in eval(expr, envir, enclos): object 'vv2015_mice_regmedint' not found

The results can be combined using the mitools package.

regmedint_mi <- mitools::MIcombine(results = vv2015_mice_regmedint$coef_fit,
                                   variances = vv2015_mice_regmedint$vcov_fit)
## Error in loadNamespace(x): there is no package called 'mitools'
regmedint_mi_summary <- summary(regmedint_mi)
## Error in eval(expr, envir, enclos): object 'regmedint_mi' not found

Comparison

We can observe the MI estimtates are generally more in alignment with the true estimates than the complete-case analysis estimates.

cbind(true = coef(regmedint_true),
      cca = coef(regmedint_cca),
      mi = regmedint_mi_summary$results)
## Error in eval(expr, envir, enclos): object 'regmedint_mi_summary' not found