Chapter 10 Appendix A: Data generating mechanisms
The data generating mechanisms are characterized by a causal model and a statistical model that generate data given in example.
In the first causal model, the mediator-outcome confounder \(L(1)\) is not affected by the exposure. In the second causal model, the mediator-outcome confounder \(L(1)\) is affected by the exposure.
10.1 First causal model: Data generating mechanism without mediator-outcome confounder affected by the exposure
This data generating mechanism is defined by the following set of structural equations: \[\begin{array}{lll} P(L(0)_{male} = 1) &=& p_{L(0)_{male}}\\ P(L(0)_{soc.env} = 1) &=& p_{L(0)_{soc.env}}\\ P(A_{PM_{2.5}} = 1) &=& \beta_{A} + \beta_{male}^A \times L(0)_{male} + \beta_{soc.env}^A \times L(0)_{soc.env}\\ P(L(1) = 1) &=& p_{L(1)}\\ P(M_{smoking} = 1) &=& \beta_{M} + \beta_{male}^M \times L(0)_{male} + \beta_{soc.env}^M \times L(0)_{soc.env} + \beta_{L(1)}^M \times L(1) + \beta_{A}^M \times A_{PM_{2.5}}\\ P(Y_{death} = 1) &=& \beta_{Y} + \beta_{male}^Y \times L(0)_{male} + \beta_{soc.env}^Y \times L(0)_{soc.env} + \beta_{L(1)}^Y \times L(1)\\ & & + \beta_{A}^Y \times A_{PM_{2.5}} + \beta_{M}^Y \times M_{diabetes} + \beta_{A \ast M }^Y \times A_{PM_{2.5}} \times M_{diabetes}\\ \mathbb{E}(Y_{Qol} = 1) &=& \gamma_{Y} + \gamma_{male}^Y \times L(0)_{male} + \gamma_{soc.env}^Y \times L(0)_{soc.env} + \gamma_{L(1)}^Y \times L(1)\\ & &+ \gamma_{A}^Y \times A_{PM_{2.5}} + \gamma_{M}^Y \times M_{diabetes} + \gamma_{A \ast M }^Y \times A_{PM_{2.5}} \times M_{diabetes} + \varepsilon_Y \end{array}\] where \(\varepsilon_Y \sim \mathcal{N}(0,\sigma_Y = 10)\).
One can set the parameters of these structural equations using the following function param.causal.model.1()
:
param.causal.model.1 <- function(A.M.interaction = NULL) {
# L0
p_L0_male <- 0.5
p_L0_soc_env <- 0.65
# A: A0_PM2.5 <- rbinom( 0.05 + 0.04 * L0_male + 0.06 * L0_soc_env )
b_A <- 0.05 # reference prevalence is 5%
b_male_A <- 0.04 # + 0.04 for the effect of L0_male -> A0_PM2.5
b_soc_env_A <- 0.06 # +0.06 for the effect of L0_soc_env -> A0_PM2.5
# L1: intermediate confounder between M and Y, not influenced by A
p_L1 <- 0.3
# M: M_diabetes <- rbinom( 0.2 + 0.05 * L0_male + 0.06 * L0_soc_env + 0.07 * L1 +
# 0.1 * A0_PM2.5 )
b_M <- 0.2 # reference prevalence is 20%
b_male_M <- 0.05 # +0.05 for the effect of L0_male -> M_diabetes
b_soc_env_M <- 0.06 # +0.06 for the effect of L0_soc_env -> M_diabetes
b_L1_M <- 0.07 # +0.07 for the effect of L1 -> M_diabetes
b_A_M <- 0.1 # +0.10 for the effect of A0_PM2.5 -> M_diabetes
# Y binary: rbinom( 0.10 + 0.06 * L0_male + 0.04 * L0_soc_env + 0.05 * A0_PM2.5 +
# 0.07 * L1 + 0.08 * M_diabetes +
# 0.03 * A0_PM2.5 * M_diabetes * A.M.inter )
b_Y <- 0.1 # reference prevalence is 10%
b_male_Y <- 0.06 # +0.06 for the effect of L0_male -> Y
b_soc_env_Y <- 0.04 # +0.04 for the effect of L0_soc_env -> Y
b_A_Y <- 0.05 # 0.05 for the effect of A0_PM2.5 -> Y
b_L1_Y <- 0.07 # +0.07 for the effect of L1 -> Y
b_M_Y <- 0.08 # 0.08 for the effect of M_diabetes -> Y
b_AM_Y <- 0.03 # 0.03 for the interaction effect A0_PM2.5 * M_diabetes -> Y
# Y continuous: (75 - 1 * L0_male - 3 * L0_soc_env - 4 * A0_PM2.5 -3.5 * L1 -
# 9 * M_diabetes -5 * A0_PM2.5 * M_diabetes * A.M.inter ) +
# rnorm(N, mean = 0, sd = 10)
mu_Y <- 75 # reference mean for QoL
c_male_Y <- -1 # -1 for the effect of L0_male -> Y
c_soc_env_Y <- -3 # -3 for the effect of L0_soc_env -> Y
c_A_Y <- -4 # -4 for the effect of A0_PM2.5 -> Y
c_L1_Y <- -3.5 # -3.5 for the effect of L1 -> Y
c_M_Y <- -9 # -9 for the effect of M_diabetes -> Y
c_AM_Y <- -5 # - 5 for the interaction effect A0_PM2.5 * M_diabetes -> Y
sd_Y <- 10 # standard deviation of the residuals
# A*M interaction ?
A.M.inter <- A.M.interaction
coef <- c( p_L0_male = p_L0_male, p_L0_soc_env = p_L0_soc_env,
b_A = b_A, b_male_A = b_male_A, b_soc_env_A = b_soc_env_A,
p_L1 = p_L1,
b_M = b_M, b_male_M = b_male_M, b_soc_env_M = b_soc_env_M,
b_L1_M = b_L1_M, b_A_M = b_A_M,
b_Y = b_Y, b_male_Y = b_male_Y, b_soc_env_Y = b_soc_env_Y,
b_A_Y = b_A_Y, b_L1_Y = b_L1_Y, b_M_Y = b_M_Y, b_AM_Y = b_AM_Y,
mu_Y = mu_Y, c_male_Y = c_male_Y, c_soc_env_Y = c_soc_env_Y,
c_A_Y = c_A_Y, c_L1_Y = c_L1_Y, c_M_Y = c_M_Y, c_AM_Y = c_AM_Y,
sd_Y = sd_Y, A.M.inter = A.M.inter)
return(coef)
}
10.2 Second causal model: Data generating mechanism with mediator-outcome confounder affected by the exposure
This data generating mechanism is defined by the following set of structural equations: \[\begin{array}{lll} P(L(0)_{male} = 1) &=& p_{L(0)_{male}}\\ P(L(0)_{soc.env} = 1) &=& p_{L(0)_{soc.env}}\\ P(A_{PM_{2.5}} = 1) &=& \beta_{A} + \beta_{male}^A \times L(0)_{male} + \beta_{soc.env}^A \times L(0)_{soc.env}\\ P(L(1) = 1) &=& \beta_{L(1)} + \beta_{male}^{L(1)} \times L(0)_{male} + \beta_{soc.env}^{L(1)} \times L(0)_{soc.env} + \beta_{A}^{L(1)} \times A_{PM_{2.5}}\\ P(M_{diabetes} = 1) &=& \beta_{M} + \beta_{male}^M \times L(0)_{male} + \beta_{soc.env}^M \times L(0)_{soc.env} + \beta_{L(1)}^M \times L(1) + \beta_{A}^M \times A_{PM_{2.5}}\\ P(Y_{death} = 1) &=& \beta_{Y} + \beta_{male}^Y \times L(0)_{male} + \beta_{soc.env}^Y \times L(0)_{soc.env} + \beta_{L(1)}^Y \times L(1)\\ & & + \beta_{A}^Y \times A_{PM_{2.5}} + \beta_{M}^Y \times M_{diabetes} + \beta_{A \ast M }^Y \times A_{PM_{2.5}} \times M_{diabetes}\\ \mathbb{E}(Y_{Qol} = 1) &=& \gamma_{Y} + \gamma_{male}^Y \times L(0)_{male} + \gamma_{soc.env}^Y \times L(0)_{soc.env} + \gamma_{L(1)}^Y \times L(1)\\ & &+ \gamma_{A}^Y \times A_{PM_{2.5}} + \gamma_{M}^Y \times M_{diabetes} + \gamma_{A \ast M }^Y \times A_{PM_{2.5}} \times M_{diabetes} + \varepsilon_Y \end{array}\] where \(\varepsilon_Y \sim \mathcal{N}(0,\sigma_Y = 10)\).
One can set the parameters of these structural equations using the following function param.causal.model.2()
:
param.causal.model.2 <- function(A.M.interaction = NULL) {
# L0
p_L0_male <- 0.5
p_L0_soc_env <- 0.65
# A: A0_PM2.5 <- rbinom( 0.05 + 0.04 * L0_male + 0.06 * L0_soc_env )
b_A <- 0.05 # reference prevalence is 5%
b_male_A <- 0.04 # + 0.04 for the effect of L0_male -> A0_PM2.5
b_soc_env_A <- 0.06 # +0.06 for the effect of L0_soc_env -> A0_PM2.5
# L1: L1 <- rbinom( 0.30 - 0.05 * L0_male + 0.08 * L0_soc_env +
# 0.2 * A0_PM2.5 )
b_L1 <- 0.30 # reference prevalence is 30%
b_male_L1 <- -0.05 # - 0.05 for the effect of L0_male -> L1
b_soc_env_L1 <- +0.08 # + 0.08 for the effect of L0_soc_env -> L1
b_A_L1 <- +0.2 # +0.2 for the effect of A0_PM2.5 -> L1
# M: M_diabetes <- rbinom( 0.2 + 0.05 * L0_male + 0.06 * L0_soc_env +
# 0.2 * L1 + 0.1 * A0_PM2.5 )
b_M <- 0.2 # reference prevalence is 20%
b_male_M <- 0.05 # +0.05 for the effect of L0_male -> M_diabetes
b_soc_env_M <- 0.06 # +0.06 for the effect of L0_soc_env -> M_diabetes
b_A_M <- 0.1 # +0.10 for the effect of A0_PM2.5 -> M_diabetes
b_L1_M <- 0.2 # +0.2 for the effect of L1 -> M_diabetes
# Y binary: rbinom( 0.10 + 0.06 * L0_male + 0.04 * L0_soc_env +
# 0.05 * A0_PM2.5 + 0.07 * L1 + 0.08 * M_diabetes +
# 0.03 * A0_PM2.5 * M_diabetes * A.M.inter )
b_Y <- 0.1 # reference prevalence is 10%
b_male_Y <- 0.06 # +0.06 for the effect of L0_male -> Y
b_soc_env_Y <- 0.04 # +0.04 for the effect of L0_soc_env -> Y
b_A_Y <- 0.05 # 0.05 for the effect of A0_PM2.5 -> Y
b_L1_Y <- 0.07 # +0.07 for the effect of L1 -> Y
b_M_Y <- 0.08 # 0.08 for the effect of M_diabetes -> Y
b_AM_Y <- 0.03 # 0.03 for the interaction effect A0_PM2.5 * M_diabetes -> Y
# Y continuous: (75 - 1 * L0_male - 3 * L0_soc_env - 4 * A0_PM2.5 +
# -3.5 * L1 - 9 * M_diabetes +
# -5 * A0_PM2.5 * M_diabetes * A.M.inter ) + rnorm(N, mean = 0, sd = 10)
mu_Y <- 75 # reference mean for QoL
c_male_Y <- -1 # -1 for the effect of L0_male -> Y
c_soc_env_Y <- -3 # -3 for the effect of L0_soc_env -> Y
c_A_Y <- -4 # -4 for the effect of A0_PM2.5 -> Y
c_L1_Y <- -5 # -5 for the effect of L1 -> Y
c_M_Y <- -9 # -9 for the effect of M_diabetes -> Y
c_AM_Y <- -5 # - 5 for the interaction effect A0_PM2.5 * M_diabetes -> Y
sd_Y <- 10 # standard deviation of the residuals
# A*M interaction ?
A.M.inter <- A.M.interaction
coef <- c( p_L0_male = p_L0_male, p_L0_soc_env = p_L0_soc_env,
b_A = b_A, b_male_A = b_male_A, b_soc_env_A = b_soc_env_A,
b_L1 = b_L1, b_male_L1 = b_male_L1, b_soc_env_L1 = b_soc_env_L1,
b_A_L1 = b_A_L1,
b_M = b_M, b_male_M = b_male_M, b_soc_env_M = b_soc_env_M,
b_L1_M = b_L1_M, b_A_M = b_A_M,
b_Y = b_Y, b_male_Y = b_male_Y, b_soc_env_Y = b_soc_env_Y,
b_A_Y = b_A_Y, b_L1_Y = b_L1_Y, b_M_Y = b_M_Y, b_AM_Y = b_AM_Y,
mu_Y = mu_Y, c_male_Y = c_male_Y, c_soc_env_Y = c_soc_env_Y,
c_A_Y = c_A_Y, c_L1_Y = c_L1_Y, c_M_Y = c_M_Y, c_AM_Y = c_AM_Y,
sd_Y = sd_Y, A.M.inter = A.M.inter)
return(coef)
}
10.3 Simulation of the four data sets used in examples
10.3.1 Data sets generated from the causal model 1
The following function gen.data.causal.model.1
can be used to simulate data sets using the parameters defined previously in the param.causal.model.1
function.
gen.data.causal.model.1 <- function(N, A.M.inter) { # input parameters are the
# sample size N and the presence of A*M interaction with A.M.inter = 0 or 1
b <- param.causal.model.1(A.M.interaction = A.M.inter)
# baseline confounders: parent's educational level=L0_soc_env & sex=L0_male
L0_male <- rbinom(N, size = 1, prob = b["p_L0_male"])
L0_soc_env <- rbinom(N, size = 1, prob = b["p_L0_soc_env"])
# exposure: A0_PM2.5
A0_PM2.5 <- rbinom(N, size = 1, prob = b["b_A"] +
b["b_male_A"] * L0_male +
b["b_soc_env_A"] * L0_soc_env )
# intermediate confounder between M_diabetes and Y, not affected by A0 L1
L1 <- rbinom(N, size = 1, prob = b["p_L1"])
# mediator: M_diabetes
M_diabetes <- rbinom(N, size = 1, prob = b["b_M"] +
b["b_male_M"] * L0_male +
b["b_soc_env_M"] * L0_soc_env +
b["b_A_M"] * A0_PM2.5 +
b["b_L1_M"] * L1)
# Y_death
Y_death <- rbinom(N, size = 1, prob = b["b_Y"] +
b["b_male_Y"] * L0_male +
b["b_soc_env_Y"] * L0_soc_env +
b["b_A_Y"] * A0_PM2.5 +
b["b_L1_Y"] * L1 +
b["b_M_Y"] * M_diabetes +
b["b_AM_Y"] * A0_PM2.5 * M_diabetes * A.M.inter )
# Y_qol
Y_qol <- ( b["mu_Y"] +
b["c_male_Y"] * L0_male +
b["c_soc_env_Y"] * L0_soc_env +
b["c_A_Y"] * A0_PM2.5 +
b["c_L1_Y"] * L1 +
b["c_M_Y"] * M_diabetes +
b["c_AM_Y"] * A0_PM2.5 * M_diabetes * A.M.inter ) +
rnorm(N, mean = 0, sd = b["sd_Y"])
# data.frame
data.sim <- data.frame(L0_male, L0_soc_env, A0_PM2.5, L1, M_diabetes,
Y_death, Y_qol)
return( data.sim )
}
Applying a sample size N=10000, we generate the df1.csv
and df1_int.csv
data sets.
set.seed(1234)
df1 <- gen.data.causal.model.1(N=10000, A.M.inter=0)
write.csv(df1, file = "data/df1.csv", row.names = FALSE)
set.seed(1234)
df1_int <- gen.data.causal.model.1(N=10000, A.M.inter=1)
write.csv(df1_int, file = "data/df1_int.csv", row.names = FALSE)
## L0_male L0_soc_env A0_PM2.5 L1 M_diabetes Y_death Y_qol
## 1 0 1 0 1 0 0 93.41819
## 2 1 1 1 1 0 1 64.03221
## 3 1 1 0 0 0 0 75.56249
## 4 1 0 0 0 0 0 89.77055
## 5 1 1 0 0 0 0 77.22353
## 6 1 1 0 0 1 0 73.87975
## L0_male L0_soc_env A0_PM2.5 L1 M_diabetes Y_death Y_qol
## 1 0 1 0 1 0 0 93.41819
## 2 1 1 1 1 0 1 64.03221
## 3 1 1 0 0 0 0 75.56249
## 4 1 0 0 0 0 0 89.77055
## 5 1 1 0 0 0 0 77.22353
## 6 1 1 0 0 1 0 73.87975
10.3.2 Data sets generated from the causal model 2
The following function gen.data.causal.model.2
can be used to simulate data sets using the parameters defined previously in the param.causal.model.2
function.
gen.data.causal.model.2 <- function(N, A.M.inter) { # input parameters are the
# sample size N and the presence of A*M interaction with A.M.inter = 0 or 1
b <- param.causal.model.2(A.M.interaction = A.M.inter)
# baseline confounders: parent's educational level=L0_soc_env & sex=L0_male
L0_male <- rbinom(N, size = 1, prob = b["p_L0_male"])
L0_soc_env <- rbinom(N, size = 1, prob = b["p_L0_soc_env"])
# exposure: A0_PM2.5
A0_PM2.5 <- rbinom(N, size = 1, prob = b["b_A"] +
b["b_male_A"] * L0_male +
b["b_soc_env_A"] * L0_soc_env )
# intermediate confounder between M_diabetes and Y,
L1 <- rbinom(N, size = 1, prob = b["b_L1"] +
b["b_male_L1"] * L0_male +
b["b_soc_env_L1"] * L0_soc_env +
b["b_A_L1"]* A0_PM2.5)
# mediator: M_diabetes
M_diabetes <- rbinom(N, size = 1, prob = b["b_M"] +
b["b_male_M"] * L0_male +
b["b_soc_env_M"] * L0_soc_env +
b["b_A_M"] * A0_PM2.5 +
b["b_L1_M"] * L1)
# Y_death
Y_death <- rbinom(N, size = 1, prob = b["b_Y"] +
b["b_male_Y"] * L0_male +
b["b_soc_env_Y"] * L0_soc_env +
b["b_A_Y"] * A0_PM2.5 +
b["b_L1_Y"] * L1 +
b["b_M_Y"] * M_diabetes +
b["b_AM_Y"] * A0_PM2.5 * M_diabetes * A.M.inter )
# Y_qol
Y_qol <- ( b["mu_Y"] +
b["c_male_Y"] * L0_male +
b["c_soc_env_Y"] * L0_soc_env +
b["c_A_Y"] * A0_PM2.5 +
b["c_L1_Y"] * L1 +
b["c_M_Y"] * M_diabetes +
b["c_AM_Y"] * A0_PM2.5 * M_diabetes * A.M.inter ) +
rnorm(N, mean = 0, sd = b["sd_Y"])
# data.frame
data.sim <- data.frame(L0_male, L0_soc_env, A0_PM2.5, L1, M_diabetes,
Y_death, Y_qol)
return( data.sim )
}
Applying a sample size N=10000, we generate the df2.csv
and df2_int.csv
data sets.
set.seed(1234)
df2 <- gen.data.causal.model.2(N=10000, A.M.inter=0)
write.csv(df2, file = "data/df2.csv", row.names = FALSE)
set.seed(1234)
df2_int <- gen.data.causal.model.2(N=10000, A.M.inter=1)
write.csv(df2_int, file = "data/df2_int.csv", row.names = FALSE)
## L0_male L0_soc_env A0_PM2.5 L1 M_diabetes Y_death Y_qol
## 9995 0 1 0 1 1 0 53.25115
## 9996 0 1 0 0 1 0 66.36484
## 9997 0 1 1 1 1 0 74.20579
## 9998 1 1 0 0 1 0 41.30248
## 9999 0 0 0 1 0 0 85.60169
## 10000 1 1 0 0 0 0 61.56969
## L0_male L0_soc_env A0_PM2.5 L1 M_diabetes Y_death Y_qol
## 9995 0 1 0 1 1 0 53.25115
## 9996 0 1 0 0 1 0 66.36484
## 9997 0 1 1 1 1 0 69.20579
## 9998 1 1 0 0 1 0 41.30248
## 9999 0 0 0 1 0 0 85.60169
## 10000 1 1 0 0 0 0 61.56969