5 Simulation example

#Load packages
library(ggdag)
library(dagitty)
library(rstanarm)
library(tidybayes)
library(ggdist)
library(tidyverse)
library(gt)
library(rstan)
library(bayesboot)

5.1 Data-generating process

The data-generating process is described via the directed acyclic graph (DAG) in Fig. 5.1. In this DAG, acoustic sensation \((\text{C})\) is influenced directly by noise \((\text{N})\), biological sex \((\text{S})\) and age \((\text{A})\). Participation \((\text{P})\) in the survey response is affected by biological sex \((\text{S})\) and age \((\text{A})\).

Code

dag_coords.ex5 <-
  data.frame(name = c('S', 'N', 'P', 'C', 'A'),
             x = c(1, 1, 3.5, 3.5, 6),
             y = c(2, 3, 1, 3, 2))

DAG.ex5 <-
  dagify(C ~ N + S + A,
         P ~ S + A,
         coords = dag_coords.ex5)

node_labels <- c(
  S = 'bold(S)', 
  N = 'bold(N)', 
  P = 'bold(P)', 
  C = 'bold(C)', 
  A = 'bold(A)'
)

ggplot(data = DAG.ex5, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_point(x = 3.5, y = 1, shape = 0, size = 12, stroke = 0.9, color = 'black') +
  geom_dag_text(aes(label = node_labels[name]),
                parse = TRUE,
                colour = 'black',
                size = 10,
                family = 'mono') +
  geom_dag_edges(arrow_directed = grid::arrow(length = grid::unit(10, 'pt'), type = 'open'),
                 edge_colour = 'black',
                 family = 'mono',
                 fontface = 'bold') +
  annotate('text', x = 3.5, y = 0.68, label = 'participation', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 3.5, y = 3.3, label = 'acoustic sensation', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 1, y = 3.3, label = 'noise', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 6, y = 1.7, label = 'age', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 1, y = 1.7, label = 'sex', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  coord_cartesian(xlim = c(0.5, 6.5), ylim = c(0.8, 3.2))  +
  theme_dag()

Fig. 5.1. Graphical representation via DAG of the data-generating process. The box around P indicates that the sample is conditioned on participation.

The DAG in Fig. 5.1 can be written as:

\(C \sim f_{C}(N, S, A)\), read as ‘acoustic sensation \((\text{C})\) is some function of noise \((\text{N})\), biological sex \((\text{S})\) and age \((\text{A})\)’.
\(P \sim f_{P}(S, A)\), read as ‘participation \((\text{P})\) is some function of biological sex \((\text{S})\) and age \((\text{A})\)’.

5.2 Synthetic data set

To generate synthetic data, we defined the custom function data.sim_ex5(). This function takes as inputs the sample size n and generates synthetic data according to the DAG in Fig. 5.1.

data.sim_ex5 <- function(n) {
  b_age.conf = 0.3 #direct causal effect of A on C 1 or 0.2
  b_sex.conf = c(0, -1) #direct causal effect of S on C
  b_noise.conf = 0.7 #direct causal effect of N on C
  #Simulate age
  A <- sample(18:59, size = n, replace = TRUE)
  #Simulate biological sex
  S <- factor(sample(c('male', 'female'), size = n, replace = TRUE)) 
  #Simulate noise
  N <- rnorm(n = n, mean = 45, sd = 4)
  #Simulate acoustic sensation
  C <- rnorm(n = n, mean = ifelse(S == 'male', 10 + b_sex.conf[1] + b_noise.conf * N + b_age.conf * A, 10 + b_sex.conf[2] + b_noise.conf * N + b_age.conf * A), sd = 3)
  #Simulate participation
  P <- rbinom(n = n, size = 1, prob = plogis(ifelse(S == 'male', +23 -15.6 - 0.25 * A, +23 -13.5 - 0.25 * A)))
  #Return tibble with simulated values
  return(tibble(A, S, N, C, P))
  }

From this data generation mechanism, we simulated the target population, which consists of one million observations.

set.seed(2025) #set random number for reproducibility
#Simulate the population
ex5_population <- data.sim_ex5(n = 1e6)
#Set biological sex reference category to 'male'
ex5_population$S <- relevel(ex5_population$S, ref = 'male')
#View the data frame
ex5_population

# A tibble: 1,000,000 × 5
       A S          N     C     P
   <int> <fct>  <dbl> <dbl> <int>
 1    30 male    46.2  53.3     1
 2    29 female  47.5  47.4     1
 3    53 male    44.3  57.7     0
 4    43 female  40.3  52.4     0
 5    18 male    37.4  39.9     1
 6    40 female  36.2  48.1     0
 7    27 female  46.2  55.9     1
 8    30 female  48.2  52.7     1
 9    29 female  47.6  51.3     1
10    21 female  36.2  42.8     1
# ℹ 999,990 more rows

From this population, we obtained two data sets of five thousand observations each: one using simple random sampling and the other using non-random sampling.

The first sample (i.e., ex5_sample.random) will use simple random sampling, whereas the second one (i.e., ex5_sample.bias) will use non-random sampling. In this case, we will first select only the observations with P == 1 and subsequently sampling randomly from them. As a result, the sample obtained is biased.

n_sims <- 1e3 #number of data sets to simulate

#Set random number for reproducibility
set.seed(2025)  
#Generate a vector of random numbers for reproducibility
ex5_random.seed <- sample(1:1e5, size = n_sims, replace = FALSE)

set.seed(ex5_random.seed[1])
#Sample one data set of 5,000 observations
ex5_sample.random <- 
  ex5_population %>% 
  slice_sample(n = 5e3) #take a simple random sample of size 5,000

set.seed(ex5_random.seed[1])
#Sample one data set of 5,000 observations
ex5_sample.bias <- 
  ex5_population %>% 
  filter(P == 1) %>% slice_sample(n = 5e3) #take a biased sample of size 5,000

A summary of descriptive statistics for all variables in the data sets is given below.

summary(ex5_sample.random)

       A              S              N               C               P        
 Min.   :18.00   male  :2544   Min.   :31.27   Min.   :34.81   Min.   :0.000  
 1st Qu.:28.00   female:2456   1st Qu.:42.25   1st Qu.:48.85   1st Qu.:0.000  
 Median :39.00                 Median :44.90   Median :52.72   Median :0.000  
 Mean   :38.75                 Mean   :44.92   Mean   :52.62   Mean   :0.373  
 3rd Qu.:49.00                 3rd Qu.:47.66   3rd Qu.:56.58   3rd Qu.:1.000  
 Max.   :59.00                 Max.   :60.65   Max.   :69.52   Max.   :1.000

summary(ex5_sample.bias)

       A              S              N               C               P    
 Min.   :18.00   male  :1869   Min.   :30.84   Min.   :33.79   Min.   :1  
 1st Qu.:22.00   female:3131   1st Qu.:42.41   1st Qu.:46.10   1st Qu.:1  
 Median :26.00                 Median :45.04   Median :49.16   Median :1  
 Mean   :27.48                 Mean   :45.04   Mean   :49.17   Mean   :1  
 3rd Qu.:32.00                 3rd Qu.:47.79   3rd Qu.:52.28   3rd Qu.:1  
 Max.   :59.00                 Max.   :62.38   Max.   :65.06   Max.   :1

5.3 Data analysis

In this example, the target of our analysis is the total average causal effect, ACE (also known as total average treatment effect, ATE) of biological sex \((\text{S})\) on acoustic sensation \((\text{C})\), which stands for the expected increase of \(\text{C}\) in response to a change from male to female in \(\text{S}\) due to an intervention. The causal effect of interest is visualized in Fig. 5.2.

Code

ggplot(data = DAG.ex5, aes(x = x, y = y, xend = xend, yend = yend)) +
  #visualize causal effect path
  geom_segment(x = 1, xend = 3.5, y = 2, yend = 3,
               linewidth = 14, lineend = 'round', colour = '#009E73', alpha = 0.05) +
  geom_point(x = 3.5, y = 1, shape = 0, size = 12, stroke = 0.9, color = 'black') +
  geom_dag_text(aes(label = node_labels[name]),
                parse = TRUE,
                colour = 'black',
                size = 10,
                family = 'mono') +
  geom_dag_edges(arrow_directed = grid::arrow(length = grid::unit(10, 'pt'), type = 'open'),
                 edge_colour = 'black',
                 family = 'mono',
                 fontface = 'bold') +
  annotate('text', x = 3.5, y = 0.68, label = 'participation', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 3.5, y = 3.3, label = 'acoustic sensation', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 1, y = 3.3, label = 'noise', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 6, y = 1.7, label = 'age', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  annotate('text', x = 1, y = 1.7, label = 'sex', 
           size = 4, hjust = 0.5, colour = 'grey50') +
  #causal effect number
  annotate('text', x = 1.9, y = 2.6, label = '-1', 
           size = 4.5, hjust = 0.5, colour = 'black', parse = TRUE) +
  coord_cartesian(xlim = c(0.5, 6.5), ylim = c(0.8, 3.2))  +
  theme_dag()

Fig. 5.2. Graphical representation via DAG of the data-generating process. The box around P indicates that the sample is conditioned on participation. The green line indicates the causal question of interest, and the number on the path indicates the total average causal effect.

5.3.1 Identification

The first step to answer the causal question of interest is identification. Identification answers a ‘theoretical’ question by determining whether a causal effect can, in principle, be estimated from observed data. The backdoor criterion and its generalization, the adjustment criterion, allow us to understand whether our causal effect of interest can be identified and, if so, which variables we should (or should not) statistically adjust for (i.e., the adjustment set) to estimate the causal effect from the data.

Given its simplicity, we will first apply the backdoor criterion to identify valid adjustment sets to estimate the causal effect of interest. If the backdoor criterion is not applicable, we will apply its generalization, the adjustment criterion.

Backdoor criterion

Applying the backdoor criterion, we found no backdoor paths since no arrows enter into biological sex. However, we noticed that participation \((\text{P})\) is a descendant of biological sex \((\text{S})\) and is selected. Although selection and adjustment are conceptually distinct, both are forms of conditioning. Since the sample is implicitly conditioned on \(\text{P}\), a descendant of \(\text{S}\), the conditions required to apply the backdoor criterion are not met. We then apply the adjustment criterion.

Adjustment criterion

With the adjustment criterion, conditioning on \(\text{P}\) is allowed because \(\text{P}\) is not on the causal path from \(\text{S}\) to \(\text{C}\), but we also need to check that all the non-causal paths from \(\text{S}\) to \(\text{C}\) are closed. Since participation \((\text{P})\) is a collider (i.e., common effect) of biological sex \((\text{S})\) and age \((\text{A})\), as females and older individuals are more likely to participate in the experiment, the non-causal path \(\text{S} \rightarrow \text{P} \leftarrow \text{A} \rightarrow \text{C}\) is open. As a result, there is a spurious association between sex \((\text{S})\) and acoustic sensation \((\text{C})\).

Given the DAG in Fig. 5.1, we can use the adjustmentSets() function to identify the adjustment set algorithmically. Unfortunately, adjustmentSets() does not allow finding the adjustment set(s) when there is selection bias (i.e., a variable is conditioned on in the DAG). Nevertheless, for this example there is a workaround. We can still use adjustmentSets() to get the adjustment set(s) but then only select the one(s) that includes \(\text{P}\).

adjustmentSets(DAG.ex5,
               exposure = 'S', #biological sex
               outcome = 'C', #acoustic sensation
               type = 'all', 
               effect = 'total', 
               max.results = Inf)

 {}
{ A }
{ N }
{ A, N }
{ A, P }
{ A, N, P }

The resulting adjustment sets that include participation \((\text{P})\) are only two: adjust for age \((\text{A})\) or adjust for age \((\text{A})\) and noise \((\text{N})\); failing to do so will lead to bias.

5.3.2 Estimation

Following the identification step is the estimation step. This step addresses a statistical question by determining how the causal effect identified in the previous step can be estimated. To perform this step, we used a parametric (model-based) estimator, specifically, linear regression. This was possible because we designed the illustrative examples to be simple and with a linear relationship between the variables. This way, we limited the complexity of the examples themselves and shifted the focus to the application of the backdoor criterion to define ‘correct’ adjustment sets.

For transparency and understanding, all (implicit) assumptions used for this illustrative example are (explicitly) provided in Table 5.1.

Table 5.1. Summary description of the simulation example

Research question	Total average causal effect (ACE) of biological sex (S) on acoustic sensation (C).
Assumptions	Limited random variability: large sample size.
	Independence of observations: each observation represents independent bits of information.
	No confounding: the DAG includes all shared causes among the variables.
	No model error: perfect functional form specification.
	No measurement error: all variables are measured perfectly.
Variables	Noise (N): continuous variable [unit: dB(A)]
	Biological sex (S): categorical variable ['male'; 'female']
	Age (A): discrete (count) variable [unit: years]
	Acoustic sensation (C): continuous variable [unit: -]

To carry out the estimation step, we utilized linear regression within both the frequentist and Bayesian frameworks. Specifically, we will run three regression models:

Model.1 will include only biological sex \((\text{S})\) as predictor;
Model.2 will include biological sex \((\text{S})\) and age \((\text{A})\) as predictors.
Model.3 will include biological sex \((\text{S})\) and noise \((\text{N})\) as predictors.

The results of the fitted statistical models (i.e., Model.1, Model.2 and Model.3) are presented here.

Frequentist framework

#Fit the linear regression model with S and C (Model 1)
ex5_Model.1 <-
  lm(formula = C ~ S,
     data = ex5_sample.bias)
#View of the model summary
summary(ex5_Model.1)


Call:
lm(formula = C ~ S, data = ex5_sample.bias)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4275  -3.0739  -0.0302   3.1096  15.9110 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 49.21609    0.10604 464.114   <2e-16 ***
Sfemale     -0.07153    0.13401  -0.534    0.594    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.584 on 4998 degrees of freedom
Multiple R-squared:  5.701e-05, Adjusted R-squared:  -0.0001431 
F-statistic: 0.2849 on 1 and 4998 DF,  p-value: 0.5935

#Fit the linear regression model with S, A and C (Model 2)
ex5_Model.2 <-
  lm(formula = C ~ S + A,
     data = ex5_sample.bias)
#View of the model summary
summary(ex5_Model.2)


Call:
lm(formula = C ~ S + A, data = ex5_sample.bias)

Residuals:
    Min      1Q  Median      3Q     Max 
-14.456  -2.763   0.013   2.757  15.015 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 41.956227   0.231256 181.428  < 2e-16 ***
Sfemale     -1.001084   0.123454  -8.109 6.37e-16 ***
A            0.285392   0.008283  34.457  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.121 on 4997 degrees of freedom
Multiple R-squared:  0.192, Adjusted R-squared:  0.1917 
F-statistic: 593.8 on 2 and 4997 DF,  p-value: < 2.2e-16

#Fit the linear regression model with S, N and C (Model 3)
ex5_Model.3 <-
  lm(formula = C ~ S + N,
     data = ex5_sample.bias)
#View of the model summary
summary(ex5_Model.3)


Call:
lm(formula = C ~ S + N, data = ex5_sample.bias)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.545  -2.523  -0.049   2.505  14.146 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 18.18180    0.59013  30.810   <2e-16 ***
Sfemale     -0.01190    0.10713  -0.111    0.912    
N            0.68816    0.01295  53.140   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.665 on 4997 degrees of freedom
Multiple R-squared:  0.3611,    Adjusted R-squared:  0.3608 
F-statistic:  1412 on 2 and 4997 DF,  p-value: < 2.2e-16

The estimated coefficients for the three models are then plotted in Fig. 5.3.

Code

b_sex.conf = c(0, -1)

data.frame(model = c('Model.1', 'Model.2', 'Model.3'),
           estimate = c(coef(ex5_Model.1)['Sfemale'], 
                        coef(ex5_Model.2)['Sfemale'],
                        coef(ex5_Model.3)['Sfemale']),
           lower.95.CI = c(confint(ex5_Model.1, level = 0.95, type = 'Wald')['Sfemale', 1], 
                           confint(ex5_Model.2, level = 0.95, type = 'Wald')['Sfemale', 1],
                           confint(ex5_Model.3, level = 0.95, type = 'Wald')['Sfemale', 1]),
           upper.95.CI = c(confint(ex5_Model.1, level = 0.95, type = 'Wald')['Sfemale', 2], 
                           confint(ex5_Model.2, level = 0.95, type = 'Wald')['Sfemale', 2],
                           confint(ex5_Model.3, level = 0.95, type = 'Wald')['Sfemale', 2])) %>%
  
ggplot(aes(x = estimate, y = model, xmin = lower.95.CI, xmax = upper.95.CI)) + 
  geom_vline(xintercept = b_sex.conf[2], alpha = 0.8, linetype = 'dashed', colour = 'blue') + 
  geom_linerange() +
  geom_point(shape = 21, size = 2, fill = 'white', stroke = 1) +
  scale_x_continuous('Estimate', 
                     breaks = seq(from = -10, to = 10, by = 0.25),
                     limits = c(-1.55, 0.55)
                     ) +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA),
        axis.title.y = element_blank())

Fig. 5.3. Estimates of the biological sex coefficient for the three models. The white dots represent the point estimate, and the black lines represent the 95% confidence intervals. The blue dashed line represents the parameter used to generate the data.

Fig. 5.3 shows the estimates (point estimate and 95% confidence interval) of the coefficient for biological sex for Model.1, Model.2and Model.3.

For Model.1 we found a negative coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) equal to -0.072 with 95% confidence interval (CI) [-0.334, 0.191]. Since the 95% CI includes zero, the regression coefficient is not statistically significantly different from zero at the 0.05 level (p-value = 0.594). The observed data are consistent with zero, but also consistent with other non-zero values (i.e., all numbers within the 95% confidence interval). Additionally, since the 95% CI does not include the data-generating parameter for sex (i.e., b_sex.conf = -1), we can deduce that the estimated coefficient for sex is statistically significantly different from -1 at the 0.05 level. We can test this more formally by using the linearHypothesis() function. Specifically,

#Test for Sfemale = -1
car::linearHypothesis(ex5_Model.1, 'Sfemale = -1') # car:: access the function from the car package without loading it in the environment


Linear hypothesis test:
Sfemale = - 1

Model 1: restricted model
Model 2: C ~ S

  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1   4999 106053                                  
2   4998 105044  1    1008.9 48.004 4.791e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The resulting p-value is 4.79e-12, which indicates that we can reject the null hypothesis (i.e., Sfemale = -1) at the 0.05 level. This result suggests that the regression coefficient for sex (i.e., -0.072) is statistically significantly different from -1. Since the estimated causal effect from Model.1 is biased, we would erroneously conclude that changing biological sex from male to female causes a decrease in acoustic sensation by -0.072 units (95% CI [-0.334, 0.191]).

For Model.2 we found a negative coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) equal to -1.001 with 95% CI [-1.243, -0.759]. Since the 95% CI excludes zero, the regression coefficient is statistically significantly different from zero at the 0.05 level (p-value = 6.37e-16). Additionally, since the 95% CI includes the data-generating parameter for sex (i.e., b_sex.conf = -1), we can deduce that the estimated coefficient for sex is not statistically significantly different from -1 at the 0.05 level (although this will be the case for all numbers within the 95% confidence interval). We can test this more formally by using the linearHypothesis() function. Specifically,

#Test for Sfemale = -1
car::linearHypothesis(ex5_Model.2, 'Sfemale = -1') # car:: access the function from the car package without loading it in the environment


Linear hypothesis test:
Sfemale = - 1

Model 1: restricted model
Model 2: C ~ S + A

  Res.Df   RSS Df Sum of Sq     F Pr(>F)
1   4998 84877                          
2   4997 84877  1 0.0013098 1e-04  0.993

The resulting p-value is 0.993, which indicates that we fail to reject the null hypothesis (i.e., Sfemale = -1) at the 0.05 level. This result suggests that the regression coefficient for sex (i.e., -1.001) is not statistically significantly different from -1. Since the estimated causal effect from Model.2 is unbiased, we would correctly conclude that changing biological sex from male to female causes a decrease in acoustic sensation by -1.001 units (95% CI [-1.243, -0.759]).

For Model.3 we found a negative coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) equal to -0.012 with 95% CI [-0.222, 0.198]. Since the 95% CI includes zero, the regression coefficient is not statistically significantly different from zero at the 0.05 level (p-value = 0.912). The observed data are consistent with zero, but also consistent with other non-zero values (i.e., all numbers within the 95% confidence interval). Additionally, since the 95% CI does not include the data-generating parameter for sex (i.e., b_sex.conf = -1), we can deduce that the estimated coefficient for sex is statistically significantly different from -1 at the 0.05 level. We can test this more formally by using the linearHypothesis() function. Specifically,

#Test for Sfemale = -1
car::linearHypothesis(ex5_Model.3, 'Sfemale = -1') # car:: access the function from the car package without loading it in the environment


Linear hypothesis test:
Sfemale = - 1

Model 1: restricted model
Model 2: C ~ S + N

  Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
1   4998 68259                                  
2   4997 67116  1    1142.5 85.066 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The resulting p-value is 4.16e-20, which indicates that we can reject the null hypothesis (i.e., Sfemale = -1) at the 0.05 level. This result suggests that the regression coefficient for sex (i.e., -0.012) is statistically significantly different from -1. Since the estimated causal effect from Model.1 is biased, we would erroneously conclude that changing biological sex from male to female causes a decrease in acoustic sensation by -0.012 units (95% CI [-0.222, 0.198]).

Importantly, for Model.1, Model.2 and Model.3, the 95% confidence interval means that if we were to repeat the sampling process and calculate the interval many times, 95% of those calculated intervals would contain the true population parameter. To highlight this, we can repeat the analysis by fitting the two models to one thousand data sets randomly selected from our population.

The for-loop shown in the code below performs the following operations. First, sample (using simple random sampling) a data set of 5,000 observations from the target population. Subsequently, perform linear regression using Model.1, Model.2 and Model.3 and store the estimated coefficients for sex, its standard error and 95% confidence interval in the data frame coefs_ex5. This operation is repeated a thousand times, resulting in the data frame coefs_ex5 containing the estimates (point estimate, standard error and confidence interval) of a thousand random samples of size 5,000 using both Model.1, Model.2 and Model.3.

n_model <- c('mod.1', 'mod.2', 'mod.3')     

n_row <- n_sims*length(n_model)

#Create an empty data frame
empty.df <- data.frame(matrix(NA, nrow = n_row, ncol = 7))
#Rename the data frame columns
colnames(empty.df) <- c('sim.id', 'estimate', 'se', 'CI_2.5', 'CI_97.5',
                        'model', 'coverage')

#Sample a thousand data sets of 5,000 observations and perform linear regression 
coefs_ex5 <- empty.df #assign the empty data frame
k = 1
for (i in 1:n_sims){
  set.seed(ex5_random.seed[i]) #set unique seed for each simulation 
#Sample data set from population   
  sample.bias <- 
    ex5_population %>% 
    filter(P == 1) %>% 
    slice_sample(n = 5e3) #take a simple random sample of size 5,000
#Fit models
  for (j in 1:length(n_model)){
    if (n_model[j] == 'mod.1'){
      fit <- lm(formula = C ~ S,
                data = sample.bias)
    } else if (n_model[j] == 'mod.2'){
      fit <- lm(formula = C ~ S + A,
                data = sample.bias)  
    } else {
      fit <- lm(formula = C ~ S + N,
                data = sample.bias)
    }  
#Compile matrix  
  coefs_ex5[k, 1] <- i #simulation ID
  coefs_ex5[k, 2] <- coef(fit)['Sfemale'] #point estimate
  coefs_ex5[k, 3] <- summary(fit)$coef['Sfemale','Std. Error'] #standard error
  coefs_ex5[k, 4:5] <- confint(fit, level = 0.95, type = 'Wald')['Sfemale', ] #confidence interval (Wald)
  coefs_ex5[k, 6] <- n_model[j] #sample size
  k = k + 1
  }
}
coefs_ex5 <- as_tibble(coefs_ex5)
#View the data frame
coefs_ex5

# A tibble: 3,000 × 7
   sim.id estimate    se CI_2.5 CI_97.5 model coverage
    <int>    <dbl> <dbl>  <dbl>   <dbl> <chr> <lgl>   
 1      1  -0.0715 0.134 -0.334   0.191 mod.1 NA      
 2      1  -1.00   0.123 -1.24   -0.759 mod.2 NA      
 3      1  -0.0119 0.107 -0.222   0.198 mod.3 NA      
 4      2   0.0547 0.136 -0.211   0.321 mod.1 NA      
 5      2  -0.973  0.125 -1.22   -0.727 mod.2 NA      
 6      2   0.0312 0.108 -0.180   0.243 mod.3 NA      
 7      3  -0.0387 0.137 -0.306   0.229 mod.1 NA      
 8      3  -1.08   0.125 -1.32   -0.831 mod.2 NA      
 9      3   0.0386 0.108 -0.173   0.250 mod.3 NA      
10      4  -0.0152 0.136 -0.282   0.252 mod.1 NA      
# ℹ 2,990 more rows

The coverage is defined by setting its value to 1 if the confidence interval overlaps the data-generating parameter for sex (i.e., b_sex.conf = -1) and 0 otherwise.

#Calculate coverage
coefs_ex5 <-
  coefs_ex5 %>%
  mutate(coverage = case_when(CI_2.5 > b_sex.conf[2] | CI_97.5 < b_sex.conf[2] ~ 0,
                              CI_2.5 <= b_sex.conf[2] & CI_97.5 >= b_sex.conf[2] ~ 1,
                              .default = NA))
#View the data frame
coefs_ex5

# A tibble: 3,000 × 7
   sim.id estimate    se CI_2.5 CI_97.5 model coverage
    <int>    <dbl> <dbl>  <dbl>   <dbl> <chr>    <dbl>
 1      1  -0.0715 0.134 -0.334   0.191 mod.1        0
 2      1  -1.00   0.123 -1.24   -0.759 mod.2        1
 3      1  -0.0119 0.107 -0.222   0.198 mod.3        0
 4      2   0.0547 0.136 -0.211   0.321 mod.1        0
 5      2  -0.973  0.125 -1.22   -0.727 mod.2        1
 6      2   0.0312 0.108 -0.180   0.243 mod.3        0
 7      3  -0.0387 0.137 -0.306   0.229 mod.1        0
 8      3  -1.08   0.125 -1.32   -0.831 mod.2        1
 9      3   0.0386 0.108 -0.173   0.250 mod.3        0
10      4  -0.0152 0.136 -0.282   0.252 mod.1        0
# ℹ 2,990 more rows

The results are then plotted in Fig. 5.4.

Code

model_names <- c('mod.1' = 'Model.1', 
                 'mod.2' = 'Model.2',
                 'mod.3' = 'Model.3')

ggplot(data = subset(coefs_ex5, sim.id <= 2e2), aes(x = sim.id, y = estimate, ymin = CI_2.5, ymax = CI_97.5, colour = as.factor(coverage))) + 
  geom_hline(yintercept = b_sex.conf[2], alpha = 0.8, linetype = 'dashed', colour = 'blue') + 
  geom_linerange() +
  geom_point(shape = 21, size = 1, fill = 'white', stroke = 0.5) +
  scale_colour_manual('', 
                      values = c('black', '#FF4040'),
                      breaks = c('1','0')) +
  scale_y_continuous('Estimate', 
                     breaks = seq(from = -10, to = 10, by = 0.5),
                     limits = c(-1.75, 1)) +
  scale_x_continuous('Simulation ID') +
  facet_wrap(model~., 
             labeller = labeller(model = model_names),
             nrow = 3,
             scales = 'fixed') +
  theme(legend.position = 'none', 
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA),
        axis.title.y = element_blank())

Fig. 5.4. Estimates of the biological sex coefficient (only the first hundreds of the thousand simulations are shown). The white dots represent the point estimate, and the black lines represent the 95% confidence intervals. In red are highlighted the confidence intervals that do not overlap the parameter used to generate the data (dashed blue line).

Fig. 5.4 shows the first 200 estimates (point estimate and confidence interval) of the coefficient for biological sex for Model.1, Model.2 and Model.3. If all the thousand simulations are considered, the frequency of the coverage of the calculated confidence intervals (i.e., how many times the confidence intervals overlap the data-generating parameter) is 0.0%, 95.7% and 0.0% for Model.1, Model.2 and Model.3, respectively. Since the estimates of the causal effect for Model.1 and Model.3 are biased, the calculated confidence intervals for these models do not have the expected coverage. In fact, confidence intervals only quantify the uncertainty due to random error (i.e., sample variability), not systematic error (i.e., bias). Instead, the estimates from Model.2 are unbiased and the calculated 95% confidence intervals have the expected coverage (i.e., overlap the data-generating parameter 95% of the times).

We can also visualize all the 1,000 estimates of the coefficient for biological sex for Model.1, Model.2 and Model.3 (the estimates were shown as white dots in Fig. 5.4).

Code

ggplot(data = coefs_ex5, aes(x = estimate, y = ifelse(after_stat(count) > 0, after_stat(count), NA))) +
  geom_histogram(binwidth = 0.05, fill = 'white', colour = 'grey50') +
  geom_vline(xintercept = b_sex.conf[2], colour = 'blue', linetype = 'dashed') +
  scale_x_continuous('Estimate', 
                     breaks = seq(from = -10, to = 10, by = 0.25),
                     limits = c(-1.55, 0.55)) +
  facet_grid(model~., 
             labeller = labeller(model = model_names),
             scales = 'fixed') +
  theme(axis.title.y = element_blank(), 
        axis.text.y = element_blank(), 
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA))

Fig. 5.5. Histogram of the 1,000 estimates of the biological sex coefficient for the three models. The blue dashed line represents the parameter used to generate the data.

In Fig. 5.5, the estimated coefficients are visualized through a histogram. Here, we can see that the estimates for Model.1 and Model.3are not clustered around the data-generating parameter (blue dashed line) having a mean estimate of 0.024 (with 0.135 standard deviation) and 0.027 (with 0.104 standard deviation), respectively. In contrast, the estimates for Model.2 having a mean estimate of -0.998 (with 0.122 standard deviation) are centered around the data-generating parameter.

As expected, the inclusion of age \((\text{A})\) in the regression model (i.e., using Model.2) leads to the correct estimate of the total average causal effect, while its exclusion (i.e., using Model.1 and Model.3) leads to bias.

The bias occurs because there is selection bias. Generally, selection bias takes place when the participants selected for a study are not representative of the larger population intended to be analyzed. In our example, participation is not random but influenced by variables that also affect the outcome, specifically, biological sex \((\text{S})\) and age \((\text{A})\). As a result, biological sex \((\text{S})\) and age \((\text{A})\) are associated in the biased sample, while they are not in the population (and consequently the random sample).

To better explain this bias, we visualize the relationship between biological sex \((\text{S})\) and age \((\text{A})\) in the biased and random sample.

Code

truth.data.sim_ex5 <- function(n) {
  b_age.conf = 0.3 #direct causal effect of A on C 1 or 0.2
  b_sex.conf = c(0, -1) #direct causal effect of S on C
  b_noise.conf = 0.7 #direct causal effect of N on C

  A <- rep(18:59, length.out = n)
  S <- c(rep('male', times = 84/2), rep('female', times = 84/2))
  N <- 45
  #Simulate acoustic sensation
  C <- ifelse(S == 'male', 10 + b_sex.conf[1] + b_noise.conf * N + b_age.conf * A, 10 + b_sex.conf[2] + b_noise.conf * N + b_age.conf * A)
  #Return tibble with simulated values
  return(tibble(A, S, N, C))
  }

truth <- truth.data.sim_ex5(84) %>%
                    mutate(sample = 'population')


ex5_plot <- rbind(ex5_sample.random %>%
                    mutate(sample = 'random'),
                  ex5_sample.bias %>%
                    mutate(sample = 'biased'))


#Plot
mean_val <- 
  ex5_plot %>% 
  group_by(S, sample) %>% 
  summarise(A = mean(A)) 

line_val <- 
  ex5_plot %>% 
  group_by(S, sample) %>% 
  summarise(A = mean(A)) %>%
  pivot_wider(names_from = S, values_from = A)

truth_line <-
  truth %>% 
  summarise(A = mean(A))

ex5_plot %>%
  ggplot(aes(x = as.numeric(S), y = A)) +
  geom_jitter(aes(fill = S, colour = S), shape = 19, size = 2, alpha = 0.1, stroke = NA, width = 0.2) + 
  geom_hline(data = truth_line, mapping = aes(yintercept = A , linetype = 'linea'), colour = 'black') +
  geom_segment(data = line_val, aes(x = 1, y = male, xend = 2, yend = female), colour = 'black', linetype = 'solid', linewidth = 0.5) +
  geom_point(data = mean_val, aes(fill = S, shape = '2'), colour = 'black', size = 3, stroke = 0.75) + #23
  scale_colour_manual('',
                      breaks = c('male', 'female', 'black'),
                      values = c('#0072B2', '#D55E00', 'black')) +
  scale_fill_manual('',
                    breaks = c('male', 'female', 'black'),
                    values = c('#0072B2', '#D55E00', NA)) +
  scale_shape_manual('',
                     breaks = c('2'),
                     labels = c('mean age (sample)'),
                     values = c(23)) +
  scale_linetype_manual('',
                        values = c('dashed'),
                        breaks = c('linea'),
                        labels = c('mean age (population)')) +
  scale_x_continuous('Biological sex',
                     breaks = c(1, 2),
                     labels = c('male', 'female')) + 
  scale_y_continuous('Age', 
                     breaks = seq(from = 0, to = 100, by = 10)) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, fill = NA, size = 2), order = 1),
         fill = 'none',
         shape = guide_legend(override.aes = list(alpha = 1, size = 2), order = 2),
         linetype = guide_legend(override.aes = list(alpha = 1, size = 2), order = 3)
         ) +
  facet_grid(.~sample) + 
  theme(legend.position = 'bottom',
        legend.direction = 'horizontal',
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA))

Fig. 5.6. Visualization of the relationship between biological sex and age in the biased and random sample.

Fig. 5.6 shows the relationship between biological sex \((\text{S})\) and age \((\text{A})\) in the biased and random sample. The horizontal dashed black line represents the average age in the population (i.e., 38.50 years) for both males and females. In the random sample, biological sex \((\text{S})\) and age \((\text{A})\) are not associated and the average age for males (i.e., 38.66 years) and females (i.e., 38.85 years) are similar to one another with a difference of 0.20 years. However, due to selection bias, the average age for males and females in the biased sample is clearly different from the population average. Additionally, biological sex \((\text{S})\) and age \((\text{A})\) are associated because females are more likely to be selected than males, and older people are less likely to be selected than younger people. As a result, females in the biased sample are, on average, older than males (average of 28.70 years for females, 25.44 years for males and a difference of 3.26 years).

We can test for a difference between the average age of males and females more formally using bootstrapping. We will do so by using the custom functions my_bootstrap() and summary_bootstrap() (see code below).

Code

#Perform bootstrapping
my_bootstrap <- 
  function(data, variable, group){
    samples <- tapply(X = data[[variable]], INDEX = data[[group]], FUN = function(x) sample(x, length(x), TRUE))
    sapply(samples, mean)
    }

#Summarize the results from the bootstrapping
summary_bootstrap <-
  function(bootstrap){
    data.frame(t(bootstrap)) %>% 
      mutate(diff_f.m = female - male) %>%
      mean_qi(.width = 0.95) %>%
      rownames_to_column() %>%
      select(-c(.width, .point, .interval)) %>%
      pivot_longer(cols = !rowname, 
                   names_to = 'variable',
                   values_to = 'value') %>%
      select(-rowname) %>%
      mutate(type = case_when(grepl('.lower', variable) ~ 'CI.lower',
                              grepl('.upper', variable) ~ 'CI.upper',
                              .default = 'estimate'),
             variable = case_when(grepl('female', variable) ~ 'female',
                                  grepl('male', variable) ~ 'male',
                                  grepl('diff_f.m', variable) ~ 'diff_f.m',
                                  .default = NA)) %>%
      pivot_wider(names_from = 'type',
                  values_from = 'value')
    }

First, we define the number of times to run the my_bootstrap() function. We fixed the number of bootstrap samples equal to 10,000 (i.e., n_bootstrap = 1e4). Subsequently, we run the my_bootstrap() function using as input both the random and biased sample.

#Number of bootstrap samples
n_bootstrap = 1e4
set.seed(2025) #for reproducibility
#Bootstrap for random sample
bootstrap_sample.random <- 
  replicate(n_bootstrap, my_bootstrap(data = ex5_sample.random, 
                                      variable = 'A', 
                                      group = 'S'))
set.seed(2025) #for reproducibility
#Bootstrap for biased sample
bootstrap_sample.bias <- 
  replicate(n_bootstrap, my_bootstrap(data = ex5_sample.bias, 
                                      variable = 'A', 
                                      group = 'S'))

The results from the bootstrapping are summarized using the summary_bootstrap() function.

#summarize bootstrap for random sample
summary_sample.random <-
  summary_bootstrap(bootstrap_sample.random)
#View summary
summary_sample.random

# A tibble: 3 × 4
  variable estimate CI.lower CI.upper
  <chr>       <dbl>    <dbl>    <dbl>
1 male       38.7     38.2     39.1  
2 female     38.9     38.4     39.3  
3 diff_f.m    0.199   -0.476    0.883

#summarize bootstrap for biased sample
summary_sample.bias <- 
  summary_bootstrap(bootstrap_sample.bias)
#View summary
summary_sample.bias

# A tibble: 3 × 4
  variable estimate CI.lower CI.upper
  <chr>       <dbl>    <dbl>    <dbl>
1 male        25.4     25.2     25.7 
2 female      28.7     28.4     29.0 
3 diff_f.m     3.26     2.88     3.65

For the random sample, the age difference between males and females is 0.20 with 95% CI [-0.48, 0.88]. Since the 95% CI includes zero, the average age of females is not statistically significantly different from the average age of males, at the 0.05 level. Instead, for the biased sample, the age difference between males and females is 3.26 with 95% CI [2.88, 3.65]. Since the 95% CI does not include zero, the average age of females is statistically significantly different from the average age of males, at the 0.05 level.

However, once we adjust for age \((\text{A})\) in the regression model (i.e., using Model.2) the bias disappears. We can visualize this in Fig. 5.7

Code

pred_ex5_Model.2 <- cbind(ex5_sample.bias, 
                          predict(ex5_Model.2, interval = 'confidence', level = 0.95))

ex5_sample.bias %>%
  ggplot(aes(x = A, y = C)) +
  geom_point(aes(colour = S), shape = 19, size = 2, alpha = 0.07, stroke = NA) +
  geom_ribbon(data = pred_ex5_Model.2, aes(x = A , y = fit, ymin = lwr, ymax = upr, fill = S), alpha = 0.2, show.legend = NA) +
  geom_line(data = pred_ex5_Model.2, aes(x = A , y = fit, colour = S, linetype = 'sample', linewidth = 'sample')) + 
  geom_line(data = truth, mapping = aes(colour = S, linetype = 'pop', linewidth = 'pop')) +
scale_colour_manual('',
                      breaks = c('male', 'female'),
                      values = c('#0072B2', '#D55E00')) +
  scale_fill_manual('',
                      breaks = c('male', 'female'),
                      values = c('#0072B2', '#D55E00')) +
  scale_linetype_manual('',
                        breaks = c('pop', 'sample'),
                        labels = c('population', 'sample'),
                        values = c('dashed', 'solid')) +
  scale_linewidth_manual('',
                         breaks = c('pop', 'sample'),
                         labels = c('population', 'sample'),
                         values = c(1, 0.5)) +
  scale_x_continuous('Age') + 
  scale_y_continuous('Acoustic sensation', 
                     breaks = seq(from = 0, to = 100, by = 10)) +
  facet_grid(.~S) +
  guides(linetype = guide_legend(override.aes = list(alpha = 1, colour = 'black', fill = NA, linewidth = 0.5))) +
  theme(legend.position = 'bottom',
        legend.direction = 'horizontal',
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA))

Fig. 5.7. Visualization of the relationship between biological sex and acoustic sensation conditioning on age.

In Fig. 5.7, we can observe that within each age level the regression lines for the biased sample for males (i.e., solid blue line) and females (i.e., solid orange line) is very similar to the population lines (i.e., dashed lines). Adjusting for age closed the backdoor path leading to the correct estimate of the total average causal effect.

Bayesian framework

Click to expand

#Fit the linear regression model with S and C (Model 1)
ex5_Model.1 <- stan_glm(formula = C ~ S,
                        family = gaussian(),
                        data = ex5_sample.bias,
                        #Prior coefficients
                        prior = normal(location = c(0), scale = c(2)),
                        #Prior intercept
                        prior_intercept = normal(location = 10, scale = 6),
                        #Prior sigma
                        prior_aux = exponential(rate = 0.5),
                        iter = 4000, warmup = 1000, 
                        save_warmup = TRUE,
                        chains = 4, cores = 4,
                        seed = 2025) #for reproducibility   
#View of the model
ex5_Model.1

stan_glm
 family:       gaussian [identity]
 formula:      C ~ S
 observations: 5000
 predictors:   2
------
            Median MAD_SD
(Intercept) 49.2    0.1  
Sfemale     -0.1    0.1  

Auxiliary parameter(s):
      Median MAD_SD
sigma 4.6    0.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

#Fit the linear regression model with S, A and C (Model 2)
ex5_Model.2 <- stan_glm(formula = C ~ S + A,
                        family = gaussian(),
                        data = ex5_sample.bias,
                        #Prior coefficients
                        prior = normal(location = c(0, 0), scale = c(2, 1)),
                        #Prior intercept
                        prior_intercept = normal(location = 10, scale = 6),
                        #Prior sigma
                        prior_aux = exponential(rate = 0.5),
                        iter = 4000, warmup = 1000, 
                        save_warmup = TRUE,
                        chains = 4, cores = 4,
                        seed = 2025) #for reproducibility
#View of the model
ex5_Model.2

stan_glm
 family:       gaussian [identity]
 formula:      C ~ S + A
 observations: 5000
 predictors:   3
------
            Median MAD_SD
(Intercept) 42.0    0.2  
Sfemale     -1.0    0.1  
A            0.3    0.0  

Auxiliary parameter(s):
      Median MAD_SD
sigma 4.1    0.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

#Fit the linear regression model with S, N and C (Model 3)
ex5_Model.3 <- stan_glm(formula =  C ~ S + N,
                        family = gaussian(),
                        data = ex5_sample.bias,
                        #Prior coefficients
                        prior = normal(location = c(0, 0), scale = c(2, 1.4)),
                        #Prior intercept
                        prior_intercept = normal(location = 10, scale = 6),
                        #Prior sigma
                        prior_aux = exponential(rate = 0.5),
                        iter = 4000, warmup = 1000, 
                        save_warmup = TRUE,
                        chains = 4, cores = 4,
                        seed = 2025) #for reproducibility
#View of the model
ex5_Model.3

stan_glm
 family:       gaussian [identity]
 formula:      C ~ S + N
 observations: 5000
 predictors:   3
------
            Median MAD_SD
(Intercept) 18.2    0.6  
Sfemale      0.0    0.1  
N            0.7    0.0  

Auxiliary parameter(s):
      Median MAD_SD
sigma 3.7    0.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

In Bayesian analysis, there are important diagnostics that have to be carried out in order to assess the convergence and efficiency of the Markov Chains. This is done by using the monitor() function which computes summaries of MCMC (Markov Chain Monte Carlo) draws and monitor convergence. Specifically, we will look at Rhat, Bulk_ESS and Tail_ESS metrics.

#Diagnostics for model 1
monitor(ex5_Model.1$stanfit)

Inference for the input samples (4 chains: each with iter = 4000; warmup = 0):

                    Q5      Q50      Q95     Mean  SD  Rhat Bulk_ESS Tail_ESS
(Intercept)       49.0     49.2     49.4     49.2 0.1     1    11813     8119
Sfemale           -0.3     -0.1      0.2     -0.1 0.1     1    11714     7257
sigma              4.5      4.6      4.7      4.6 0.0     1    11962     8118
mean_PPD          49.0     49.2     49.3     49.2 0.1     1    12216    10407
log-posterior -14737.4 -14734.6 -14733.6 -14735.0 1.2     1     4766     7546

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).

#Diagnostics for model 2
monitor(ex5_Model.2$stanfit)

Inference for the input samples (4 chains: each with iter = 4000; warmup = 0):

                    Q5      Q50      Q95     Mean  SD  Rhat Bulk_ESS Tail_ESS
(Intercept)       41.6     42.0     42.3     42.0 0.2     1    15213     8716
Sfemale           -1.2     -1.0     -0.8     -1.0 0.1     1    13319     7901
A                  0.3      0.3      0.3      0.3 0.0     1    13720     9289
sigma              4.1      4.1      4.2      4.1 0.0     1    15589     8367
mean_PPD          49.0     49.2     49.3     49.2 0.1     1    12840    11249
log-posterior -14206.2 -14203.2 -14201.8 -14203.5 1.4     1     5074     7215

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).

#Diagnostics for model 2
monitor(ex5_Model.3$stanfit)

Inference for the input samples (4 chains: each with iter = 4000; warmup = 0):

                    Q5      Q50      Q95     Mean  SD  Rhat Bulk_ESS Tail_ESS
(Intercept)       17.2     18.2     19.1     18.2 0.6     1    14685     8872
Sfemale           -0.2      0.0      0.2      0.0 0.1     1    15154     8110
N                  0.7      0.7      0.7      0.7 0.0     1    14833     8453
sigma              3.6      3.7      3.7      3.7 0.0     1    14710     8520
mean_PPD          49.0     49.2     49.3     49.2 0.1     1    13218    10150
log-posterior -13619.1 -13616.0 -13614.7 -13616.4 1.4     1     5176     7062

For each parameter, Bulk_ESS and Tail_ESS are crude measures of 
effective sample size for bulk and tail quantities respectively (an ESS > 100 
per chain is considered good), and Rhat is the potential scale reduction 
factor on rank normalized split chains (at convergence, Rhat <= 1.05).

Rhat is a metric used to assess the convergence of Markov Chain Monte Carlo (MCMC) simulations. It helps determine if the MCMC chains have adequately explored the target posterior distribution. Specifically, it compares the between- and within-chain estimates for model parameters: If chains have not mixed well (i.e., the between- and within-chain estimates do not agree), R-hat is larger than 1. A general rule of thumb is to use the sample only if R-hat is less than 1.05; a larger value suggests that the chains have not mixed well, and the results might not be reliable. In our two models, all Rhat are equal to 1 indicating that the chains have mixed well and have adequately explored the target posterior distribution.

Bulk_ESS and Tail_ESS stand for ‘Bulk Effective Sample Size’ and ‘Tail Effective Sample Size,’ respectively. Since MCMC samples are not truly independent (they are correlated), these metrics assess the sampling efficiencies, that is, they help evaluate how efficiently the MCMC sampler is exploring the parameter space.

Bulk_ESS is a useful measure for sampling efficiency in the bulk (center) of the distribution (e.g., efficiency of mean and median estimates);
Tail_ESS is a useful measure for sampling efficiency in the tails of the distribution (e.g., efficiency of variance and tail quantile estimates). A general rule of thumb is that both Bulk-ESS and Tail-ESS should be at least 100 (approximately) per Markov Chain in order to be reliable and indicate that estimates of respective posterior quantiles are reliable. In our two models, all Bulk-ESS and Tail-ESS are well above 400 (i.e., 100 multiplied by 4, the number of chains we used) indicating that estimates of posterior quantiles are reliable.

Since we have established that the posteriors are reliable, we can now explore the model estimates. The estimated coefficients for the two models are then plotted in Fig. 5.8.

Code

#Extract draws from model 1 
post_ex5_Model.1 <-
  ex5_Model.1 %>% 
  spread_draws(Sfemale) %>% #extract draws from the fitted model
  mutate(model = 'Model.1') #add a new column to specify that the model

#Extract draws from model 2
post_ex5_Model.2 <-
  ex5_Model.2 %>% 
  spread_draws(Sfemale) %>% #extract draws from the fitted model
  mutate(model = 'Model.2') #add a new column to specify that the model

#Extract draws from model 2
post_ex5_Model.3 <-
  ex5_Model.3 %>% 
  spread_draws(Sfemale) %>% #extract draws from the fitted model
  mutate(model = 'Model.3') #add a new column to specify that the model

#Combine draws
plot.post <- rbind(post_ex5_Model.1, post_ex5_Model.2, post_ex5_Model.3)

# Plot
plot.post  %>%
  ggplot(aes(y = model, x = Sfemale)) +
  stat_slabinterval(point_interval = 'mean_hdi',
                    .width = c(.95)) +
  geom_vline(xintercept = b_sex.conf[2], alpha = 0.8, linetype = 'dashed', colour = 'blue') + 
  scale_x_continuous('Estimate', 
                     breaks = seq(from = -5, to = 5, by = 0.25),
                     limits = c(-1.55, 0.55)
                     ) +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA),
        axis.title.y = element_blank())

Fig. 5.8. Posterior distribution of the biological sex coefficient for the three models. The black line and dot at the bottom of each distribution represent the highest density interval (HDI) and the mean, respectively.

Fig. 5.8 shows the estimates (mean and 95% HDI) of the coefficient for biological sex for Model.1, Model.2, and Model.3.

For Model.1 we found a negative coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) (mean = -0.072, 95% HDI [-0.337, 0.193]). The estimated causal effect is biased, leading to the wrong conclusion that changing biological sex from male to female causes a decrease in acoustic sensation by -0.072 units, on average.

For Model.2 we found a negative coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) (mean = -0.997, 95% HDI [-1.243, -0.756]). The estimated causal effect is unbiased, leading to the correct conclusion that changing biological sex from male to female causes a decrease in acoustic sensation by -0.997 units, on average.

For Model.3 we found a positive coefficient between biological sex \((\text{S})\) and acoustic sensation \((\text{C})\) (mean = -0.011, 95% HDI [-0.224, 0.195]). The estimated causal effect is biased, leading to the wrong conclusion that changing biological sex from male to female causes a decrease in acoustic sensation by -0.011 units, on average.

Importantly, for both Model.1, Model.2 and Model.3, the 95% HDI is the range of parameter values within which the most credible 95% of the posterior distribution falls. Unlike a frequentist confidence interval, the Bayesian 95% HDI has a direct probabilistic meaning: every point inside the HDI has a higher probability density than any point outside the interval. Therefore, given the model, the prior and the data, we can say that there is a 95% probability that the data-generating parameter (i.e., b_sex.conf = -1) lies within the HDI. However, since Model.1 and Model.3 lead to biased estimates, we will reach the wrong conclusion by stating that there is a 95% probability that the data-generating parameter lies within the [-0.337, 0.193] and [-0.224, 0.195] intervals, for Model.1 and Model.3 respectively. This probability is 0%.

The bias occurs because there is selection bias. Generally, selection bias takes place when the participants selected for a study, are not representative of the larger population intended to be analyzed. In our example, participation is not random but influenced by variables that also affect the outcome, specifically, biological sex \((\text{S})\) and age \((\text{A})\). As a result, biological sex \((\text{S})\) and age \((\text{A})\) are associated in the biased sample, while they are not in the population (and consequently the random sample).

To better explain this bias, we visualize the relationship between biological sex \((\text{S})\) and age \((\text{A})\) in the biased and random sample.

Code

ex5_plot %>%
  ggplot(aes(x = as.numeric(S), y = A)) +
  geom_jitter(aes(fill = S, colour = S), shape = 19, size = 2, alpha = 0.1, stroke = NA, width = 0.2) + 
  geom_hline(data = truth_line, mapping = aes(yintercept = A , linetype = 'linea'), colour = 'black') +
  geom_segment(data = line_val, aes(x = 1, y = male, xend = 2, yend = female), colour = 'black', linetype = 'solid', linewidth = 0.5) +
  geom_point(data = mean_val, aes(fill = S, shape = '2'), colour = 'black', size = 3, stroke = 0.75) + 
  scale_colour_manual('',
                      breaks = c('male', 'female', 'black'),
                      values = c('#0072B2', '#D55E00', 'black')) +
  scale_fill_manual('',
                    breaks = c('male', 'female', 'black'),
                    values = c('#0072B2', '#D55E00', NA)) +
  scale_shape_manual('',
                     breaks = c('2'),
                     labels = c('mean age (sample)'),
                     values = c(23)) +
  scale_linetype_manual('',
                        values = c('dashed'),
                        breaks = c('linea'),
                        labels = c('mean age (population)')) +
  scale_x_continuous('Biological sex',
                     breaks = c(1, 2),
                     labels = c('male', 'female')) + 
  scale_y_continuous('Age', 
                     breaks = seq(from = 0, to = 100, by = 10)) +
  guides(colour = guide_legend(override.aes = list(alpha = 1, fill = NA, size = 2), order = 1),
         fill = 'none',
         shape = guide_legend(override.aes = list(alpha = 1, size = 2), order = 2),
         linetype = guide_legend(override.aes = list(alpha = 1, size = 2), order = 3)
         ) +
  facet_grid(.~sample) + 
  theme(legend.position = 'bottom',
        legend.direction = 'horizontal',
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA))

Fig. 5.9. Visualization of the relationship between biological sex and age in the biased and random sample.

Fig. 5.9 shows the relationship between biological sex \((\text{S})\) and age \((\text{A})\) in the biased and random sample. The horizontal dashed black line represents the average age in the population (i.e., 38.50 years) for both males and females. In the random sample biological sex \((\text{S})\) and age \((\text{A})\) are not associated and the average age for males (i.e., 38.66 years) and females (i.e., 38.85 years) are similar to one another with a difference of 0.20 years. However, due to selection bias, the average age for males and females in the biased sample is clearly different from the population average. Additionally, biological sex \((\text{S})\) and age \((\text{A})\) are associated because females are more likely to be selected than males, and older people are less likely to be selected than younger people. As a result, females in the biased sample are, on average, older than males (average of 28.70 years for females, 25.44 years for males and a difference of 3.26 years).

We can test for a difference between the average age of males and females more formally using Bayesian bootstrapping. We will do so by using the custom functions my_Bayesian.bootstrap() (see code below).

Code

#Perform bootstrapping
my_Bayesian.bootstrap <-
  function(data, n_bootstrap, variable, group, group.level){
    data <- as.data.frame(data)
    bootstrap <- data.frame(c(bayesboot(data = data[ data[[group]] == group.level[1],  variable], R = n_bootstrap, weighted.mean, use.weights = TRUE) %>% pull()),
                            c(bayesboot(data = data[ data[[group]] == group.level[2],  variable], R = n_bootstrap, weighted.mean, use.weights = TRUE) %>% pull())) 
    
    colnames(bootstrap) <- c(group.level[1], group.level[2])
    return(bootstrap)
  }

First, we define the number of times to run the my_Bayesian.bootstrap() function. We fixed the number of bootstrap samples equal to 10,000 (i.e., n_bootstrap = 1e4). Subsequently, we run the my_Bayesian.bootstrap() function using as input both the random and biased sample.

set.seed(2025) #for reproducibility
#Bootstrap for random sample
Bayesian.bootstrap_sample.random <- 
  my_Bayesian.bootstrap(data = ex5_sample.random, 
                        n_bootstrap = 1e4,
                        variable = 'A', 
                        group = 'S',
                        group.level = c('male', 'female'))

set.seed(2025) #for reproducibility
#Bootstrap for biased sample
Bayesian.bootstrap_sample.biased <- 
  my_Bayesian.bootstrap(data = ex5_sample.bias, 
                        n_bootstrap = 1e4,
                        variable = 'A', 
                        group = 'S',
                        group.level = c('male', 'female'))

We can then calculate and visualize the age difference of the posterior sample between females and males (see Fig. 5.10).

Code

#Extract draws from model (random sample)
post_Bayesian.bootstrap_sample.random <-
  Bayesian.bootstrap_sample.random %>% 
  mutate(diff = female - male,
         sample = 'random') #add a new column to specify that the model

#Extract draws from model (biased sample)
post_Bayesian.bootstrap_sample.biased <-
  Bayesian.bootstrap_sample.biased %>% 
    mutate(diff = female - male,
           sample = 'biased') #add a new column to specify that the model

#Combine draws
plot.post.corr <- rbind(post_Bayesian.bootstrap_sample.random, post_Bayesian.bootstrap_sample.biased)

# Plot
plot.post.corr %>%
  ggplot(aes(y = sample, x = diff)) +
  stat_slabinterval(point_interval = 'mean_hdi',
                    .width = c(.95)) +
  geom_vline(xintercept = 0, alpha = 0.8, linetype = 'dashed', colour = 'blue') + 
  scale_x_continuous('Estimate', 
                     breaks = seq(from = -5, to = 5, by = 0.5),
                     ) +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA),
        axis.title.y = element_blank())

Fig. 5.10. Age difference between males and females in the biased and random sample.

For the random sample, the age difference between males and females has a mean of 0.20 with 95% HDI [-0.51, 0.83]. Instead, for the biased sample, the age difference between males and females has a mean of 3.26 with 95% HDI [2.88, 3.64].

However, once we adjust for age \((\text{A})\) in the regression model (i.e., using Model.2) the bias disappears. We can visualize this in Fig. 5.11.

Code

#Expected value of the posterior predictive distribution (epred) for model 2
epred_ex5_Model.2 <- 
  ex5_Model.2 %>%
  epred_draws(newdata = ex5_sample.bias, 
              seed = 2025) %>%
  group_by(A, S) %>%
  mean_hdi(.epred, .width = .95) %>%
  ungroup()

ex5_sample.bias %>%
  ggplot(aes(x = A, y = C)) +
  geom_point(aes(colour = S), shape = 19, size = 2, alpha = 0.07, stroke = NA) +
  geom_ribbon(data = epred_ex5_Model.2, aes(x = A , y = .epred, ymin = .lower, ymax = .upper, fill = S), alpha = 0.2, show.legend = NA) +
  geom_line(data = epred_ex5_Model.2, aes(x = A , y = .epred, colour = S, linetype = 'sample', linewidth = 'sample')) + 
  geom_line(data = truth, mapping = aes(colour = S, linetype = 'pop', linewidth = 'pop')) +

  scale_colour_manual('',
                      breaks = c('male', 'female'),
                      values = c('#0072B2', '#D55E00')) +
  scale_fill_manual('',
                      breaks = c('male', 'female'),
                      values = c('#0072B2', '#D55E00')) +
  scale_linetype_manual('',
                        breaks = c('pop', 'sample'),
                        labels = c('population', 'sample'),
                        values = c('dashed', 'solid')) +
  scale_linewidth_manual('',
                         breaks = c('pop', 'sample'),
                         labels = c('population', 'sample'),
                         values = c(1, 0.5)) +
  scale_x_continuous('Age') + 
  scale_y_continuous('acoustic sensation', 
                     breaks = seq(from = 0, to = 100, by = 10)) +
  facet_grid(.~S) + 
  guides(linetype = guide_legend(override.aes = list(alpha = 1, colour = 'black', fill = NA, linewidth = 0.5))) +
  theme(legend.position = 'bottom',
        legend.direction = 'horizontal',
        panel.grid = element_blank(),
        panel.background = element_blank(),
        panel.border = element_rect(colour = 'black', fill = NA))

Fig. 5.11. Visualization of the relationship between biological sex and acoustic sensation conditioning on age.

In Fig. 5.11, we can observe that within each age level the regression lines for the biased sample for males (i.e., solid blue line) and females (i.e., solid orange line) is very similar to the population lines (i.e., dashed lines). Adjusting for age closed the backdoor path leading to the correct estimate of the total average causal effect.

Session info

Version information about R, the OS and attached or loaded packages.

sessionInfo()

R version 4.2.3 (2023-03-15 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 26200)

Matrix products: default

locale:
[1] LC_COLLATE=English_United Kingdom.utf8 
[2] LC_CTYPE=English_United Kingdom.utf8   
[3] LC_MONETARY=English_United Kingdom.utf8
[4] LC_NUMERIC=C                           
[5] LC_TIME=English_United Kingdom.utf8    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bayesboot_0.2.3     rstan_2.32.7        StanHeaders_2.32.10
 [4] gt_1.1.0            lubridate_1.9.4     forcats_1.0.1      
 [7] stringr_1.6.0       dplyr_1.1.4         purrr_1.2.0        
[10] readr_2.1.6         tidyr_1.3.1         tibble_3.3.0       
[13] ggplot2_4.0.1       tidyverse_2.0.0     ggdist_3.3.3       
[16] tidybayes_3.0.7     rstanarm_2.32.2     Rcpp_1.1.0         
[19] dagitty_0.3-4       ggdag_0.2.13       

loaded via a namespace (and not attached):
  [1] backports_1.5.0       plyr_1.8.9            igraph_2.2.1         
  [4] splines_4.2.3         svUnit_1.0.8          crosstalk_1.2.2      
  [7] rstantools_2.5.0      inline_0.3.21         digest_0.6.38        
 [10] htmltools_0.5.8.1     viridis_0.6.5         magrittr_2.0.4       
 [13] checkmate_2.3.3       memoise_2.0.1         tzdb_0.5.0           
 [16] graphlayouts_1.2.2    RcppParallel_5.1.11-1 matrixStats_1.5.0    
 [19] xts_0.14.1            timechange_0.3.0      colorspace_2.1-2     
 [22] ggrepel_0.9.6         rbibutils_2.4         xfun_0.54            
 [25] jsonlite_2.0.0        lme4_1.1-37           survival_3.8-3       
 [28] zoo_1.8-14            glue_1.8.0            reformulas_0.4.2     
 [31] polyclip_1.10-7       gtable_0.3.6          V8_8.0.1             
 [34] distributional_0.5.0  car_3.1-3             pkgbuild_1.4.8       
 [37] abind_1.4-8           scales_1.4.0          miniUI_0.1.2         
 [40] viridisLite_0.4.2     xtable_1.8-4          Formula_1.2-5        
 [43] stats4_4.2.3          DT_0.34.0             htmlwidgets_1.6.4    
 [46] threejs_0.3.4         arrayhelpers_1.1-0    RColorBrewer_1.1-3   
 [49] posterior_1.6.1       pkgconfig_2.0.3       loo_2.8.0            
 [52] farver_2.1.2          sass_0.4.10           utf8_1.2.6           
 [55] tidyselect_1.2.1      labeling_0.4.3        rlang_1.1.6          
 [58] reshape2_1.4.5        later_1.4.4           tools_4.2.3          
 [61] cachem_1.1.0          cli_3.6.5             generics_0.1.4       
 [64] evaluate_1.0.5        fastmap_1.2.0         yaml_2.3.10          
 [67] knitr_1.50            fs_1.6.6              tidygraph_1.3.1      
 [70] ggraph_2.2.2          nlme_3.1-168          mime_0.13            
 [73] xml2_1.5.0            compiler_4.2.3        bayesplot_1.14.0     
 [76] shinythemes_1.2.0     rstudioapi_0.17.1     curl_7.0.0           
 [79] tweenr_2.0.3          stringi_1.8.7         lattice_0.22-7       
 [82] Matrix_1.6-5          nloptr_2.2.1          markdown_2.0         
 [85] shinyjs_2.1.0         tensorA_0.36.2.1      vctrs_0.6.5          
 [88] pillar_1.11.1         lifecycle_1.0.4       Rdpack_2.6.4         
 [91] httpuv_1.6.16         QuickJSR_1.8.1        R6_2.6.1             
 [94] promises_1.5.0        gridExtra_2.3         codetools_0.2-20     
 [97] boot_1.3-32           colourpicker_1.3.0    MASS_7.3-58.2        
[100] gtools_3.9.5          withr_3.0.2           shinystan_2.6.0      
[103] parallel_4.2.3        hms_1.1.4             grid_4.2.3           
[106] coda_0.19-4.1         minqa_1.2.8           rmarkdown_2.30       
[109] S7_0.2.1              carData_3.0-5         otel_0.2.0           
[112] ggforce_0.5.0         shiny_1.11.1          base64enc_0.1-3      
[115] dygraphs_1.1.1.6