-
Notifications
You must be signed in to change notification settings - Fork 0
/
Item Response Probit.Rmd
258 lines (199 loc) · 5.94 KB
/
Item Response Probit.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
---
title: "Item Response Probit Model"
subtitle: Summer 2024
output:
pdf_document:
extra_dependencies: amsmath
word_document: default
header-includes: \usepackage{amsmath}
---
This report writes about the DIHOPIT model with the random effect deleted. It is also called the item response probit model in psychometric. I plan to write down the current information and try to integrate the psychometric and linear regression terms. This approach may help us further investigate how we can use this model to solve the urban-rural bias that exists in the current PCA approach.
### Mathematical Formula
The DIHOPIT model without random effect can be written as
$$
\begin{aligned}
& y_i^*=X_i^{\prime} \beta+\varepsilon_i \quad i=1, \ldots, N \\
& \varepsilon_i \sim N(0,1)
\end{aligned}
$$
The observation mechanism for each indicator variable $a=1, \ldots, A$:
$$
\begin{aligned}
y_i^a=0 & \text { if } & -\infty<y_i^* \leq \tau^a \\
y_i^a=1 & \text { if } & \tau^a<y_i^* \leq+ \infty
\end{aligned}
$$
where $\tau^a$ is the threshold for the $a^{th}$ indicator variable.
$$
\begin{aligned}
P(y_i^a = 1 | X_i) &= P(X_i^{\prime} \beta + \varepsilon_i > \tau^a) \\
&= P(\varepsilon_i > \tau^a - X_i^{\prime} \beta) \\
&= 1 - P(\varepsilon_i \leq \tau^a - X_i^{\prime} \beta) \\
&= 1 - \Phi(\tau^a - X_i^{\prime} \beta) \\
&= \Phi(X_i^{\prime} \beta - \tau^a)
\end{aligned}
$$
The same model has another form of expression in terms of
$$
\begin{aligned}
P(y_i^a = 1 | \theta_i, a_a, b_a) &= \Phi(a_a(\theta_i - b_a)) \quad a=1, \ldots, A \\
\theta_i &\sim N(0,1) \quad i=1, \ldots, N
\end{aligned}
$$
where
$\theta_i$ is the latent trait (ability) of respondent $i$
$y_i^a$ is the response of individual $i$ to item $a$
$\Phi(\cdot)$ is the standard normal cumulative distribution function (probit function)
$a_a$ is the discrimination parameter for item $a$
$b_a$ is the difficulty parameter for item $a$
### Simulation Result
```{r}
# Set seed for reproducibility
set.seed(123)
# Step 1: Set up parameters
N <- 1000 # number of respondents
P <- 3 # number of predictor variables
A <- 5 # number of items/indicators
# Arbitrarily set beta and tau
beta <- c(0.8, -0.5, 1.2)
tau <- c(-1.5, -0.5, 0, 0.5, 1.5)
# Step 2: Generate latent variables
X <- matrix(rnorm(N*P), nrow=N, ncol=P)
colnames(X) <- paste0("X", 1:P)
epsilon <- rnorm(N)
y_star <- X %*% beta + epsilon
# Step 3: Check if y_star passes each threshold tau
Y <- matrix(0, nrow=N, ncol=A)
colnames(Y) <- paste0("Y", 1:A)
for(a in 1:A) {
Y[,a] <- as.integer(y_star > tau[a])
}
# Create a data frame
df <- data.frame(
id = 1:N,
X,
y_star = y_star,
Y
)
# Print first few rows of the data frame
cat("\nFirst 5 rows of the data frame:\n")
print(head(df, 5))
```
### Simulation by Rstan
```{r message=TRUE, warning=TRUE}
library(rstan)
# Stan model
stan_model <- "
data {
int<lower=1> N; // number of observations
int<lower=1> K; // number of predictors
int<lower=1> M; // number of items
matrix[N, K] X; // predictor matrix
int<lower=0, upper=1> Y[N, M]; // binary response matrix
}
parameters {
vector[K] beta; // coefficients
vector[M] tau; // thresholds
}
model {
// Priors
beta ~ normal(0, 5);
tau ~ normal(0, 5);
// Likelihood
for (j in 1:M) {
Y[,j] ~ bernoulli(Phi(X * beta - tau[j]));
}
}
"
# Compile the model
stan_model_compiled <- stan_model(model_code = stan_model)
# Prepare data
X <- as.matrix(df[, c("X1", "X2", "X3")])
Y <- as.matrix(df[, paste0("Y", 1:A)])
stan_data <- list(
N = nrow(X),
K = ncol(X),
M = ncol(Y),
X = X,
Y = Y
)
# Fit the model
fit <- sampling(stan_model_compiled, data = stan_data, iter = 2000, chains = 4)
```
```{r}
# Extract results
posterior <- extract(fit)
estimated_beta <- colMeans(posterior$beta)
estimated_tau <- colMeans(posterior$tau)
# Calculate standard errors
se_beta <- apply(posterior$beta, 2, sd)
se_tau <- apply(posterior$tau, 2, sd)
cat(
"Estimated beta coefficients (shared across all items):\n",
paste(round(estimated_beta, 4), collapse = ", "), "\n\n",
"Standard errors for beta:\n",
paste(round(se_beta, 4), collapse = ", "), "\n\n",
"Estimated thresholds (tau) for each item:\n",
paste(round(estimated_tau, 4), collapse = ", "), "\n\n",
"Standard errors for tau:\n",
paste(round(se_tau, 4), collapse = ", "), "\n\n",
"True beta coefficients:\n",
paste(round(beta, 4), collapse = ", "), "\n\n",
"True thresholds:\n",
paste(round(tau, 4), collapse = ", "), "\n",
sep = ""
)
```
### Simulation by Maximum Likelihood Estimation
$$
\begin{aligned}
L(\boldsymbol{\beta}, \boldsymbol{\tau} | \mathbf{X}, \mathbf{Y}) &= \sum_{j=1}^M \sum_{i=1}^N \left[ Y_{ij} \log(p_{ij}) + (1 - Y_{ij}) \log(1 - p_{ij}) \right] \\
\text{ where } p_{ij} &= \Phi(\mathbf{X}_i \boldsymbol{\beta} - \tau_j)
\end{aligned}
$$
```{r}
# Prepare data
X <- as.matrix(df[, c("X1", "X2", "X3")])
Y <- as.matrix(df[, paste0("Y", 1:A)])
# Log-likelihood function
log_likelihood <- function(params, X, Y) {
beta <- params[1:3]
tau <- params[4:8]
ll <- 0
for (j in 1:ncol(Y)) {
p <- pnorm(X %*% beta - tau[j])
ll <- ll + sum(dbinom(Y[,j], size = 1, prob = p, log = TRUE))
}
return(-ll) # Return negative log-likelihood for minimization
}
# Initial parameter values
initial_params <- c(rep(0, 3), rep(0, 5)) # 3 betas and 5 taus
# Optimize using optim
fit <- optim(par = initial_params,
fn = log_likelihood,
X = X,
Y = Y,
method = "BFGS",
hessian = TRUE)
# Extract results
beta_estimates <- fit$par[1:3]
tau_estimates <- fit$par[4:8]
# Calculate standard errors
se <- sqrt(diag(solve(fit$hessian)))
beta_se <- se[1:3]
tau_se <- se[4:8]
# Print results
cat("Beta estimates:\n")
print(beta_estimates)
cat("\nBeta standard errors:\n")
print(beta_se)
cat("\nTau estimates:\n")
print(tau_estimates)
cat("\nTau standard errors:\n")
print(tau_se)
# Compare with true values
cat("\nTrue beta coefficients:\n")
print(beta)
cat("\nTrue thresholds:\n")
print(tau)
```