%% Models of the Neuron: Evaluating a statistical model (part 2)
% Homework 6
% Script glm_ks_HW6.m
%
% MATLAB code to fit GLM models of the relation between spiking and the
% rat's position, and to draw a K-S plot, for the second
% Neuroinformatics 2005 GLM problem set.
% load the rat trajectory and spiking data
clear;
load('train.mat');
[ISI,x_at_spiketimes,y_at_spiketimes,Vx,Vy,dir] = generate_new_variables(xN,yN,spikes_binned);
% pick a neuron
spikes_binned = spikes_binned(:,8);
% visualize the raw data
figure(7); clf; hold on;
title('Rat trajectory (blue) with spike locations (red), neuron 8');
plot(xN,yN,x_at_spiketimes{8},y_at_spiketimes{8},'r.');
%% 4. Characterize the goodness-of-fit between data and model
% fit GLM models to the x and y positions. (ADD ALL YOUR MODEL CANDIDATES
% HERE! Two possible models are listed below.)
[b_lin,dev_lin,stats_lin] = glmfit([xN yN xN.^2 yN.^2],spikes_binned,'poisson');
[b_quad,dev_quad,stats_quad] = glmfit([xN.^2 yN.^2],spikes_binned,'poisson');
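% (illustrative addition, not part of the original problem set) a third
% hypothetical candidate with an x*y interaction term; it is fit here but
% not carried through the K-S analysis below
[b_int,dev_int,stats_int] = glmfit([xN yN xN.^2 yN.^2 xN.*yN],spikes_binned,'poisson');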
%******* K-S Plot *******************
% Note that the K-S plot construction for one model is given below; the
% K-S plots for both models are overlaid on the same axes.
% Graph the K-S plot and confidence intervals for the K-S statistic.
% First generate the conditional intensity at each timestep from the GLM
% with the log link function: lambda = exp(b(1) + b(2)*x_1 + ...).
% Adjust the lines below according to your choice of model, using the
% parameter estimates (b) from glmfit together with the covariates you
% used (xN, yN, ...). glmfit adds the constant term beta_0 automatically,
% so it appears here as b(1) rather than as an explicit column of ones.
lambdaEst_lin  = exp(b_lin(1) + b_lin(2)*xN + b_lin(3)*yN + b_lin(4)*xN.^2 + b_lin(5)*yN.^2);
lambdaEst_quad = exp(b_quad(1) + b_quad(2)*xN.^2 + b_quad(3)*yN.^2);
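% (optional sanity check, assuming the Statistics Toolbox function glmval
% is available) evaluate the fitted GLMs directly; these should match the
% hand-coded intensities above to numerical precision
lambdaEst_lin_check  = glmval(b_lin, [xN yN xN.^2 yN.^2], 'log');
lambdaEst_quad_check = glmval(b_quad, [xN.^2 yN.^2], 'log');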
% time-rescaling for the linear-quadratic model: integrate the estimated
% intensity between successive spikes, then map each integral through
% 1 - exp(-z); if the model is correct the results are i.i.d. uniform
timestep = 1;
lambdaInt = 0;
j = 0;
for t = 1:length(spikes_binned)
    lambdaInt = lambdaInt + lambdaEst_lin(t)*timestep;
    if (spikes_binned(t))
        j = j + 1;
        KS_lin(j) = 1 - exp(-lambdaInt);
        lambdaInt = 0;
    end
end
KSSorted_lin = sort(KS_lin);
N_lin = length(KSSorted_lin);
% time-rescaling for the quadratic model (same procedure as above)
timestep = 1;
lambdaInt = 0;
j = 0;
for t = 1:length(spikes_binned)
    lambdaInt = lambdaInt + lambdaEst_quad(t)*timestep;
    if (spikes_binned(t))
        j = j + 1;
        KS_quad(j) = 1 - exp(-lambdaInt);
        lambdaInt = 0;
    end
end
KSSorted_quad = sort(KS_quad);
N_quad = length(KSSorted_quad);
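% (optional) a minimal vectorized sketch of the same rescaling; the names
% below are new to this script, and the result should reproduce the KS_lin
% values computed by the loop above
spikeIdx   = find(spikes_binned > 0);
cumLambda  = cumsum(lambdaEst_lin * timestep);   % running integral of the intensity
intBetween = diff([0; cumLambda(spikeIdx)]);     % integrated intensity between spikes
KS_lin_vec = 1 - exp(-intBetween);               % rescaled inter-spike intervals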
% plot the 45-degree line (a perfect model fit) and the 95% confidence
% bounds at +/- 1.36/sqrt(N); N_lin and N_quad are equal here because both
% models rescale the same spike train
figure(8); clf; hold on;
plot(0:.01:1,0:.01:1, 'g',...
0:.01:1, [0:.01:1]+1.36/sqrt(N_lin),'r',...
0:.01:1,[0:.01:1]-1.36/sqrt(N_lin), 'r')
% overlay the K-S plots for both models
h_lin = plot(([1:N_lin]-.5)/N_lin, KSSorted_lin, 'b',...
'DisplayName', 'Linear-quadratic model');
h_quad = plot(([1:N_quad]-.5)/N_quad, KSSorted_quad, 'k',...
'DisplayName', 'Quadratic model');
axis( [0 1 0 1] );
xlabel('Uniform CDF');
ylabel('Empirical CDF of Rescaled ISIs');
title('KS Plot with 95% Confidence Intervals');
legend([h_lin h_quad], 'Location', 'northwest');
% K-S statistics: maximum deviation of each empirical CDF from the uniform
% CDF (semicolons omitted so the values print to the command window)
ks_stat_lin = max(abs(KSSorted_lin - ([1:N_lin]-.5)/N_lin))
ks_stat_quad = max(abs(KSSorted_quad - ([1:N_quad]-.5)/N_quad))
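% (optional, illustrative) compare each K-S statistic against the
% large-sample 95% critical value 1.36/sqrt(N) used for the bounds above
crit_lin  = 1.36/sqrt(N_lin);
crit_quad = 1.36/sqrt(N_quad);
fprintf('linear-quadratic: KS = %.4f (95%% bound %.4f)\n', ks_stat_lin, crit_lin);
fprintf('quadratic:        KS = %.4f (95%% bound %.4f)\n', ks_stat_quad, crit_quad);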
%%
% The linear-quadratic model does a better job than the purely quadratic
% model of capturing the statistical structure of the data: it has the
% lower K-S statistic (0.1132 vs. 0.2357). The K-S statistic is the
% maximum difference between the empirical cumulative distribution of the
% rescaled inter-spike intervals and the uniform distribution they would
% follow if the model were correct (equivalently, if the rescaled times
% were independent, identically distributed exponential random variables
% before the 1 - exp(-z) transform).
%
% That said, the linear-quadratic model still does not capture all of the
% statistical structure of the data, though it comes close over much of
% the range. This can be seen in the K-S plot, where the curve for the
% linear-quadratic model deviates from the unity (45-degree) line, which
% represents a perfect fit of the model to the data, at multiple
% locations. Only the regions that sit within the 95% confidence bounds
% can be considered well fit.
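% (optional, illustrative) quantify the claim above: the fraction of the
% linear-quadratic K-S curve that lies inside the 95% confidence band
uniformCDF_lin = ([1:N_lin]-.5)/N_lin;
insideFrac_lin = mean(abs(KSSorted_lin - uniformCDF_lin) <= 1.36/sqrt(N_lin))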