%% Code by Gaurav Kumar Singh
% This code demonstrates how to perform clustering on driving data and
% generate useful predictions. The data used for training is contained in
% the file "driver_1_trip_1_regular.csv". There are 13 columns in the csv
% file. They are
% 1 Driver
% 2 Trip
% 3 Time
% 4 Ax
% 5 Ay
% 6 Az
% 7 PitchRate
% 8 RollRate
% 9 YawRate
% 10 Pitch
% 11 Roll
% 12 Yaw
% 13 ImuTime
clear all
close all
clc
%% Read Data
data=csvread('driver_1_trip_1_regular.csv',1);
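% Note: on recent MATLAB releases csvread is deprecated; an equivalent
% alternative (a sketch, assuming readmatrix detects and skips the text
% header row, which it normally does) is:
% data = readmatrix('driver_1_trip_1_regular.csv');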
%% Put variables of interest in column vectors
% Longitudinal Acceleration
ax=data(:,4);
% Lateral Acceleration
ay=data(:,5);
% Yaw angle (column 12 of the csv), converted from degrees to radians
yaw=data(:,12)*(pi/180);
% Keep the first 25000 points for training and the rest for testing.
% To avoid creating new variables, the training data keeps the names ax,
% ay, yaw, while the validation data is named ax_test, ay_test, yaw_test.
ax_test=ax(25001:end);
ay_test=ay(25001:end);
yaw_test=yaw(25001:end);
ax=ax(1:25000);
ay=ay(1:25000);
yaw=yaw(1:25000);
%% Clustering on yaw and predicting on yaw
% It is possible to cluster on ax and ay and predict on yaw, but for
% demonstration purposes we cluster only on yaw and then predict on yaw.
% Concretely, we take 10 consecutive points of yaw at a time and look at
% what the next 5 points do. The windows overlap: 1:10, 2:11, 3:12, etc.
% each form a 10-dimensional vector, and these vectors are stacked on top
% of each other for clustering. If, say, windows 1:10 and 5:14 fall in the
% same cluster, we assume their follow-up points 11:15 and 15:19 behave
% alike.
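% As a small illustration (not part of the pipeline), windowsize = 3 over
% the vector [1 2 3 4 5] would be stacked as
%   1 2 3
%   2 3 4
%   3 4 5
% i.e. row k holds points k through k+windowsize-1.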
% Number of clusters
num=100;
% Prediction horizon (number of future samples to predict)
horizon=5;
% Number of past points forming each clustering window
windowsize=10;
% createRollingWindow is a helper function (not defined in this file) that
% stacks overlapping windows of length windowsize into the rows of a matrix.
X=createRollingWindow(yaw,windowsize);
% Drop the last 'horizon' windows so that every remaining window has
% 'horizon' future samples available for the prediction step below.
X=X(1:end-horizon,:);
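% If createRollingWindow is not on the path, an equivalent construction (a
% sketch, assuming row k should hold yaw(k:k+windowsize-1)) is:
% n = numel(yaw);
% X = yaw(hankel(1:n-windowsize+1, n-windowsize+1:n));
% X = X(1:end-horizon,:);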
% Time Vector
time=data(:,13);
% Setting Clustering Parameters
opts = statset('Display','final');
%% Performing Clustering
% idx is the cluster assignment: for example, if window 1 falls in cluster
% 4, then idx(1) is 4.
% C holds the cluster centroids; there are num (number of clusters) of them.
[idx,C] = kmeans(X,num,'Distance','cityblock',...
'Replicates',10,'Options',opts);
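% Optional sanity check (not required by the pipeline): inspect how many
% windows landed in each cluster.
% clusterSizes = accumarray(idx,1);   % num-by-1 vector of cluster sizes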
%% Per-time-step probability distribution for each cluster
% Using MATLAB's ecdf function: F{i,p} is a vector of values of the
% empirical CDF evaluated at the points in x{i,p}. F and x are
% num-by-horizon (here 100-by-5) cell arrays; cell (i,p) holds the
% empirical distribution of the p-th future sample over all training
% windows assigned to cluster i.
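% For example, F{4,2} and x{4,2} together describe the empirical CDF of the
% 2nd-step-ahead yaw values for cluster 4, and can be visualised with:
% stairs(x{4,2}, F{4,2})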
F={};
x={};
for i=1:num
    c1_coord=find(idx==i);
    P=zeros(length(c1_coord),horizon);
    for j=1:length(c1_coord)
        % The 'horizon' samples immediately following the windowsize-long window
        P(j,:)=yaw(c1_coord(j)+windowsize:c1_coord(j)+windowsize+horizon-1);
    end
    for p=1:horizon
        [F{i,p},x{i,p}]=ecdf(P(:,p));
    end
end
%% Validation part
% Create the rolling-window matrix for the validation data and trim it the
% same way as the training windows.
X_test=createRollingWindow(yaw_test,windowsize);
X_test=X_test(1:end-horizon,:);
% idx_test is the cluster assignment for the validation windows, defined
% the same way as idx above.
% C_test holds the num centroids of the validation clusters.
[idx_test,C_test] = kmeans(X_test,num,'Distance','cityblock',...
'Replicates',10,'Options',opts);
%% Comparison of clusters
% We divided the training and test data into num clusters each, but
% cluster labels are arbitrary, so we do not know which test cluster
% corresponds to which training cluster. We can compare centroids for
% that: the centroids that are closest together should describe similar
% windows. Compute each training centroid's distance to every test
% centroid and take the column-wise minima of the resulting num-by-num
% matrix.
for i=1:num
    for j=1:num
        compliance(i,j)=norm(C(i,:)-C_test(j,:));
    end
end
% id(j) is the index of the training cluster closest to test cluster j
[x_min,id]=min(compliance);
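% Note: with the Statistics and Machine Learning Toolbox (which kmeans
% already requires), the compliance matrix above can also be computed in
% one line:
% compliance = pdist2(C, C_test);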
%% Performance Evaluation
% For each validation window, check whether each of the next 'horizon'
% samples stays within the range predicted by the matched training cluster.
M_test=[];
M_Z=[];
for i=1:num
    c1_coord=find(idx_test==i);
    Z=zeros(length(c1_coord),horizon);
    P=zeros(length(c1_coord),horizon);
    for j=1:length(c1_coord)
        % The 'horizon' samples immediately following the windowsize-long window
        Z(j,:)=yaw_test(c1_coord(j)+windowsize:c1_coord(j)+windowsize+horizon-1);
        % P is a matrix of logicals: 1 if the sample lies within the range
        % spanned by the matched training cluster's ecdf support, 0 otherwise.
        for p=1:horizon
            P(j,p)=(Z(j,p)<max(x{id(i),p})) & (Z(j,p)>min(x{id(i),p}));
        end
    end
    % Computing the recall term, which measures how wide the predicted
    % ranges are: each range width is normalised by 6.26 (approximately
    % 2*pi, the full span of the yaw angle in radians) and weighted by the
    % fraction of validation windows falling in this cluster.
    for r=1:horizon
        N(i,r)=((max(x{id(i),r})-min(x{id(i),r}))/6.26)*(length(c1_coord)/length(yaw_test));
    end
    M_test=[M_test;P];
    M_Z=[M_Z;Z];
end
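% For instance, a row of M_test equal to [1 1 1 0 1] means the 4th
% step-ahead sample fell outside its predicted range while the other four
% stayed inside it.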
%% Computing accuracy
% M_test is a matrix of logicals with 'horizon' columns. If an entire row
% of M_test is 1 (true), the prediction held for all of the next 'horizon'
% samples, so a row counts as a success only when every one of its columns
% is true. The fraction of such rows gives the accuracy.
ResMat=all(M_test,2);
success=sum(ResMat)/size(M_test,1)
% Recall: 1 minus the total normalised width of all predicted ranges; a
% value closer to 1 means tighter, more informative predictions.
recall=1-sum(sum(abs(N)))
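% Optional reporting (a sketch, not part of the original pipeline):
% fprintf('Accuracy over a %d-step horizon: %.3f, recall: %.3f\n', ...
%     horizon, success, recall);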