-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathTADPOLE_SimpleForecastExampleLeaderboard.m
More file actions
365 lines (337 loc) · 15.7 KB
/
TADPOLE_SimpleForecastExampleLeaderboard.m
File metadata and controls
365 lines (337 loc) · 15.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
% TADPOLE_SimpleForecastExampleLeaderboard.m
%
% Example code showing how to construct a forecast in the correct format
% for a leaderboard submission to TADPOLE Challenge 2017.
%
% Uses the LB2 prediction set generated by makeLeaderboardDataset.py (see
% github.com/noxtoby/TADPOLE/evaluation for the code).
%
% The simple forecast example here simply uses a set of predefined
% defaults:
% 1. Likelihoods of each diagnosis (CN, MCI, AD) that depend on the
% subject's most recent clinical status.
% 2. Forecasts of future ADAS13 score and Ventricles volume are just the
% same as the most recent measurement, or filled with defaults where
% the data is missing.
%
% ****** The purpose of the code is not to give a good forecast! ******
%
% It is simply to show how to read in and make sense of the TADPOLE data
% sets and to output a forecast in the right format.
%
% You may wish to use this code as a starting point for generating your own
% leaderboard submission (spreadsheet).
%
%============
% Authors:
% Daniel C. Alexander, Neil P. Oxtoby, and Razvan Valentin-Marinescu
% University College London
% Date:
% 9 August 2017
% Last updated:
% 13 September 2017
%% Read in the TADPOLE data set and extract a few columns of salient information.
% Script requires that TADPOLE_D1_D2.csv is in the parent directory.
% Change this if necessary.
dataLocationD1D2 = '../'; % parent directory
dataLocationLB1LB2 = './';% current directory
tadpoleD1D2File = fullfile(dataLocationD1D2,'TADPOLE_D1_D2.csv');
tadpoleLB1LB2File = fullfile(dataLocationLB1LB2,'TADPOLE_LB1_LB2.csv');
outputFile = 'TADPOLE_Submission_Leaderboard_TeamName1.csv';
errorFlag = 0;
if ~(exist(tadpoleD1D2File, 'file') == 2)
error(sprintf(strcat('File %s does not exist. You need to download\n ', ...
'it from ADNI and/or move it in the right directory'), tadpoleD1D2File))
errorFlag = 1;
end
if errorFlag
exit;
end
% choose whether to display warning messages
verbose = 0;
%* Read in the D1_D2 spreadsheet. This contains all the necessary data, as
%* the TADPOLE_LB1_LB2.csv spreadsheet only contains the LB1 and LB2
%* indicators, aligned to TADPOLE_D1_D2.csv
TADPOLE_Table = readtable(tadpoleD1D2File,'Delimiter','comma','TreatAsEmpty',{''},'HeaderLines',0);
%* Read in the LB1_LB2 spreadsheet
LB_Table = readtable(tadpoleLB1LB2File,'Delimiter','comma','TreatAsEmpty',{''},'HeaderLines',0);
% Depending on your version of MATLAB, you may see a warning about
% datetime not being in the right format, but the read does work.
%* Target variables: check whether numeric and convert if necessary
targetVariables = {'DX','ADAS13','Ventricles'};
variablesToCheck = [{'RID','ICV_bl'},targetVariables]; % also check RosterID and IntraCranialVolume
for kt=1:length(variablesToCheck)
if not(strcmpi('DX',variablesToCheck{kt}))
if iscell(TADPOLE_Table.(variablesToCheck{kt}))
%* Convert strings (cells) to numeric (arrays)
TADPOLE_Table.(variablesToCheck{kt}) = str2double(TADPOLE_Table.(variablesToCheck{kt}));
end
end
end
%* Copy numeric target variables into arrays.
%* We encode missing data as -1 for convenience
% ADAS13 scores
ADAS13_Col = TADPOLE_Table.ADAS13;
ADAS13_Col(isnan(ADAS13_Col)) = -1;
% Ventricles volumes, normalised by intracranial volume
Ventricles_Col = TADPOLE_Table.Ventricles;
Ventricles_Col(isnan(Ventricles_Col)) = -1;
ICV_Col = TADPOLE_Table.ICV_bl;
ICV_Col(Ventricles_Col==-1) = 1;
Ventricles_ICV_Col = Ventricles_Col./ICV_Col;
%* Create an array containing the clinical status (current diagnosis)
%* column from the spreadsheet - DX
DXCHANGE = TADPOLE_Table.DX; % 'NL to MCI', 'MCI to Dementia', etc. = '[previous DX] to [current DX]'
DX = DXCHANGE; % Note: missing data encoded as empty string
%* Convert DXCHANGE to current DX, i.e., the final
for kr=1:length(DXCHANGE)
spaces = strfind(DXCHANGE{kr},' '); % find the spaces in DXCHANGE
if not(isempty(spaces))
DX{kr} = DXCHANGE{kr}((spaces(end)+1):end); % extract current DX
end
end
CLIN_STAT_Col = DX;
%* Copy the subject ID column from the spreadsheet into an array.
RID_Col = TADPOLE_Table.RID;
RID_Col(isnan(RID_Col)) = -1; % missing data encoded as -1
%* Compute months since Jan 2000 for each exam date
EXAMDATE = cell2mat(TADPOLE_Table.EXAMDATE);
ExamMonth_Col = zeros(length(TADPOLE_Table.EXAMDATE),1);
for i=1:length(TADPOLE_Table.EXAMDATE)
ExamMonth_Col(i) = (str2num(TADPOLE_Table.EXAMDATE{i}(1:4))-2000)*12 + str2num(TADPOLE_Table.EXAMDATE{i}(6:7));
end
%* Copy the column specifying membership of LB2 into an array
if iscell(LB_Table.LB2)
LB2_col = str2num(cell2mat(LB_Table.LB2));
else
LB2_col = LB_Table.LB2;
end
%% Generate the very simple forecast
display('Generating forecast ...')
%* Get the list of subjects to forecast from LB2 - the ordering is the
%* same as in the submission template.
lbInds = find(LB2_col);
LB2_SubjList = unique(RID_Col(lbInds));
N_LB2 = length(LB2_SubjList);
% As opposed to the actual submission, we require 84 months of forecast
% data. This is because some ADNI2 subjects in LB4 have visits as long as
% 7 years after their last ADNI1 visit in LB2
%* Create arrays to contain the 84 monthly forecasts for each LB2 subject
nForecasts = 7*12; % forecast 7 years (84 months).
% 1. Clinical status forecasts
% i.e. relative likelihood of NL, MCI, and Dementia (3 numbers)
CLIN_STAT_forecast = zeros(N_LB2, nForecasts, 3);
% 2. ADAS13 forecasts
% (best guess, upper and lower bounds on 50% confidence interval)
ADAS13_forecast = zeros(N_LB2, nForecasts, 3);
% 3. Ventricles volume forecasts
% (best guess, upper and lower bounds on 50% confidence interval)
Ventricles_ICV_forecast = zeros(N_LB2, nForecasts, 3);
%* Our example forecast for each subject is based on the most recent
%* available (not missing) data for each target variable in LB2.
%* Extract most recent data.
most_recent_CLIN_STAT = cell(N_LB2, 1);
most_recent_ADAS13 = zeros(N_LB2, 1);
most_recent_Ventricles_ICV = zeros(N_LB2, 1);
display_info = 0; % Useful for checking and debugging (see below)
%*** Some defaults where data is missing
%* Ventricles
% Missing data = typical volume +/- broad interval = 25000 +/- 20000
Ventricles_typical = 25000;
Ventricles_broad_50pcMargin = 20000; % +/- (broad 50% confidence interval)
% Default CI = 1000
Ventricles_default_50pcMargin = 1000; % +/- (broad 50% confidence interval)
% Convert to Ventricles/ICV via linear regression
lm = fitlm(Ventricles_Col(Ventricles_Col>0),Ventricles_ICV_Col(Ventricles_Col>0));
Ventricles_ICV_typical = predict(lm,Ventricles_typical);
Ventricles_ICV_broad_50pcMargin = abs(predict(lm,Ventricles_broad_50pcMargin) - predict(lm,-Ventricles_broad_50pcMargin))/2;
Ventricles_ICV_default_50pcMargin = abs(predict(lm,Ventricles_default_50pcMargin) - predict(lm,-Ventricles_default_50pcMargin))/2;
%* ADAS13
ADAS13_typical = 12;
ADAS13_typical_lower = ADAS13_typical - 10;
ADAS13_typical_upper = ADAS13_typical + 10;
for i=1:N_LB2 % Each subject in LB2
%* Rows in LB2 corresponding to Subject LB2_SubjList(i)
subj_rows = find(RID_Col==LB2_SubjList(i) & LB2_col);
subj_exam_dates = ExamMonth_Col(subj_rows);
% Non-empty data among these rows
exams_with_CLIN_STAT = ~strcmpi(CLIN_STAT_Col(subj_rows),'');
exams_with_ADAS13 = ADAS13_Col(subj_rows)>0;
exams_with_ventsv = Ventricles_ICV_Col(subj_rows)>0;
%exams_with_allData = exams_with_CLIN_STAT & exams_with_ADAS13 & exams_with_ventsv;
%* Extract most recent non-empty data
% 1. Clinical status
if sum(exams_with_CLIN_STAT)>=1 % Subject has a Clinical status
% Index of most recent visit with a Clinical status
ind = subj_rows( subj_exam_dates(exams_with_CLIN_STAT) == max(subj_exam_dates(exams_with_CLIN_STAT)) );
most_recent_CLIN_STAT{i} = CLIN_STAT_Col(ind(end));
else % Subject has no Clinical statuses in the data set
most_recent_CLIN_STAT{i} = '';
end
% 2. ADAS13 score
if sum(exams_with_ADAS13)>=1 % Subject has an ADAS13 score
% Index of most recent visit with an ADAS13 score
ind = subj_rows( subj_exam_dates(exams_with_ADAS13) == max(subj_exam_dates(exams_with_ADAS13)) );
most_recent_ADAS13(i) = ADAS13_Col(ind(end));
else % Subject has no ADAS13 scores in the data set
most_recent_ADAS13(i) = -1;
end
% 3. Most recent ventricles volume measurement
if sum(exams_with_ventsv)>=1 % Subject has a ventricles volume recorded
% Index of most recent visit with a ventricles volume
ind = subj_rows( subj_exam_dates(exams_with_ventsv) == max(subj_exam_dates(exams_with_ventsv)) );
most_recent_Ventricles_ICV(i) = Ventricles_ICV_Col(ind(end));
else % Subject has no ventricle volume measurement in the data set
most_recent_Ventricles_ICV(i) = -1;
end
%* "Debug mode": prints out some stuff (set display_info=1 above)
if(display_info)
ExamMonth_Col(subj_rows)
CLIN_STAT_Col(subj_rows)
Ventricles_ICV_Col(subj_rows)
ADAS13_Col(subj_rows)
[i most_recent_CLIN_STAT{i} most_recent_ADAS13(i) most_recent_Ventricles_ICV(i)]
end
%*** Construct example forecasts
%* Clinical status forecast: predefined likelihoods per current status
if(strcmp(most_recent_CLIN_STAT{i}, 'NL'))
CNp=0.75; MCIp=0.15; ADp=0.1;
elseif(strcmp(most_recent_CLIN_STAT{i}, 'MCI'))
CNp=0.1; MCIp=0.5; ADp=0.4;
elseif(strcmp(most_recent_CLIN_STAT{i}, 'Dementia'))
CNp=0.1; MCIp=0.1; ADp=0.8;
else
CNp=0.33; MCIp=0.33; ADp=0.34;
if verbose
disp(['Unrecognised status '; most_recent_CLIN_STAT{i}])
end
end
CLIN_STAT_forecast(i,:,1) = CNp;
CLIN_STAT_forecast(i,:,2) = MCIp;
CLIN_STAT_forecast(i,:,3) = ADp;
%* ADAS13 forecast: = most recent score, default confidence interval
if(most_recent_ADAS13(i)>=0)
ADAS13_forecast(i,:,1) = most_recent_ADAS13(i);
ADAS13_forecast(i,:,2) = max([0, most_recent_ADAS13(i)-1]); % Set to zero if best-guess less than 1.
ADAS13_forecast(i,:,3) = most_recent_ADAS13(i)+1;
else % Subject has no history of ADAS13 measurement, so we'll take a
% typical score of 12 with wide confidence interval +/-10.
ADAS13_forecast(i,:,1) = ADAS13_typical;
ADAS13_forecast(i,:,2) = ADAS13_typical_lower;
ADAS13_forecast(i,:,3) = ADAS13_typical_upper;
end
%* Ventricles volume forecast: = most recent measurement, default confidence interval
if(most_recent_Ventricles_ICV(i)>0)
Ventricles_ICV_forecast(i,:,1) = most_recent_Ventricles_ICV(i);
Ventricles_ICV_forecast(i,:,2) = most_recent_Ventricles_ICV(i) - Ventricles_ICV_default_50pcMargin;
Ventricles_ICV_forecast(i,:,3) = most_recent_Ventricles_ICV(i) + Ventricles_ICV_default_50pcMargin;
else % Subject has no imaging history, so we'll take a typical
% ventricles volume of 25000 & wide confidence interval +/-20000
Ventricles_ICV_forecast(i,:,1) = Ventricles_ICV_typical;
Ventricles_ICV_forecast(i,:,2) = Ventricles_ICV_typical - Ventricles_ICV_broad_50pcMargin;
Ventricles_ICV_forecast(i,:,3) = Ventricles_ICV_typical + Ventricles_ICV_broad_50pcMargin;
end
end
% Round to 9 decimal places to match python sample code
Ventricles_ICV_forecast = round(1e9*Ventricles_ICV_forecast)/1e9;
%% Now construct the forecast spreadsheet and output it.
display(sprintf('Constructing the output spreadsheet %s ...', outputFile))
% the start date for the leaderboard should be 1 May 2010.
startDate = datenum('01-May-2010');
submission_table = cell2table(cell(N_LB2*nForecasts,12), ...
'VariableNames', {'RID', 'ForecastMonth', 'ForecastDate', ...
'CNRelativeProbability', 'MCIRelativeProbability', 'ADRelativeProbability', ...
'ADAS13', 'ADAS1350_CILower', 'ADAS1350_CIUpper', ...
'Ventricles_ICV', 'Ventricles_ICV50_CILower', 'Ventricles_ICV50_CIUpper' });
%* Repeated matrices - compare with submission template
submission_table.RID = reshape(repmat(LB2_SubjList, [1, nForecasts])', N_LB2*nForecasts, 1);
submission_table.ForecastMonth = repmat((1:nForecasts)', [N_LB2, 1]);
%* First subject's submission dates
for m=1:nForecasts
submission_table.ForecastDate{m} = datestr(addtodate(startDate, m-1, 'month'), 'yyyy-mm');
end
%* Repeated matrices for submission dates - compare with submission template
submission_table.ForecastDate = repmat(submission_table.ForecastDate(1:nForecasts), [N_LB2, 1]);
%* Pre-fill forecast data, encoding missing data as NaN
nanColumn = nan(size(submission_table.CNRelativeProbability));
submission_table.CNRelativeProbability = nanColumn;
submission_table.MCIRelativeProbability = nanColumn;
submission_table.ADRelativeProbability = nanColumn;
submission_table.ADAS13 = nanColumn;
submission_table.ADAS1350_CILower = nanColumn;
submission_table.ADAS1350_CIUpper = nanColumn;
submission_table.Ventricles_ICV = nanColumn;
submission_table.Ventricles_ICV50_CILower = nanColumn;
submission_table.Ventricles_ICV50_CIUpper = nanColumn;
%*** Paste in month-by-month forecasts **
%* 1. Clinical status
%* a) CN probabilities
col = 4;
t = CLIN_STAT_forecast(:,:,1)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* b) MCI probabilities
col = 5;
t = CLIN_STAT_forecast(:,:,2)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* c) AD probabilities
col = 6;
t = CLIN_STAT_forecast(:,:,3)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* 2. ADAS13 score
col = 7;
t = ADAS13_forecast(:,:,1)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* a) Lower and upper bounds (50% confidence intervals)
col = 8;
t = ADAS13_forecast(:,:,2)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
col = 9;
t = ADAS13_forecast(:,:,3)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* 3. Ventricles volume (normalised by intracranial volume)
col = 10;
t = Ventricles_ICV_forecast(:,:,1)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* a) Lower and upper bounds (50% confidence intervals)
col = 11;
t = Ventricles_ICV_forecast(:,:,2)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
col = 12;
t = Ventricles_ICV_forecast(:,:,3)';
col = submission_table.Properties.VariableNames(col);
submission_table{:,col} = t(:);
%* Convert all numbers to strings - only required for MATLAB
hdr = submission_table.Properties.VariableNames;
for k=1:length(hdr)
if ~iscell(submission_table.(hdr{k}))
if k>9
submission_table.(hdr{k}) = strrep(cellstr(num2str(submission_table{:,hdr{k}},'%.9f')),' ','');
else
submission_table.(hdr{k}) = strrep(cellstr(num2str(submission_table{:,hdr{k}})),' ','');
end
end
end
%* Use column names that match the submission template
columnNames = {'RID', 'Forecast Month', 'Forecast Date',...
'CN relative probability', 'MCI relative probability', 'AD relative probability', ...
'ADAS13', 'ADAS13 50% CI lower', 'ADAS13 50% CI upper', 'Ventricles_ICV', ...
'Ventricles_ICV 50% CI lower', 'Ventricles_ICV 50% CI upper'};
%* Convert table to cell array to write to file, line by line
% This is necessary because of spaces in the column names: writetable()
% doesn't handle this.
tableCell = table2cell(submission_table);
tableCell = [columnNames;tableCell];
%* Write file line-by-line
fid = fopen(outputFile,'w');
for i=1:size(tableCell,1)
fprintf(fid,'%s\n', strjoin(tableCell(i,:), ','));
end
fclose(fid);