-
Notifications
You must be signed in to change notification settings - Fork 107
/
Copy pathresult.py
568 lines (424 loc) · 15.3 KB
/
result.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
import copy
import os
import json
from hpbandster.core.base_iteration import Datum
class Run(object):
	"""
	Not a proper class, more a 'struct' to bundle important
	information about a particular run
	"""
	def __init__(self, config_id, budget, loss, info, time_stamps, error_logs):
		# plain attribute assignments; this type is only a data container
		self.config_id = config_id
		self.budget = budget
		self.loss = loss
		self.info = info
		self.time_stamps = time_stamps
		self.error_logs = error_logs

	def __repr__(self):
		# assemble the human readable summary line by line
		pieces = [
			"config_id: %s\t" % (self.config_id,),
			"budget: %f\t" % self.budget,
			"loss: %s\n" % self.loss,
			"time_stamps: {submitted} (submitted), {started} (started), {finished} (finished)\n".format(**self.time_stamps),
			"info: %s\n" % self.info,
		]
		return "".join(pieces)

	def __getitem__(self, k):
		"""
		in case somebody wants to use it like a dictionary
		"""
		return getattr(self, k)
def extract_HBS_learning_curves(runs):
	"""
	function to get the hyperband learning curves

	This is an example function showing the interface to use the
	HB_result.get_learning_curves method.

	Parameters
	----------
	runs: list of HB_result.run objects
		the performed runs for an unspecified config

	Returns
	-------
	list of learning curves: list of lists of tuples
		An individual learning curve is a list of (t, x_t) tuples.
		This function must return a list of these. One could think
		of cases where one could extract multiple learning curves
		from these runs, e.g. if each run is an independent training
		run of a neural network on the data.
	"""
	# order the runs by ascending budget and keep only those with a valid loss
	by_budget = sorted(runs, key=lambda r: r.budget)
	curve = [(r.budget, r.loss) for r in by_budget if r.loss is not None]
	# a single learning curve per configuration in this simple example
	return [curve]
class json_result_logger(object):
	def __init__(self, directory, overwrite=False):
		"""
		convenience logger for 'semi-live-results'

		Logger that writes job results into two files (configs.json and results.json).
		Both files contain proper json objects in each line.

		This version opens and closes the files for each result.
		This might be very slow if individual runs are fast and the
		filesystem is rather slow (e.g. a NFS).

		Parameters
		----------
		directory: string
			the directory where the two files 'configs.json' and
			'results.json' are stored
		overwrite: bool
			In case the files already exist, this flag controls the
			behavior:
				* True: The existing files will be overwritten. Potential risk of deleting previous results
				* False: A FileExistsError is raised and the files are not modified.

		Raises
		------
		FileExistsError
			if one of the two files already exists and overwrite is False
		"""
		os.makedirs(directory, exist_ok=True)

		self.config_fn = os.path.join(directory, 'configs.json')
		self.results_fn = os.path.join(directory, 'results.json')

		for fn in (self.config_fn, self.results_fn):
			self._create_file(fn, overwrite)

		# ids of all configurations already written to configs.json
		self.config_ids = set()

	@staticmethod
	def _create_file(filename, overwrite):
		"""Create an empty file; truncate it or fail, depending on `overwrite`, if it exists."""
		try:
			# mode 'x' fails atomically if the file already exists
			with open(filename, 'x'):
				pass
		except FileExistsError:
			if overwrite:
				with open(filename, 'w'):
					pass
			else:
				# bug fix: the original always reported the configs file name here,
				# even when the results file was the one that already existed
				raise FileExistsError('The file %s already exists.' % filename)

	def new_config(self, config_id, config, config_info):
		"""Append a new configuration (id, values, info) to configs.json, once per id."""
		if config_id not in self.config_ids:
			self.config_ids.add(config_id)
			with open(self.config_fn, 'a') as fh:
				fh.write(json.dumps([config_id, config, config_info]))
				fh.write('\n')

	def __call__(self, job):
		"""Append a finished job's result to results.json (registering its config first if needed)."""
		if job.id not in self.config_ids:
			# should never happen! TODO: log warning here!
			self.config_ids.add(job.id)
			with open(self.config_fn, 'a') as fh:
				fh.write(json.dumps([job.id, job.kwargs['config'], {}]))
				fh.write('\n')
		with open(self.results_fn, 'a') as fh:
			fh.write(json.dumps([job.id, job.kwargs['budget'], job.timestamps, job.result, job.exception]))
			fh.write("\n")
def logged_results_to_HBS_result(directory):
	"""
	function to import logged 'live-results' and return a HB_result object

	You can load live run results with this function and the returned
	HB_result object gives you access to the results the same way
	a finished run would.

	Parameters
	----------
	directory: str
		the directory containing the results.json and configs.json files

	Returns
	-------
	hpbandster.core.result.Result: :object:
		TODO
	"""
	data = {}
	time_ref = float('inf')
	budget_set = set()

	# every line in configs.json is [config_id, config] or [config_id, config, config_info]
	with open(os.path.join(directory, 'configs.json')) as fh:
		for raw in fh:
			entry = json.loads(raw)

			if len(entry) == 3:
				config_id, config, config_info = entry
			if len(entry) == 2:
				config_id, config = entry
				config_info = 'N/A'

			data[tuple(config_id)] = Datum(config=config, config_info=config_info)

	# attach every logged result to its configuration's Datum
	with open(os.path.join(directory, 'results.json')) as fh:
		for raw in fh:
			config_id, budget, time_stamps, result, exception = json.loads(raw)

			key = tuple(config_id)
			datum = data[key]
			datum.time_stamps[budget] = time_stamps
			datum.results[budget] = result
			datum.exceptions[budget] = exception

			budget_set.add(budget)
			time_ref = min(time_ref, time_stamps['submitted'])

	# infer the hyperband configuration from the data
	budget_list = sorted(budget_set)

	HB_config = {
		'eta': None if len(budget_list) < 2 else budget_list[1] / budget_list[0],
		'min_budget': min(budget_set),
		'max_budget': max(budget_set),
		'budgets': budget_list,
		'max_SH_iter': len(budget_set),
		'time_ref': time_ref,
	}
	return Result([data], HB_config)
class Result(object):
"""
Object returned by the HB_master.run function
This class offers a simple API to access the information from
a Hyperband run.
"""
def __init__ (self, HB_iteration_data, HB_config):
self.data = HB_iteration_data
self.HB_config = HB_config
self._merge_results()
def __getitem__(self, k):
return(self.data[k])
def get_incumbent_id(self):
"""
Find the config_id of the incumbent.
The incumbent here is the configuration with the smallest loss
among all runs on the maximum budget! If no run finishes on the
maximum budget, None is returned!
"""
tmp_list = []
for k,v in self.data.items():
try:
# only things run for the max budget are considered
res = v.results[self.HB_config['max_budget']]
if not res is None:
tmp_list.append((res['loss'], k))
except KeyError as e:
pass
except:
raise
if len(tmp_list) > 0:
return(min(tmp_list)[1])
return(None)
def get_incumbent_trajectory(self, all_budgets=True, bigger_is_better=True, non_decreasing_budget=True):
"""
Returns the best configurations over time
Parameters
----------
all_budgets: bool
If set to true all runs (even those not with the largest budget) can be the incumbent.
Otherwise, only full budget runs are considered
bigger_is_better:bool
flag whether an evaluation on a larger budget is always considered better.
If True, the incumbent might increase for the first evaluations on a bigger budget
non_decreasing_budget: bool
flag whether the budget of a new incumbent should be at least as big as the one for
the current incumbent.
Returns
-------
dict:
dictionary with all the config IDs, the times the runs
finished, their respective budgets, and corresponding losses
"""
all_runs = self.get_all_runs(only_largest_budget = not all_budgets)
if not all_budgets:
all_runs = list(filter(lambda r: r.budget==res.HB_config['max_budget'], all_runs))
all_runs.sort(key=lambda r: r.time_stamps['finished'])
return_dict = { 'config_ids' : [],
'times_finished': [],
'budgets' : [],
'losses' : [],
}
current_incumbent = float('inf')
incumbent_budget = self.HB_config['min_budget']
for r in all_runs:
if r.loss is None: continue
new_incumbent = False
if bigger_is_better and r.budget > incumbent_budget:
new_incumbent = True
if r.loss < current_incumbent:
new_incumbent = True
if non_decreasing_budget and r.budget < incumbent_budget:
new_incumbent = False
if new_incumbent:
current_incumbent = r.loss
incumbent_budget = r.budget
return_dict['config_ids'].append(r.config_id)
return_dict['times_finished'].append(r.time_stamps['finished'])
return_dict['budgets'].append(r.budget)
return_dict['losses'].append(r.loss)
if current_incumbent != r.loss:
r = all_runs[-1]
return_dict['config_ids'].append(return_dict['config_ids'][-1])
return_dict['times_finished'].append(r.time_stamps['finished'])
return_dict['budgets'].append(return_dict['budgets'][-1])
return_dict['losses'].append(return_dict['losses'][-1])
return (return_dict)
def get_runs_by_id(self, config_id):
"""
returns a list of runs for a given config id
The runs are sorted by ascending budget, so '-1' will give
the longest run for this config.
"""
d = self.data[config_id]
runs = []
for b in d.results.keys():
try:
err_logs = d.exceptions.get(b, None)
if d.results[b] is None:
r = Run(config_id, b, None, None , d.time_stamps[b], err_logs)
else:
r = Run(config_id, b, d.results[b]['loss'], d.results[b]['info'] , d.time_stamps[b], err_logs)
runs.append(r)
except:
raise
runs.sort(key=lambda r: r.budget)
return(runs)
def get_learning_curves(self, lc_extractor=extract_HBS_learning_curves, config_ids=None):
"""
extracts all learning curves from all run configurations
Parameters
----------
lc_extractor: callable
a function to return a list of learning_curves.
defaults to hpbanster.HB_result.extract_HP_learning_curves
config_ids: list of valid config ids
if only a subset of the config ids is wanted
Returns
-------
dict
a dictionary with the config_ids as keys and the
learning curves as values
"""
config_ids = self.data.keys() if config_ids is None else config_ids
lc_dict = {}
for id in config_ids:
runs = self.get_runs_by_id(id)
lc_dict[id] = lc_extractor(runs)
return(lc_dict)
def get_all_runs(self, only_largest_budget=False):
"""
returns all runs performed
Parameters
----------
only_largest_budget: boolean
if True, only the largest budget for each configuration
is returned. This makes sense if the runs are continued
across budgets and the info field contains the information
you care about. If False, all runs of a configuration
are returned
"""
all_runs = []
for k in self.data.keys():
runs = self.get_runs_by_id(k)
if len(runs) > 0:
if only_largest_budget:
all_runs.append(runs[-1])
else:
all_runs.extend(runs)
return(all_runs)
def get_id2config_mapping(self):
"""
returns a dict where the keys are the config_ids and the values
are the actual configurations
"""
new_dict = {}
for k, v in self.data.items():
new_dict[k] = {}
new_dict[k]['config'] = copy.deepcopy(v.config)
try:
new_dict[k]['config_info'] = copy.deepcopy(v.config_info)
except:
pass
return(new_dict)
def _merge_results(self):
"""
hidden function to merge the list of results into one
dictionary and 'normalize' the time stamps
"""
new_dict = {}
for it in self.data:
new_dict.update(it)
for k,v in new_dict.items():
for kk, vv in v.time_stamps.items():
for kkk,vvv in vv.items():
new_dict[k].time_stamps[kk][kkk] = vvv - self.HB_config['time_ref']
self.data = new_dict
def num_iterations(self):
return(max([k[0] for k in self.data.keys()]) + 1)
def get_fANOVA_data(self, config_space, budgets=None, loss_fn=lambda r: r.loss, failed_loss=None):
import numpy as np
import ConfigSpace as CS
id2conf = self.get_id2config_mapping()
if budgets is None:
budgets = self.HB_config['budgets']
if len(budgets)>1:
config_space.add_hyperparameter(CS.UniformFloatHyperparameter('budget', min(budgets), max(budgets), log=True))
hp_names = config_space.get_hyperparameter_names()
hps = config_space.get_hyperparameters()
needs_transform = list(map(lambda h: isinstance(h, CS.CategoricalHyperparameter), hps))
all_runs = self.get_all_runs(only_largest_budget=False)
all_runs=list(filter( lambda r: r.budget in budgets, all_runs))
X = []
y = []
for r in all_runs:
if r.loss is None:
if failed_loss is None: continue
else: y.append(failed_loss)
else:
y.append(loss_fn(r))
config = id2conf[r.config_id]['config']
if len(budgets)>1:
config['budget'] = r.budget
config = CS.Configuration(config_space, config)
x = []
for (name, hp, transform) in zip(hp_names, hps, needs_transform):
if transform:
x.append(hp._inverse_transform(config[name]))
else:
x.append(config[name])
X.append(x)
return(np.array(X), np.array(y), config_space)
def get_pandas_dataframe(self, budgets=None, loss_fn=lambda r: r.loss):
import numpy as np
import pandas as pd
id2conf = self.get_id2config_mapping()
df_x = pd.DataFrame()
df_y = pd.DataFrame()
if budgets is None:
budgets = self.HB_config['budgets']
all_runs = self.get_all_runs(only_largest_budget=False)
all_runs=list(filter( lambda r: r.budget in budgets, all_runs))
all_configs = []
all_losses = []
for r in all_runs:
if r.loss is None: continue
config = id2conf[r.config_id]['config'].copy()
if len(budgets)>1:
config['budget'] = r.budget
all_configs.append(config)
all_losses.append({'loss': r.loss})
#df_x = df_x.append(config, ignore_index=True)
#df_y = df_y.append({'loss': r.loss}, ignore_index=True)
df_X = pd.DataFrame(all_configs)
df_y = pd.DataFrame(all_losses)
return(df_X, df_y)
def get_sorted_runs_dataframe(self):
'''
Turns the results of self.get_all_runs() to dataframe to make it more user-friendly.
The dataframe is sorted by loss and budgets (epochs) to make the hyper-parameter value combination with the
smallest loss and budget appear on the top.
Output:
A dataframe where the rows are runs. The first few columns are run_id, budget, and loss. The rest of the columns
are hyperparameters, each with a column.
'''
all_runs_results = self.get_all_runs(only_largest_budget=False)
id_to_hyper_parameter_value_combination_dictionary = bohb_result.get_id2config_mapping()
id_list = []
budget_list = []
loss_list = []
hyper_parameter_name_to_value_dictionary_list = [] # store hyperparemter value combination of each run
for i in range(len(all_runs_results)):
current_run = all_runs_results[i]
id_of_the_current_run = current_run.config_id
budget_of_the_current_run = current_run.budget
loss_of_the_current_run = current_run.loss
id_list.append(id_of_the_current_run)
budget_list.append(budget_of_the_current_run)
loss_list.append(loss_of_the_current_run)
hyper_parameter_name_to_value_dictionary = \
id_to_hyper_parameter_value_combination_dictionary[id_of_the_current_run]['config']
hyper_parameter_name_to_value_dictionary_list.append(hyper_parameter_name_to_value_dictionary)
id_budget_loss_dataframe = pd.DataFrame({'id': id_list, 'budget': budget_list,
'loss': loss_list}) # the dataframe for info associated with the hyperparemter value combination
hyper_parameter_value_combination_dataframe = pd.DataFrame(
hyper_parameter_name_to_value_dictionary_list) # the dataframe for hyperparameter value combination
complete_result_dataframe = pd.concat([id_budget_loss_dataframe, hyper_parameter_value_combination_dataframe],
axis=1).sort_values(['loss', 'budget'])
return complete_result_dataframe