-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
123 lines (114 loc) · 4.1 KB
/
main.py
File metadata and controls
123 lines (114 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : qichun tang
# @Date : 2021-04-22
# @Contact : qichun.tang@bupt.edu.cn
'''
实验脚本,实验步骤如下:
0. 从DATAPATH加载测试数据(bz2 压缩的 pickle)
1. 根据HDL构造一个空间,然后用网格搜索的方法遍历整个空间,得到4万个配置
2. 根据参数,对这组配置进行切分
3. 对切分后的子配置进行遍历
3.1 计算配置的hash值,作为主键
(如果主键存在,跳过)
3.2 5折交叉,并记录每折的各种metrics
3.3 对metrics求平均
3.4 整理好数据,上传数据库
'''
import os
from collections import defaultdict
from pprint import pprint
from time import time
import numpy as np
import peewee as pw
from joblib import load
from playhouse.postgres_ext import JSONField
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from pipeline_space.automl_pipeline.construct_pipeline import construct_pipeline
from pipeline_space.build_ml_pipeline_space import get_all_configs
from pipeline_space.metrics import calculate_score, f1
from pipeline_space.utils import get_chunks, get_hash_of_str
# psql -U postgres
# NOTE(review): connection credentials (including the password) were
# hard-coded in source, which is a security risk. They are kept as defaults
# for backward compatibility, but can now be overridden via environment
# variables (DB_NAME / DB_HOST / DB_USER / DB_PASSWORD) without code changes.
db = pw.PostgresqlDatabase(
    database=os.environ.get("DB_NAME", "ml_pipeline_experiment"),
    host=os.environ.get("DB_HOST", "123.56.90.56"),
    user=os.environ.get("DB_USER", "postgres"),
    password=os.environ.get("DB_PASSWORD", "xenon")
)
class Trial(pw.Model):
    # One row per evaluated pipeline configuration.
    # Primary key: hash of str(config) (see get_hash_of_str in the main loop),
    # which makes runs idempotent across restarts/workers.
    config_id = pw.CharField(primary_key=True)
    # Wall-clock seconds spent evaluating the config; NULL until finished.
    cost_time = pw.FloatField(null=True)
    # Stringified exception when the evaluation failed; NULL on success.
    failed_info = pw.TextField(null=True)
    # Mapping metric name -> mean score over the K folds; NULL on failure.
    all_score = JSONField(null=True)
    # The full pipeline configuration (JSON-serializable dict).
    config = JSONField(null=True)
    class Meta:
        database = db
        # Table name is injected per experiment via the environment.
        table_name = os.environ['TABLE_NAME']
# All experiment parameters are injected through environment variables.
_env = os.environ
SPLITS = int(_env['SPLITS'])        # number of chunks the config space is split into
INDEX = int(_env['INDEX'])          # which chunk this worker processes
KFOLD = int(_env['KFOLD'])          # cross-validation fold count
DATAPATH = _env['DATAPATH']         # bz2-compressed pickle with the dataset
# Optional: evaluate exactly one configuration, looked up by its hash,
# e.g. c478a1b5bde6f36883bc429f39a66b41
CONFIG_ID = _env.get('CONFIG_ID')
# Idempotent table creation (no-op when the table already exists).
Trial.create_table(safe=True)
all_configs, config_id_to_config = get_all_configs()
if CONFIG_ID is not None:
    # Single-config debug mode: look the configuration up by its hash.
    sub_configs = [config_id_to_config[CONFIG_ID][0]]
else:
    # Deterministically shuffle the whole space, then take this worker's chunk.
    np.random.seed(0)
    np.random.shuffle(all_configs)
    print("all_configs[0]")
    print(all_configs[0])
    print("all_configs[-1]")
    print(all_configs[-1])
    N = len(all_configs)
    split_configs = get_chunks(all_configs, SPLITS)
    sub_configs = split_configs[INDEX]
    pprint(sub_configs[-1])
# Load the benchmark dataset (bz2-compressed pickle: features, labels, and a
# boolean mask marking categorical columns).
X, y, cat = load(DATAPATH)
X = X.values  # DataFrame -> ndarray
cat_mask = np.array(cat)
X = X[:, ~cat_mask]  # drop categorical columns, keep numeric ones
y = LabelEncoder().fit_transform(y)
cv = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=0)
# Print the first fold's training indices as a reproducibility sanity check.
print(next(cv.split(X, y))[0])
# ---- Main evaluation loop: one Trial row per configuration ----------------
for config in tqdm(sub_configs):
    # The hash of the config string is the primary key, so re-runs are
    # idempotent and several workers can share one table.
    config_id = get_hash_of_str(str(config))
    print(config)
    all_scores_list = defaultdict(list)
    start_time = time()
    # Claim this config by inserting its key first; skip if already claimed.
    # (.count() avoids materializing rows, unlike len(list(...dicts())).)
    if Trial.select(Trial.config_id).where(Trial.config_id == config_id).count() == 0:
        Trial.create(config_id=config_id)
    else:
        print(f'{config_id} exists, continue')
        continue
    try:
        for train_ix, test_ix in cv.split(X, y):
            X_train = X[train_ix, :]
            y_train = y[train_ix]
            X_test = X[test_ix, :]
            y_test = y[test_ix]
            pipeline = construct_pipeline(config, verbose=True, n_jobs=8)
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict_proba(X_test)
            # Compute all classification metrics for this fold.
            all_scores = calculate_score(y_test, y_pred, "classification", f1, True)[1]
            for metric_name, score in all_scores.items():
                all_scores_list[metric_name].append(score)
        # Average each metric over the K folds.
        all_scores_mean = {
            metric_name: float(np.mean(scores))
            for metric_name, scores in all_scores_list.items()
        }
        failed_info = None
    except Exception as e:
        # Record the failure on the row instead of aborting the whole sweep.
        failed_info = str(e)
        all_scores_mean = None
    cost_time = time() - start_time  # may be skewed by warm caches
    # BUGFIX: the original unconditionally did all_scores_mean['accuracy'],
    # which raises TypeError when the evaluation failed (all_scores_mean is
    # None), crashing the loop before the failure was ever persisted.
    if all_scores_mean is not None:
        print('accuracy', all_scores_mean.get('accuracy'))
    else:
        print(f'{config_id} failed: {failed_info}')
    # Persist results onto the row created above.
    Trial.update(
        cost_time=cost_time,
        failed_info=failed_info,
        all_score=all_scores_mean,
        config=config
    ).where(Trial.config_id == config_id).execute()
    print()