-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparams.yaml
More file actions
91 lines (91 loc) · 2.19 KB
/
params.yaml
File metadata and controls
91 lines (91 loc) · 2.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# -- General params --
# Data locations (plain strings; quoted so no parser can retype them).
raw_data_folder: 'raw/20250403/'
outcome_data_folder: 'raw/outcome_stage_1-5/'
repo_data_folder: './data/'
# Dataset identifier.
dataset: 'LC'
# Integer seed value — presumably shared by every randomized step; confirm
# against the consuming code.
random_state: 42
# -- Master feature generation params --
# Settings for building the "master" feature table: where the result is
# written and which columns are treated as numeric vs. categorical.
master_params:
  output_folder: features/master/
  features:
    # Columns listed as numeric.
    numeric:
      - Age_at_Diagnosis
    # Columns listed as categorical.
    categorical:
      - GENDER
      - RACE
      - PATIENT_STATE_CODE
      - PATIENT_REGION
      - STAGE_AT_DIAGNOSIS
      - CANCER_TYPE
      - STAGE
# -- Feature generation params --
# One `<table>_params` entry per source table. Each entry's `apply_func`
# names the function used to process that table's features, together with
# the function's own parameters; `null` means no function is configured.
feature_params:
  # -- diag --
  diag_params:
    apply_func:  # Function and its params to apply to process features
      # - Initial approach: counting ICD codes (disabled)
      # diag_count_featurization:
      #   map_file: ./src/DX_CD_to_broad_icd10.yaml
      #   mapped_column_name: broad_ICD10
      #   col_to_count: broad_ICD10
      # - LLM embeddings approach (active)
      diag_medical_embeddings:
        col_to_embed: DESCRIPTION
        model_name: dmis-lab/biobert-base-cased-v1.2
        max_length: 128
        batch_size: 16
  # -- procdr --
  procdr_params:
    # Function and its params to apply to process features.
    # To enable the counting approach, replace `null` with the commented
    # mapping below.
    apply_func: null
    #   procdr_count:
    #     col_to_count: ID
  # -- drug --
  drug_params:
    apply_func: null
  # -- moltest --
  moltest_params:
    apply_func: null
  # -- test --
  test_params:
    apply_func: null
# -- Train parametrization --
# Cross-validation sizes, feature selection, per-table input files and
# transformers, and the model to fit.
train_params:
  n_repeats: 33
  n_splits: 3
  kbest: 100
  # Here we define the file we will use for each table
  # (`null` = no file selected for that table).
  features_to_use:
    master: complete
    diag: care
    procdr: null
    drug: null
    moltest: null
    test: null
  # Define the transformers.py transformers to use for each file.
  # NOTE(review): `procdr` has a transformer below but no file selected in
  # `features_to_use` — confirm the consumer skips it.
  transformers_to_use:
    master:
      num:  # Apply the 'num' transformer to these columns
        - Age_at_Diagnosis
      cat:  # Apply the 'cat' transformer to these columns
        - GENDER
        - RACE
        - PATIENT_REGION
        - PATIENT_STATE_CODE
        - STAGE_AT_DIAGNOSIS
        - CANCER_TYPE
        - STAGE
    diag: diag
    procdr: procdr
  model:
    # -- Logistic Regression CV (active)
    name: lrcv
    params:
      max_iter: 1000
      # NOTE(review): scikit-learn's built-in scorer string is
      # 'matthews_corrcoef'; presumably 'matthews_score' is resolved by
      # project code — confirm.
      scoring: matthews_score
      class_weight: balanced
    # -- EBM (disabled; swap with the `name`/`params` above to use it)
    # name: ebm
    # params:
    #   n_jobs: 50
# -- Predict params --
# Name of the metric used to pick the best model at prediction time.
predict_params:
  metric_for_best_model: test_matthews_score