Skip to content

Commit f1ca3a8

Browse files
authored
53 export model schema (#54)
* Added: annotations contain descriptions for fields
* Added: exported model schema
1 parent 3055564 commit f1ca3a8

File tree

7 files changed

+1533
-74
lines changed

7 files changed

+1533
-74
lines changed

autoxai4omics/utils/parser/config_model.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,60 @@
1111
from .plotting_model import PlottingModel
1212
from .prediction_model import PredictionModel
1313
from .tabular_model import TabularModel
14-
from pydantic import BaseModel, model_validator
14+
from pydantic import BaseModel, model_validator, Field
1515
from typing import Union
16+
from typing_extensions import Annotated
1617

1718

1819
class ConfigModel(BaseModel):
19-
data: DataModel
20-
ml: MlModel
21-
plotting: PlottingModel = PlottingModel()
22-
tabular: Union[TabularModel, None] = TabularModel()
23-
microbiome: Union[MicrobiomeModel, None] = MicrobiomeModel()
24-
metabolomic: Union[MetabolomicModel, None] = MetabolomicModel()
25-
gene_expression: Union[GeneExpressionModel, None] = GeneExpressionModel(
26-
expression_type="OTHER"
27-
)
28-
prediction: Union[PredictionModel, None] = None
20+
data: Annotated[
21+
DataModel,
22+
Field(
23+
description="A subsection corresponding to the data to be used in this job"
24+
),
25+
]
26+
ml: Annotated[
27+
MlModel,
28+
Field(
29+
description="A subsection corresponding to the machine learning settings to be used in this job"
30+
),
31+
]
32+
plotting: Annotated[
33+
PlottingModel,
34+
Field(
35+
description="A subsection corresponding to the plotting settings to be used in this job"
36+
),
37+
] = PlottingModel()
38+
tabular: Annotated[
39+
Union[TabularModel, None],
40+
Field(
41+
description="A subsection with settings if the data is of tabular type, this field can be None if not."
42+
),
43+
] = TabularModel()
44+
microbiome: Annotated[
45+
Union[MicrobiomeModel, None],
46+
Field(
47+
description="A subsection with settings if the data is of microbiome type, this field can be None if not."
48+
),
49+
] = MicrobiomeModel()
50+
metabolomic: Annotated[
51+
Union[MetabolomicModel, None],
52+
Field(
53+
description="A subsection with settings if the data is of metabolomic type, this field can be None if not."
54+
),
55+
] = MetabolomicModel()
56+
gene_expression: Annotated[
57+
Union[GeneExpressionModel, None],
58+
Field(
59+
description="A subsection with settings if the data is of gene expression type, this field can be None if not."
60+
),
61+
] = GeneExpressionModel(expression_type="OTHER")
62+
prediction: Annotated[
63+
Union[PredictionModel, None],
64+
Field(
65+
description="A subsection containing setting if a prediction job is to be run, this field can be None if not."
66+
),
67+
] = None
2968

3069
@model_validator(mode="after")
3170
def check(self):

autoxai4omics/utils/parser/data_model.py

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,50 @@
44
# https://opensource.org/licenses/MIT
55

66
from typing import Literal, Union
7-
from pydantic import BaseModel, FilePath, DirectoryPath, model_validator
7+
from pydantic import BaseModel, FilePath, DirectoryPath, model_validator, Field
8+
from typing_extensions import Annotated
89

910

1011
class DataModel(BaseModel):
11-
name: str
12-
file_path: FilePath
13-
metadata_file: Union[FilePath, None] = None
14-
file_path_holdout_data: Union[FilePath, None] = None
15-
metadata_file_holdout_data: Union[FilePath, None] = None
16-
save_path: DirectoryPath = "/experiments/"
17-
target: str
18-
data_type: Literal[
19-
"tabular", "gene_expression", "microbiome", "metabolomic", "other", "R2G"
12+
name: Annotated[str, Field(description="The name that is to be given to this job.")]
13+
file_path: Annotated[
14+
FilePath,
15+
Field(description="The path to the data that this job is to be run on."),
16+
]
17+
metadata_file: Annotated[
18+
Union[FilePath, None],
19+
Field(
20+
description="The metadata file that accompanies the data to be trained on."
21+
),
22+
] = None
23+
file_path_holdout_data: Annotated[
24+
Union[FilePath, None],
25+
Field(
26+
description="The path to the dataset that is to be used as a holdout set."
27+
),
28+
] = None
29+
metadata_file_holdout_data: Annotated[
30+
Union[FilePath, None],
31+
Field(
32+
description="The path to the metadata file that accompanies the holdout set"
33+
),
34+
] = None
35+
save_path: Annotated[
36+
DirectoryPath, Field(description="The path where the results shall be save.")
37+
] = "/experiments/"
38+
target: Annotated[
39+
str,
40+
Field(
41+
description="The name of the column in the dataset or in the metadata that is to be predicted."
42+
),
43+
]
44+
data_type: Annotated[
45+
Literal[
46+
"tabular", "gene_expression", "microbiome", "metabolomic", "other", "R2G"
47+
],
48+
Field(
49+
description='The type of the data that this job will be run on. Note - "R2G" means Ready to Go, meaning that no preprocessing is required and that the dataset is already split into train/test sets (denoted by a column called "set") and has labels present in a "label" column.'
50+
),
2051
]
2152

2253
@model_validator(mode="after")

autoxai4omics/utils/parser/featureSelection_model.py

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@
88
# https://opensource.org/licenses/MIT
99

1010
from typing import Union, Literal
11-
from pydantic import (
12-
BaseModel,
13-
PositiveInt,
14-
NonNegativeFloat,
15-
)
11+
from typing_extensions import Annotated
12+
from pydantic import BaseModel, PositiveInt, NonNegativeFloat, Field
1613

1714
from models.model_defs import MODELS
1815
from metrics.metric_defs import METRICS
@@ -26,12 +23,37 @@
2623

2724

2825
class AutoModel(BaseModel):
29-
min_features: PositiveInt = 10
30-
max_features: Union[PositiveInt, None] = None
31-
interval: PositiveInt = 1
32-
eval_model: Union[None, Literal[MODEL_NAMES_ALL]] = None
33-
eval_metric: Union[None, Literal[METRICS_NAMES_ALL]] = None
34-
low: bool = True
26+
min_features: Annotated[
27+
PositiveInt, Field(description="The minimium number of features to consider.")
28+
] = 10
29+
max_features: Annotated[
30+
Union[PositiveInt, None],
31+
Field(
32+
description="The maximum number of features to consider, if None will default to the number of columns in the given dataset."
33+
),
34+
] = None
35+
interval: Annotated[
36+
PositiveInt,
37+
Field(
38+
description="The size of the logarithmic increments to consider when searching for the best number of features."
39+
),
40+
] = 1
41+
eval_model: Annotated[
42+
Union[None, Literal[MODEL_NAMES_ALL]],
43+
Field(description="The estimator to use to evaluate the selected features."),
44+
] = None
45+
eval_metric: Annotated[
46+
Union[None, Literal[METRICS_NAMES_ALL]],
47+
Field(
48+
description="The metric to use to evaluate the model trained on the selected features."
49+
),
50+
] = None
51+
low: Annotated[
52+
bool,
53+
Field(
54+
description="A bool to indicate if the lower the eval_metric the better."
55+
),
56+
] = True
3557

3658
def validateWithProblemType(self, problemType):
3759
if problemType not in [CLASSIFICATION, REGRESSION]:
@@ -72,9 +94,20 @@ def validateWithProblemType(self, problemType):
7294

7395

7496
class MethodModel(BaseModel):
75-
name: Literal[FS_NAMES_MENTHODS] = "SelectKBest"
76-
metric: Union[None, Literal[FS_NAMES_KBMETRICS]] = None
77-
estimator: Union[None, Literal[MODEL_NAMES_ALL]] = None
97+
name: Annotated[
98+
Literal[FS_NAMES_MENTHODS],
99+
Field(description="The feature selection method to use"),
100+
] = "SelectKBest"
101+
metric: Annotated[
102+
Union[None, Literal[FS_NAMES_KBMETRICS]],
103+
Field(
104+
description="The metric to use during the feature selection, if required."
105+
),
106+
] = None
107+
estimator: Annotated[
108+
Union[None, Literal[MODEL_NAMES_ALL]],
109+
Field(description="the model to use during the feature selection if required."),
110+
] = None
78111

79112
def validateWithProblemType(self, problemType):
80113
if problemType not in [CLASSIFICATION, REGRESSION]:
@@ -110,10 +143,28 @@ def validateWithProblemType(self, problemType):
110143

111144

112145
class FeatureSelectionModel(BaseModel):
113-
k: Union[PositiveInt, Literal["auto"]] = "auto"
114-
var_threshold: NonNegativeFloat = 0
115-
auto: Union[None, AutoModel] = AutoModel()
116-
method: Union[None, MethodModel] = MethodModel()
146+
k: Annotated[
147+
Union[PositiveInt, Literal["auto"]],
148+
Field(
149+
description='The number of features to select, if "auto" is chosen it will find the best number of features to use'
150+
),
151+
] = "auto"
152+
var_threshold: Annotated[
153+
NonNegativeFloat,
154+
Field(description="The value to use for variance thresholding."),
155+
] = 0
156+
auto: Annotated[
157+
Union[None, AutoModel],
158+
Field(
159+
description="The setting for configuring the automated feature selection"
160+
),
161+
] = AutoModel()
162+
method: Annotated[
163+
Union[None, MethodModel],
164+
Field(
165+
description="The setting for the method to use for the feature selection."
166+
),
167+
] = MethodModel()
117168

118169
# TODO: do conditional validation
119170

autoxai4omics/utils/parser/ml_model.py

Lines changed: 91 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,32 +9,105 @@
99
from .featureSelection_model import FeatureSelectionModel
1010
from metrics.metric_defs import METRICS
1111
from models.model_defs import MODELS
12-
from pydantic import BaseModel, NonNegativeInt, confloat, model_validator
12+
from pydantic import BaseModel, NonNegativeInt, confloat, model_validator, Field
1313
from typing import Literal, Union, List
14+
from typing_extensions import Annotated
1415

1516
TestSize = confloat(strict=True, le=1, ge=0)
1617
METRICS_NAMES_ALL = tuple(set().union(*METRICS.values()))
1718
MODEL_NAMES_ALL = tuple(set().union(*MODELS.values()))
1819

1920

2021
class MlModel(BaseModel):
21-
seed_num: NonNegativeInt = 29292
22-
test_size: TestSize = 0.2 # type: ignore
23-
problem_type: Literal[CLASSIFICATION, REGRESSION]
24-
hyper_tuning: Literal["random", "grid"] = "random"
25-
hyper_budget: NonNegativeInt = 50
26-
stratify_by_groups: Literal["Y", "N"] = "N"
27-
standardize: bool = True
28-
groups: str = None # need to check
29-
balancing: Literal["OVER", "UNDER", "NONE"] = "NONE"
30-
fit_scorer: Union[None, Literal[METRICS_NAMES_ALL]] = None
31-
scorer_list: Union[None, List[Literal[METRICS_NAMES_ALL]]] = []
32-
model_list: List[Literal[MODEL_NAMES_ALL]]
33-
encoding: Literal["label", "onehot", None] = None
34-
autokeras_config: Union[AutoKerasModel, None] = AutoKerasModel()
35-
autolgbm_config: Union[AutoLgbmModel, None] = AutoLgbmModel()
36-
autoxgboost_config: Union[AutoXgboostModel, None] = AutoXgboostModel()
37-
feature_selection: Union[FeatureSelectionModel, None] = FeatureSelectionModel()
22+
seed_num: Annotated[
23+
NonNegativeInt,
24+
Field(
25+
description="The random set to set for this run, used for making the results reproducible."
26+
),
27+
] = 29292
28+
test_size: Annotated[TestSize, Field(description="The percentage of the data to use for testing")] = 0.2 # type: ignore
29+
problem_type: Annotated[
30+
Literal[CLASSIFICATION, REGRESSION],
31+
Field(description="The problem type that this job shall be attempting."),
32+
]
33+
# TODO: consider making hyper tuning a submodel
34+
# TODO: add None to hyper tunning method
35+
hyper_tuning: Annotated[
36+
Literal["random", "grid"],
37+
Field(description="The hyper_tunning method to use during the job."),
38+
] = "random"
39+
hyper_budget: Annotated[
40+
NonNegativeInt,
41+
Field(
42+
description='The budget to give for hyper tuning, only used if hyper_tuning is "random".'
43+
),
44+
] = 50
45+
# TODO: consider making a stratification /split submodel
46+
# TODO: change below to a boolean
47+
stratify_by_groups: Annotated[
48+
Literal["Y", "N"],
49+
Field(
50+
description="A field to indicate if the test/train dataset should be stratified by a group"
51+
),
52+
] = "N"
53+
groups: Annotated[
54+
str, Field(description="The name of the column to stratify the group by.")
55+
] = None # need to check
56+
# TODO consider making a sub-model for preprocessing
57+
standardize: Annotated[
58+
bool,
59+
Field(description="A bool to indicate if the data should be standardised."),
60+
] = True
61+
balancing: Annotated[
62+
Literal["OVER", "UNDER", "NONE"],
63+
Field(
64+
description="A field to indicate which balancing methodology to use, only relevant for classification problems."
65+
),
66+
] = "NONE"
67+
fit_scorer: Annotated[
68+
Union[None, Literal[METRICS_NAMES_ALL]],
69+
Field(description="Which metric the models should optimis during training."),
70+
] = None
71+
scorer_list: Annotated[
72+
Union[None, List[Literal[METRICS_NAMES_ALL]]],
73+
Field(description="Which metrics should be calculated for evaluation."),
74+
] = []
75+
# TODO: consider adding a None option which will default to all applicable models.
76+
model_list: Annotated[
77+
List[Literal[MODEL_NAMES_ALL]],
78+
Field(description="A list of models to be trained in the job."),
79+
]
80+
# TODO: check what the below actually drive
81+
encoding: Annotated[
82+
Literal["label", "onehot", None],
83+
Field(
84+
description="Which encoding method to use, only relevant in classification problems."
85+
),
86+
] = None
87+
autokeras_config: Annotated[
88+
Union[AutoKerasModel, None],
89+
Field(
90+
description="setting to be used for AutoKeras if it is chosen to be trained. Can be set to None if not selected."
91+
),
92+
] = AutoKerasModel()
93+
autolgbm_config: Annotated[
94+
Union[AutoLgbmModel, None],
95+
Field(
96+
description="settings to be used for AutoLgbm if chosen to be train. Can be set to None if not selected."
97+
),
98+
] = AutoLgbmModel()
99+
autoxgboost_config: Annotated[
100+
Union[AutoXgboostModel, None],
101+
Field(
102+
description="settings to be used for AutoXgboost if chosen to be train. Can be set to None if not selected."
103+
),
104+
] = AutoXgboostModel()
105+
feature_selection: Annotated[
106+
Union[FeatureSelectionModel, None],
107+
Field(
108+
description="Settings to be used for feature selection. If None no feature selection will be done."
109+
),
110+
] = FeatureSelectionModel()
38111

39112
@model_validator(mode="after")
40113
def check(self):

0 commit comments

Comments
 (0)