9 | 9 | from .featureSelection_model import FeatureSelectionModel |
10 | 10 | from metrics.metric_defs import METRICS |
11 | 11 | from models.model_defs import MODELS |
12 | | -from pydantic import BaseModel, NonNegativeInt, confloat, model_validator |
| 12 | +from pydantic import BaseModel, NonNegativeInt, confloat, model_validator, Field |
13 | 13 | from typing import Literal, Union, List |
| 14 | +from typing_extensions import Annotated |
14 | 15 |
15 | 16 | TestSize = confloat(strict=True, le=1, ge=0) |
16 | 17 | METRICS_NAMES_ALL = tuple(set().union(*METRICS.values())) |
17 | 18 | MODEL_NAMES_ALL = tuple(set().union(*MODELS.values())) |
18 | 19 |
19 | 20 |
20 | 21 | class MlModel(BaseModel): |
21 | | - seed_num: NonNegativeInt = 29292 |
22 | | - test_size: TestSize = 0.2 # type: ignore |
23 | | - problem_type: Literal[CLASSIFICATION, REGRESSION] |
24 | | - hyper_tuning: Literal["random", "grid"] = "random" |
25 | | - hyper_budget: NonNegativeInt = 50 |
26 | | - stratify_by_groups: Literal["Y", "N"] = "N" |
27 | | - standardize: bool = True |
28 | | - groups: str = None # need to check |
29 | | - balancing: Literal["OVER", "UNDER", "NONE"] = "NONE" |
30 | | - fit_scorer: Union[None, Literal[METRICS_NAMES_ALL]] = None |
31 | | - scorer_list: Union[None, List[Literal[METRICS_NAMES_ALL]]] = [] |
32 | | - model_list: List[Literal[MODEL_NAMES_ALL]] |
33 | | - encoding: Literal["label", "onehot", None] = None |
34 | | - autokeras_config: Union[AutoKerasModel, None] = AutoKerasModel() |
35 | | - autolgbm_config: Union[AutoLgbmModel, None] = AutoLgbmModel() |
36 | | - autoxgboost_config: Union[AutoXgboostModel, None] = AutoXgboostModel() |
37 | | - feature_selection: Union[FeatureSelectionModel, None] = FeatureSelectionModel() |
| 22 | + seed_num: Annotated[ |
| 23 | + NonNegativeInt, |
| 24 | + Field( |
| 25 | + description="The random seed to set for this run, used to make the results reproducible."
| 26 | + ), |
| 27 | + ] = 29292 |
| 28 | + test_size: Annotated[TestSize, Field(description="The fraction of the data to use for testing.")] = 0.2 # type: ignore
| 29 | + problem_type: Annotated[ |
| 30 | + Literal[CLASSIFICATION, REGRESSION], |
| 31 | + Field(description="The problem type that this job will attempt."),
| 32 | + ] |
| 33 | + # TODO: consider making hyper tuning a submodel |
| 34 | + # TODO: add None to hyper tuning method
| 35 | + hyper_tuning: Annotated[ |
| 36 | + Literal["random", "grid"], |
| 37 | + Field(description="The hyper-tuning method to use during the job."),
| 38 | + ] = "random" |
| 39 | + hyper_budget: Annotated[ |
| 40 | + NonNegativeInt, |
| 41 | + Field( |
| 42 | + description='The budget to give for hyper tuning, only used if hyper_tuning is "random".' |
| 43 | + ), |
| 44 | + ] = 50 |
| 45 | + # TODO: consider making a stratification /split submodel |
| 46 | + # TODO: change below to a boolean |
| 47 | + stratify_by_groups: Annotated[ |
| 48 | + Literal["Y", "N"], |
| 49 | + Field( |
| 50 | + description="A field to indicate whether the train/test split should be stratified by group."
| 51 | + ), |
| 52 | + ] = "N" |
| 53 | + groups: Annotated[ |
| 54 | + str, Field(description="The name of the column containing the groups to stratify by.")
| 55 | + ] = None # need to check |
| 56 | + # TODO consider making a sub-model for preprocessing |
| 57 | + standardize: Annotated[ |
| 58 | + bool, |
| 59 | + Field(description="A bool to indicate if the data should be standardised."), |
| 60 | + ] = True |
| 61 | + balancing: Annotated[ |
| 62 | + Literal["OVER", "UNDER", "NONE"], |
| 63 | + Field( |
| 64 | + description="A field to indicate which balancing methodology to use, only relevant for classification problems." |
| 65 | + ), |
| 66 | + ] = "NONE" |
| 67 | + fit_scorer: Annotated[ |
| 68 | + Union[None, Literal[METRICS_NAMES_ALL]], |
| 69 | + Field(description="Which metric the models should optimise during training."),
| 70 | + ] = None |
| 71 | + scorer_list: Annotated[ |
| 72 | + Union[None, List[Literal[METRICS_NAMES_ALL]]], |
| 73 | + Field(description="Which metrics should be calculated for evaluation."), |
| 74 | + ] = [] |
| 75 | + # TODO: consider adding a None option which will default to all applicable models. |
| 76 | + model_list: Annotated[ |
| 77 | + List[Literal[MODEL_NAMES_ALL]], |
| 78 | + Field(description="A list of models to be trained in the job."), |
| 79 | + ] |
| 80 | + # TODO: check what the field below actually drives
| 81 | + encoding: Annotated[ |
| 82 | + Literal["label", "onehot", None], |
| 83 | + Field( |
| 84 | + description="Which encoding method to use, only relevant in classification problems." |
| 85 | + ), |
| 86 | + ] = None |
| 87 | + autokeras_config: Annotated[ |
| 88 | + Union[AutoKerasModel, None], |
| 89 | + Field( |
| 90 | + description="Settings to be used for AutoKeras if it is chosen to be trained. Can be set to None if not selected."
| 91 | + ), |
| 92 | + ] = AutoKerasModel() |
| 93 | + autolgbm_config: Annotated[ |
| 94 | + Union[AutoLgbmModel, None], |
| 95 | + Field( |
| 96 | + description="Settings to be used for AutoLgbm if it is chosen to be trained. Can be set to None if not selected."
| 97 | + ), |
| 98 | + ] = AutoLgbmModel() |
| 99 | + autoxgboost_config: Annotated[ |
| 100 | + Union[AutoXgboostModel, None], |
| 101 | + Field( |
| 102 | + description="Settings to be used for AutoXgboost if it is chosen to be trained. Can be set to None if not selected."
| 103 | + ), |
| 104 | + ] = AutoXgboostModel() |
| 105 | + feature_selection: Annotated[ |
| 106 | + Union[FeatureSelectionModel, None], |
| 107 | + Field( |
| 108 | + description="Settings to be used for feature selection. If None, no feature selection will be done."
| 109 | + ), |
| 110 | + ] = FeatureSelectionModel() |
38 | 111 |
39 | 112 | @model_validator(mode="after") |
40 | 113 | def check(self): |
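The `Annotated[..., Field(description=...)]` pattern added above attaches each description to the field's metadata, so it can be read back programmatically and ends up in the generated JSON schema. Below is a minimal, self-contained sketch of that behaviour, assuming Pydantic v2 (which the `model_validator(mode="after")` usage already implies); `DemoConfig` and its two fields are illustrative stand-ins, not part of this repository.

```python
from typing import Literal

from pydantic import BaseModel, Field, NonNegativeInt
from typing_extensions import Annotated


# Hypothetical stand-in for MlModel, using the same Annotated/Field pattern.
class DemoConfig(BaseModel):
    seed_num: Annotated[
        NonNegativeInt,
        Field(description="The random seed to set for this run."),
    ] = 29292
    hyper_tuning: Annotated[
        Literal["random", "grid"],
        Field(description="The hyper-tuning method to use during the job."),
    ] = "random"


# Descriptions are available on the field metadata...
print(DemoConfig.model_fields["seed_num"].description)
# ...and propagate into the generated JSON schema.
print(DemoConfig.model_json_schema()["properties"]["hyper_tuning"]["description"])
```

Any consumer of the model's JSON schema (API docs, config documentation generators) can pick these descriptions up automatically.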