Skip to content

Commit f1ca3a8

Browse files
authored
53 export model schema (#54)
* Added: annotations contain descriptions for fields
* Added: exported model schema
1 parent 3055564 commit f1ca3a8

File tree

7 files changed

+1533
-74
lines changed

7 files changed

+1533
-74
lines changed

autoxai4omics/utils/parser/config_model.py

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,21 +11,60 @@
1111
from .plotting_model import PlottingModel
1212
from .prediction_model import PredictionModel
1313
from .tabular_model import TabularModel
14-
from pydantic import BaseModel, model_validator
14+
from pydantic import BaseModel, model_validator, Field
1515
from typing import Union
16+
from typing_extensions import Annotated
1617

1718

1819
class ConfigModel(BaseModel):
19-
data: DataModel
20-
ml: MlModel
21-
plotting: PlottingModel = PlottingModel()
22-
tabular: Union[TabularModel, None] = TabularModel()
23-
microbiome: Union[MicrobiomeModel, None] = MicrobiomeModel()
24-
metabolomic: Union[MetabolomicModel, None] = MetabolomicModel()
25-
gene_expression: Union[GeneExpressionModel, None] = GeneExpressionModel(
26-
expression_type="OTHER"
27-
)
28-
prediction: Union[PredictionModel, None] = None
20+
data: Annotated[
21+
DataModel,
22+
Field(
23+
description="A subsection corresponding to the data to be used in this job"
24+
),
25+
]
26+
ml: Annotated[
27+
MlModel,
28+
Field(
29+
description="A subsection corresponding to the machine learning settings to be used in this job"
30+
),
31+
]
32+
plotting: Annotated[
33+
PlottingModel,
34+
Field(
35+
description="A subsection corresponding to the plotting settings to be used in this job"
36+
),
37+
] = PlottingModel()
38+
tabular: Annotated[
39+
Union[TabularModel, None],
40+
Field(
41+
description="A subsection with settings if the data is of tabular type, this field can be None if not."
42+
),
43+
] = TabularModel()
44+
microbiome: Annotated[
45+
Union[MicrobiomeModel, None],
46+
Field(
47+
description="A subsection with settings if the data is of microbiome type, this field can be None if not."
48+
),
49+
] = MicrobiomeModel()
50+
metabolomic: Annotated[
51+
Union[MetabolomicModel, None],
52+
Field(
53+
description="A subsection with settings if the data is of metabolomic type, this field can be None if not."
54+
),
55+
] = MetabolomicModel()
56+
gene_expression: Annotated[
57+
Union[GeneExpressionModel, None],
58+
Field(
59+
description="A subsection with settings if the data is of gene expression type, this field can be None if not."
60+
),
61+
] = GeneExpressionModel(expression_type="OTHER")
62+
prediction: Annotated[
63+
Union[PredictionModel, None],
64+
Field(
65+
description="A subsection containing setting if a prediction job is to be run, this field can be None if not."
66+
),
67+
] = None
2968

3069
@model_validator(mode="after")
3170
def check(self):

autoxai4omics/utils/parser/data_model.py

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,50 @@
44
# https://opensource.org/licenses/MIT
55

66
from typing import Literal, Union
7-
from pydantic import BaseModel, FilePath, DirectoryPath, model_validator
7+
from pydantic import BaseModel, FilePath, DirectoryPath, model_validator, Field
8+
from typing_extensions import Annotated
89

910

1011
class DataModel(BaseModel):
11-
name: str
12-
file_path: FilePath
13-
metadata_file: Union[FilePath, None] = None
14-
file_path_holdout_data: Union[FilePath, None] = None
15-
metadata_file_holdout_data: Union[FilePath, None] = None
16-
save_path: DirectoryPath = "/experiments/"
17-
target: str
18-
data_type: Literal[
19-
"tabular", "gene_expression", "microbiome", "metabolomic", "other", "R2G"
12+
name: Annotated[str, Field(description="The name that is to be given to this job.")]
13+
file_path: Annotated[
14+
FilePath,
15+
Field(description="The path to the data that this job is to be run on."),
16+
]
17+
metadata_file: Annotated[
18+
Union[FilePath, None],
19+
Field(
20+
description="The metadata file that accompanies the data to be trained on."
21+
),
22+
] = None
23+
file_path_holdout_data: Annotated[
24+
Union[FilePath, None],
25+
Field(
26+
description="The path to the dataset that is to be used as a holdout set."
27+
),
28+
] = None
29+
metadata_file_holdout_data: Annotated[
30+
Union[FilePath, None],
31+
Field(
32+
description="The path to the metadata file that accompanies the holdout set"
33+
),
34+
] = None
35+
save_path: Annotated[
36+
DirectoryPath, Field(description="The path where the results shall be save.")
37+
] = "/experiments/"
38+
target: Annotated[
39+
str,
40+
Field(
41+
description="The name of the column in the dataset or in the metadata that is to be predicted."
42+
),
43+
]
44+
data_type: Annotated[
45+
Literal[
46+
"tabular", "gene_expression", "microbiome", "metabolomic", "other", "R2G"
47+
],
48+
Field(
49+
description='The type of the data that this job will be run on. Note - "R2G" means Ready to Go, meaning that no preprocessing is required and that the dataset is already split into train/test sets (denoted by a column called "set") and has labels present in a "label" column.'
50+
),
2051
]
2152

2253
@model_validator(mode="after")

autoxai4omics/utils/parser/featureSelection_model.py

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@
88
# https://opensource.org/licenses/MIT
99

1010
from typing import Union, Literal
11-
from pydantic import (
12-
BaseModel,
13-
PositiveInt,
14-
NonNegativeFloat,
15-
)
11+
from typing_extensions import Annotated
12+
from pydantic import BaseModel, PositiveInt, NonNegativeFloat, Field
1613

1714
from models.model_defs import MODELS
1815
from metrics.metric_defs import METRICS
@@ -26,12 +23,37 @@
2623

2724

2825
class AutoModel(BaseModel):
29-
min_features: PositiveInt = 10
30-
max_features: Union[PositiveInt, None] = None
31-
interval: PositiveInt = 1
32-
eval_model: Union[None, Literal[MODEL_NAMES_ALL]] = None
33-
eval_metric: Union[None, Literal[METRICS_NAMES_ALL]] = None
34-
low: bool = True
26+
min_features: Annotated[
27+
PositiveInt, Field(description="The minimium number of features to consider.")
28+
] = 10
29+
max_features: Annotated[
30+
Union[PositiveInt, None],
31+
Field(
32+
description="The maximum number of features to consider, if None will default to the number of columns in the given dataset."
33+
),
34+
] = None
35+
interval: Annotated[
36+
PositiveInt,
37+
Field(
38+
description="The size of the logarithmic increments to consider when searching for the best number of features."
39+
),
40+
] = 1
41+
eval_model: Annotated[
42+
Union[None, Literal[MODEL_NAMES_ALL]],
43+
Field(description="The estimator to use to evaluate the selected features."),
44+
] = None
45+
eval_metric: Annotated[
46+
Union[None, Literal[METRICS_NAMES_ALL]],
47+
Field(
48+
description="The metric to use to evaluate the model trained on the selected features."
49+
),
50+
] = None
51+
low: Annotated[
52+
bool,
53+
Field(
54+
description="A bool to indicate if the lower the eval_metric the better."
55+
),
56+
] = True
3557

3658
def validateWithProblemType(self, problemType):
3759
if problemType not in [CLASSIFICATION, REGRESSION]:
@@ -72,9 +94,20 @@ def validateWithProblemType(self, problemType):
7294

7395

7496
class MethodModel(BaseModel):
75-
name: Literal[FS_NAMES_MENTHODS] = "SelectKBest"
76-
metric: Union[None, Literal[FS_NAMES_KBMETRICS]] = None
77-
estimator: Union[None, Literal[MODEL_NAMES_ALL]] = None
97+
name: Annotated[
98+
Literal[FS_NAMES_MENTHODS],
99+
Field(description="The feature selection method to use"),
100+
] = "SelectKBest"
101+
metric: Annotated[
102+
Union[None, Literal[FS_NAMES_KBMETRICS]],
103+
Field(
104+
description="The metric to use during the feature selection, if required."
105+
),
106+
] = None
107+
estimator: Annotated[
108+
Union[None, Literal[MODEL_NAMES_ALL]],
109+
Field(description="the model to use during the feature selection if required."),
110+
] = None
78111

79112
def validateWithProblemType(self, problemType):
80113
if problemType not in [CLASSIFICATION, REGRESSION]:
@@ -110,10 +143,28 @@ def validateWithProblemType(self, problemType):
110143

111144

112145
class FeatureSelectionModel(BaseModel):
113-
k: Union[PositiveInt, Literal["auto"]] = "auto"
114-
var_threshold: NonNegativeFloat = 0
115-
auto: Union[None, AutoModel] = AutoModel()
116-
method: Union[None, MethodModel] = MethodModel()
146+
k: Annotated[
147+
Union[PositiveInt, Literal["auto"]],
148+
Field(
149+
description='The number of features to select, if "auto" is chosen it will find the best number of features to use'
150+
),
151+
] = "auto"
152+
var_threshold: Annotated[
153+
NonNegativeFloat,
154+
Field(description="The value to use for variance thresholding."),
155+
] = 0
156+
auto: Annotated[
157+
Union[None, AutoModel],
158+
Field(
159+
description="The setting for configuring the automated feature selection"
160+
),
161+
] = AutoModel()
162+
method: Annotated[
163+
Union[None, MethodModel],
164+
Field(
165+
description="The setting for the method to use for the feature selection."
166+
),
167+
] = MethodModel()
117168

118169
# TODO: do conditional validation
119170

autoxai4omics/utils/parser/ml_model.py

Lines changed: 91 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,32 +9,105 @@
99
from .featureSelection_model import FeatureSelectionModel
1010
from metrics.metric_defs import METRICS
1111
from models.model_defs import MODELS
12-
from pydantic import BaseModel, NonNegativeInt, confloat, model_validator
12+
from pydantic import BaseModel, NonNegativeInt, confloat, model_validator, Field
1313
from typing import Literal, Union, List
14+
from typing_extensions import Annotated
1415

1516
TestSize = confloat(strict=True, le=1, ge=0)
1617
METRICS_NAMES_ALL = tuple(set().union(*METRICS.values()))
1718
MODEL_NAMES_ALL = tuple(set().union(*MODELS.values()))
1819

1920

2021
class MlModel(BaseModel):
21-
seed_num: NonNegativeInt = 29292
22-
test_size: TestSize = 0.2 # type: ignore
23-
problem_type: Literal[CLASSIFICATION, REGRESSION]
24-
hyper_tuning: Literal["random", "grid"] = "random"
25-
hyper_budget: NonNegativeInt = 50
26-
stratify_by_groups: Literal["Y", "N"] = "N"
27-
standardize: bool = True
28-
groups: str = None # need to check
29-
balancing: Literal["OVER", "UNDER", "NONE"] = "NONE"
30-
fit_scorer: Union[None, Literal[METRICS_NAMES_ALL]] = None
31-
scorer_list: Union[None, List[Literal[METRICS_NAMES_ALL]]] = []
32-
model_list: List[Literal[MODEL_NAMES_ALL]]
33-
encoding: Literal["label", "onehot", None] = None
34-
autokeras_config: Union[AutoKerasModel, None] = AutoKerasModel()
35-
autolgbm_config: Union[AutoLgbmModel, None] = AutoLgbmModel()
36-
autoxgboost_config: Union[AutoXgboostModel, None] = AutoXgboostModel()
37-
feature_selection: Union[FeatureSelectionModel, None] = FeatureSelectionModel()
22+
seed_num: Annotated[
23+
NonNegativeInt,
24+
Field(
25+
description="The random set to set for this run, used for making the results reproducible."
26+
),
27+
] = 29292
28+
test_size: Annotated[TestSize, Field(description="The percentage of the data to use for testing")] = 0.2 # type: ignore
29+
problem_type: Annotated[
30+
Literal[CLASSIFICATION, REGRESSION],
31+
Field(description="The problem type that this job shall be attempting."),
32+
]
33+
# TODO: consider making hyper tuning a submodel
34+
# TODO: add None to hyper tunning method
35+
hyper_tuning: Annotated[
36+
Literal["random", "grid"],
37+
Field(description="The hyper_tunning method to use during the job."),
38+
] = "random"
39+
hyper_budget: Annotated[
40+
NonNegativeInt,
41+
Field(
42+
description='The budget to give for hyper tuning, only used if hyper_tuning is "random".'
43+
),
44+
] = 50
45+
# TODO: consider making a stratification /split submodel
46+
# TODO: change below to a boolean
47+
stratify_by_groups: Annotated[
48+
Literal["Y", "N"],
49+
Field(
50+
description="A field to indicate if the test/train dataset should be stratified by a group"
51+
),
52+
] = "N"
53+
groups: Annotated[
54+
str, Field(description="The name of the column to stratify the group by.")
55+
] = None # need to check
56+
# TODO consider making a sub-model for preprocessing
57+
standardize: Annotated[
58+
bool,
59+
Field(description="A bool to indicate if the data should be standardised."),
60+
] = True
61+
balancing: Annotated[
62+
Literal["OVER", "UNDER", "NONE"],
63+
Field(
64+
description="A field to indicate which balancing methodology to use, only relevant for classification problems."
65+
),
66+
] = "NONE"
67+
fit_scorer: Annotated[
68+
Union[None, Literal[METRICS_NAMES_ALL]],
69+
Field(description="Which metric the models should optimis during training."),
70+
] = None
71+
scorer_list: Annotated[
72+
Union[None, List[Literal[METRICS_NAMES_ALL]]],
73+
Field(description="Which metrics should be calculated for evaluation."),
74+
] = []
75+
# TODO: consider adding a None option which will default to all applicable models.
76+
model_list: Annotated[
77+
List[Literal[MODEL_NAMES_ALL]],
78+
Field(description="A list of models to be trained in the job."),
79+
]
80+
# TODO: check what the below actually drive
81+
encoding: Annotated[
82+
Literal["label", "onehot", None],
83+
Field(
84+
description="Which encoding method to use, only relevant in classification problems."
85+
),
86+
] = None
87+
autokeras_config: Annotated[
88+
Union[AutoKerasModel, None],
89+
Field(
90+
description="setting to be used for AutoKeras if it is chosen to be trained. Can be set to None if not selected."
91+
),
92+
] = AutoKerasModel()
93+
autolgbm_config: Annotated[
94+
Union[AutoLgbmModel, None],
95+
Field(
96+
description="settings to be used for AutoLgbm if chosen to be train. Can be set to None if not selected."
97+
),
98+
] = AutoLgbmModel()
99+
autoxgboost_config: Annotated[
100+
Union[AutoXgboostModel, None],
101+
Field(
102+
description="settings to be used for AutoXgboost if chosen to be train. Can be set to None if not selected."
103+
),
104+
] = AutoXgboostModel()
105+
feature_selection: Annotated[
106+
Union[FeatureSelectionModel, None],
107+
Field(
108+
description="Settings to be used for feature selection. If None no feature selection will be done."
109+
),
110+
] = FeatureSelectionModel()
38111

39112
@model_validator(mode="after")
40113
def check(self):

0 commit comments

Comments
 (0)