From 8b4b4c66ea11e94cedcd1708ba8eb1b83f123eb3 Mon Sep 17 00:00:00 2001 From: tintinrevient Date: Thu, 31 Jul 2025 15:39:24 +0200 Subject: [PATCH 1/2] Remove default for Typer.Option for pathlib Path object --- models/esm/src/pg2_model_esm/__main__.py | 8 ++++++-- models/pls/src/pg2_model_pls/__main__.py | 7 +++++-- models/pls/src/pg2_model_pls/utils.py | 21 +++++++++++---------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/models/esm/src/pg2_model_esm/__main__.py b/models/esm/src/pg2_model_esm/__main__.py index 82899a59..6526d612 100644 --- a/models/esm/src/pg2_model_esm/__main__.py +++ b/models/esm/src/pg2_model_esm/__main__.py @@ -34,20 +34,23 @@ def train( dataset_file: Annotated[ Path, typer.Option( - default=SageMakerTrainingJobPath.TRAINING_JOB_PATH, help="Path to the dataset file", ), ], model_toml_file: Annotated[ Path, typer.Option( - default=SageMakerTrainingJobPath.MANIFEST_PATH, help="Path to the model TOML file", ), ], ): + # Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/#path-validations + # Cannot use default in Typer.Option for pathlib's Path object, + # Otherwise the error: 'AttributeError: 'PosixPath' object has no attribute 'isidentifier' + console.print(f"Loading {dataset_file} and {model_toml_file}...") + dataset_file = dataset_file or SageMakerTrainingJobPath.TRAINING_JOB_PATH dataset = Dataset.from_path(dataset_file) assays = dataset.assays.meta.assays @@ -60,6 +63,7 @@ def train( console.print(f"Loaded {len(df)} records.") + model_toml_file = model_toml_file or SageMakerTrainingJobPath.MANIFEST_PATH manifest = Manifest.from_path(model_toml_file) model, alphabet = pretrained.load_model_and_alphabet( diff --git a/models/pls/src/pg2_model_pls/__main__.py b/models/pls/src/pg2_model_pls/__main__.py index 0dc2e8a6..261322b8 100644 --- a/models/pls/src/pg2_model_pls/__main__.py +++ b/models/pls/src/pg2_model_pls/__main__.py @@ -32,22 +32,25 @@ def train( dataset_file: Annotated[ Path, typer.Option( - default=SageMakerTrainingJobPath.TRAINING_JOB_PATH, help="Path to the dataset file", ), ], model_toml_file: Annotated[ Path, typer.Option( - default=SageMakerTrainingJobPath.MANIFEST_PATH, help="Path to the model TOML file", ), ], ): + # Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/#path-validations + # Cannot use default in Typer.Option for pathlib's Path object, + # Otherwise the error: 'AttributeError: 'PosixPath' object has no attribute 'isidentifier' console.print(f"Loading {dataset_file} and {model_toml_file}...") + dataset_file = dataset_file or SageMakerTrainingJobPath.TRAINING_JOB_PATH dataset = Dataset.from_path(dataset_file) + model_toml_file = model_toml_file or SageMakerTrainingJobPath.MANIFEST_PATH manifest = Manifest.from_path(model_toml_file) train_X, train_Y = load_x_and_y( diff --git a/models/pls/src/pg2_model_pls/utils.py b/models/pls/src/pg2_model_pls/utils.py index 19c56ea1..23f4d6a8 100644 --- a/models/pls/src/pg2_model_pls/utils.py +++ b/models/pls/src/pg2_model_pls/utils.py @@ -1,6 +1,7 @@ import numpy as np from typing import Any import pickle +from pathlib import Path from sklearn.cross_decomposition import PLSRegression from pg2_dataset.dataset import Dataset from pg2_dataset.backends.assays import SPLIT_STRATEGY_MAPPING @@ -125,8 +126,8 @@ def encode(spit_X: list[Any], hyper_params: dict[str, Any]) -> np.ndarray: def train_model( train_X: list[list[Any]], train_Y: list[Any], - model_toml_file: str, - model_path: str, + model_toml_file: Path, + model_path: Path, ) -> None: """Train a PLS regression model on encoded protein sequences and save it to disk. @@ -139,9 +140,9 @@ def train_model( Each inner list represents a single sequence. train_Y (list[Any]): Training target values corresponding to the sequences in train_X. - model_toml_file (str): Path to the TOML configuration file containing model + model_toml_file (Path): Path to the TOML configuration file containing model hyperparameters, including encoding parameters and n_components for PLS. - model_path (str): File path where the trained model will be saved as a + model_path (Path): File path where the trained model will be saved as a pickled object. Returns: @@ -167,7 +168,7 @@ def train_model( model = PLSRegression(manifest.hyper_params["n_components"]) model.fit(encodings, train_Y) - with open(model_path, "wb") as file: + with model_path.open(mode="wb") as file: pickle.dump(model, file) logger.info(f"Saved the model to {model_path}") @@ -175,8 +176,8 @@ def train_model( def predict_model( test_X: list[list[Any]], - model_toml_file: str, - model_path: str, + model_toml_file: Path, + model_path: Path, ) -> list[Any]: """Load a trained model and generate predictions on test sequences. @@ -187,9 +188,9 @@ def predict_model( Args: test_X (list[list[Any]]): Test feature data containing protein sequences. Each inner list represents a single sequence to predict on. - model_toml_file (str): Path to the TOML configuration file containing model + model_toml_file (Path): Path to the TOML configuration file containing model hyperparameters used for consistent encoding of test sequences. - model_path (str): File path to the saved pickled model to load for prediction. + model_path (Path): File path to the saved pickled model to load for prediction. Returns: list[Any]: List of predictions corresponding to each sequence in test_X. @@ -211,7 +212,7 @@ def predict_model( """ logger.info(f"Testing the model with {len(test_X)} records.") - with open(model_path, "rb") as file: + with model_path.open(mode="rb") as file: model = pickle.load(file) manifest = Manifest.from_path(model_toml_file) From 9ce9f2dd563fae532aa8d8c0e689d8b9e838a9c2 Mon Sep 17 00:00:00 2001 From: tintinrevient Date: Thu, 31 Jul 2025 16:01:48 +0200 Subject: [PATCH 2/2] Add default value behind Annotated --- models/esm/src/pg2_model_esm/__main__.py | 10 ++-------- models/pls/src/pg2_model_pls/__main__.py | 9 ++------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/models/esm/src/pg2_model_esm/__main__.py b/models/esm/src/pg2_model_esm/__main__.py index 6526d612..487b4cc8 100644 --- a/models/esm/src/pg2_model_esm/__main__.py +++ b/models/esm/src/pg2_model_esm/__main__.py @@ -36,21 +36,16 @@ def train( typer.Option( help="Path to the dataset file", ), - ], + ] = SageMakerTrainingJobPath.TRAINING_JOB_PATH, model_toml_file: Annotated[ Path, typer.Option( help="Path to the model TOML file", ), - ], + ] = SageMakerTrainingJobPath.MANIFEST_PATH, ): - # Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/#path-validations - # Cannot use default in Typer.Option for pathlib's Path object, - # Otherwise the error: 'AttributeError: 'PosixPath' object has no attribute 'isidentifier' - console.print(f"Loading {dataset_file} and {model_toml_file}...") - dataset_file = dataset_file or SageMakerTrainingJobPath.TRAINING_JOB_PATH dataset = Dataset.from_path(dataset_file) assays = dataset.assays.meta.assays @@ -63,7 +58,6 @@ def train( console.print(f"Loaded {len(df)} records.") - model_toml_file = model_toml_file or SageMakerTrainingJobPath.MANIFEST_PATH manifest = Manifest.from_path(model_toml_file) model, alphabet = pretrained.load_model_and_alphabet( diff --git a/models/pls/src/pg2_model_pls/__main__.py b/models/pls/src/pg2_model_pls/__main__.py index 261322b8..e3679544 100644 --- a/models/pls/src/pg2_model_pls/__main__.py +++ b/models/pls/src/pg2_model_pls/__main__.py @@ -34,23 +34,18 @@ def train( typer.Option( help="Path to the dataset file", ), - ], + ] = SageMakerTrainingJobPath.TRAINING_JOB_PATH, model_toml_file: Annotated[ Path, typer.Option( help="Path to the model TOML file", ), - ], + ] = SageMakerTrainingJobPath.MANIFEST_PATH, ): - # Reference: https://typer.tiangolo.com/tutorial/parameter-types/path/#path-validations - # Cannot use default in Typer.Option for pathlib's Path object, - # Otherwise the error: 'AttributeError: 'PosixPath' object has no attribute 'isidentifier' console.print(f"Loading {dataset_file} and {model_toml_file}...") - dataset_file = dataset_file or SageMakerTrainingJobPath.TRAINING_JOB_PATH dataset = Dataset.from_path(dataset_file) - model_toml_file = model_toml_file or SageMakerTrainingJobPath.MANIFEST_PATH manifest = Manifest.from_path(model_toml_file) train_X, train_Y = load_x_and_y(