ProteinGym · tintinrevient · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025
diff --git a/models/esm/src/pg2_model_esm/__main__.py b/models/esm/src/pg2_model_esm/__main__.py
@@ -1,11 +1,15 @@
 import torch
 import typer
+from pathlib import Path
+from typing import Tuple
 from rich.console import Console
 from pg2_dataset.dataset import Manifest
 from tqdm import tqdm
 from esm import pretrained
 from pg2_model_esm.utils import compute_pppl, label_row
 from pg2_model_esm.manifest import Manifest as ModelManifest
+import toml
+import json
 
 
 app = typer.Typer(
@@ -16,13 +20,60 @@
 err_console = Console(stderr=True)
 console = Console()
 
+prefix = Path("/opt/ml")  # root of the SageMaker container paths built below
+training_data_path = prefix / "input" / "data" / "training"  # dir the TOML names are joined to
+params_path = prefix / "input" / "config" / "hyperparameters.json"  # JSON naming both TOML files
+model_path = Path("/model.pkl")  # NOTE(review): not referenced in the visible hunks - confirm use
+
+
+def _configure_container_paths(
+    dataset_toml_file: str, model_toml_file: str
+) -> Tuple[str, str, str]:
+    """Return (output dir, dataset TOML, model TOML) for a CLI or SageMaker run.
+
+    If either TOML path is given, both are returned unchanged with "/output".
+    Otherwise both names are read from hyperparameters.json, and the dataset
+    TOML's assays_meta.file_path is rewritten to sit under training_data_path.
+    """
+    if dataset_toml_file or model_toml_file:
+        return "/output", str(dataset_toml_file), str(model_toml_file)
+
+    typer.echo(
+        "Configuring the paths to where SageMaker mounts interesting things in the container."
+    )
+    with open(params_path, "r") as f:
+        training_params = json.load(f)
+    # Index, not .get(): a missing key then fails as KeyError, not Path / None.
+    dataset_toml_path = training_data_path / training_params["dataset_toml_file"]
+    model_toml_path = training_data_path / training_params["model_toml_file"]
+    with open(dataset_toml_path, "r") as f:
+        data = toml.load(f)
+
+    # Re-root the assay file under the mounted data dir, but only once, so a
+    # rerun against the already-rewritten TOML does not stack the prefix.
+    assays_path = str(data["assays_meta"]["file_path"])
+    if training_data_path not in Path(assays_path).parents:
+        rooted = training_data_path / assays_path.lstrip("/")
+        data["assays_meta"]["file_path"] = str(rooted)
+        with open(dataset_toml_path, "w") as f:
+            toml.dump(data, f)
+
+    return str(prefix / "model"), str(dataset_toml_path), str(model_toml_path)
+
 
 @app.command()
-def predict(
-    dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
-    model_toml_file: str = typer.Option(help="Path to the model TOML file"),
-    nogpu: bool = typer.Option(False, help="GPUs available"),
+def train(
+    dataset_toml_file: str = typer.Option(
+        default="", help="Path to the dataset TOML file"
+    ),
+    model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
+    nogpu: bool = typer.Option(default=False, help="GPUs available or not"),
 ):
+    output_path, dataset_toml_file, model_toml_file = _configure_container_paths(
+        dataset_toml_file=dataset_toml_file,
+        model_toml_file=model_toml_file,
+    )
+
     console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")
 
     manifest = Manifest.from_path(dataset_toml_file)
@@ -126,9 +177,11 @@ def predict(
             err_console.print(f"Error: Invalid scoring strategy: {scoring_strategy}")
 
     df.rename(columns={targets[0]: "test"}, inplace=True)
-    df.to_csv(f"/output/{dataset_name}_{model_name}.csv", index=False)
+    df.to_csv(f"/{output_path}/{dataset_name}_{model_name}.csv", index=False)
 
-    console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
+    console.print(
+        f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
+    )
     console.print("Done.")
 
 

diff --git a/models/pls/src/pg2_model_pls/__main__.py b/models/pls/src/pg2_model_pls/__main__.py
@@ -1,10 +1,13 @@
 import polars as pl
+from pathlib import Path
 from rich.console import Console
 from pg2_dataset.dataset import Manifest
 from pg2_dataset.splits.abstract_split_strategy import TrainTestValid
 from pg2_model_pls.manifest import Manifest as ModelManifest
 from pg2_model_pls.utils import load_x_and_y, train_model, predict_model
-
+from typing import Tuple
+import json
+import toml
 import typer
 
 app = typer.Typer(
@@ -14,17 +17,63 @@
 
 console = Console()
 
+prefix = Path("/opt/ml")  # root of the SageMaker container paths built below
+training_data_path = prefix / "input" / "data" / "training"  # dir the TOML names are joined to
+params_path = prefix / "input" / "config" / "hyperparameters.json"  # JSON naming both TOML files
+model_path = Path("/model.pkl")  # NOTE(review): was a str local in the command; confirm Path is OK
+
+
+def _configure_container_paths(
+    dataset_toml_file: str, model_toml_file: str
+) -> Tuple[str, str, str]:
+    """Return (output dir, dataset TOML, model TOML) for a CLI or SageMaker run.
+
+    If either TOML path is given, both are returned unchanged with "/output".
+    Otherwise both names are read from hyperparameters.json, and the dataset
+    TOML's assays_meta.file_path is rewritten to sit under training_data_path.
+    """
+    if dataset_toml_file or model_toml_file:
+        return "/output", str(dataset_toml_file), str(model_toml_file)
+
+    typer.echo(
+        "Configuring the paths to where SageMaker mounts interesting things in the container."
+    )
+    with open(params_path, "r") as f:
+        training_params = json.load(f)
+    # Index, not .get(): a missing key then fails as KeyError, not Path / None.
+    dataset_toml_path = training_data_path / training_params["dataset_toml_file"]
+    model_toml_path = training_data_path / training_params["model_toml_file"]
+    with open(dataset_toml_path, "r") as f:
+        data = toml.load(f)
+
+    # Re-root the assay file under the mounted data dir, but only once, so a
+    # rerun against the already-rewritten TOML does not stack the prefix.
+    assays_path = str(data["assays_meta"]["file_path"])
+    if training_data_path not in Path(assays_path).parents:
+        rooted = training_data_path / assays_path.lstrip("/")
+        data["assays_meta"]["file_path"] = str(rooted)
+        with open(dataset_toml_path, "w") as f:
+            toml.dump(data, f)
+
+    return str(prefix / "model"), str(dataset_toml_path), str(model_toml_path)
+
 
 @app.command()
-def predict(
-    dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
-    model_toml_file: str = typer.Option(help="Path to the model TOML file"),
+def train(
+    dataset_toml_file: str = typer.Option(
+        default="", help="Path to the dataset TOML file"
+    ),
+    model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
 ):
+    output_path, dataset_toml_file, model_toml_file = _configure_container_paths(
+        dataset_toml_file=dataset_toml_file,
+        model_toml_file=model_toml_file,
+    )
+
     console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")
 
     dataset_name = Manifest.from_path(dataset_toml_file).name
 
-    model_path = "/model.pkl"
     model_name = ModelManifest.from_path(model_toml_file).name
 
     train_X, train_Y = load_x_and_y(
@@ -70,8 +119,10 @@ def predict(
         }
     )
 
-    df.write_csv(f"/output/{dataset_name}_{model_name}.csv")
-    console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
+    df.write_csv(f"/{output_path}/{dataset_name}_{model_name}.csv")
+    console.print(
+        f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
+    )
 
     console.print("Done.")