22 changes: 19 additions & 3 deletions README.md
@@ -15,7 +15,7 @@ There are two games to benchmark: supervised and zero-shot. Each game has its se
- Supervised game is defined in this [dvc.yaml](supervised/dvc.yaml)
- Zero-shot game is defined in this [dvc.yaml](zero_shot/dvc.yaml)

The models and datasets are defined in `vars` at the top, and DVC translates `vars` into a matrix, which is namely a loop defined as the below pseudo-code:
The models and datasets are defined in `vars` at the top, and DVC translates `vars` into a matrix, which is namely a loop defined as the following pseudo-code:

```python
for dataset in datasets:
    ...
```
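As a concrete illustration of that expansion (the dataset and model names below are hypothetical stand-ins, not this repo's actual `vars`):

```python
# DVC's matrix expands the `vars` lists into one stage per (dataset, model)
# combination; the names here are illustrative only.
datasets = ["neime", "ranganathan"]  # hypothetical entries
models = ["esm", "pls"]              # hypothetical entries

stages = [f"benchmark@{dataset}-{model}" for dataset in datasets for model in models]
print(stages)
```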

### Supervised

You can benchmark for a group of supervised models:
You can benchmark a group of supervised models:
```shell
cd supervised && dvc repro
```

### Zero-shot

You can benchmark for a group of zero-shot models:
You can benchmark a group of zero-shot models:
```shell
cd zero_shot && dvc repro
```

## AWS

There are two environments in which to run the benchmark: the local environment and the AWS environment.

The AWS environment differs in these ways:
* You need to upload the data and model TOML files and the actual data to S3.
* You need to build and push your Docker image to ECR.
* You need to use a SageMaker training job to either train or score a model.

> [!IMPORTANT]
> In order to use the AWS environment, you need to set up your AWS profile with the following steps:
> 1. Execute `aws configure sso`.
> 2. Fill in the required fields; in particular, set "Default client Region" to "us-east-1".
> 3. You can find your account ID and profile by executing `cat ~/.aws/config`.
> 4. Finally, you can run `dvc repro` in each game with the environment variables set: `AWS_ACCOUNT_ID=xxx AWS_PROFILE=yyy dvc repro`
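The three AWS prerequisites above can be sketched as shell commands. This is a sketch only: the bucket name, image name, and account ID are placeholders, not values from this repo.

```shell
# Sketch only: resource names below are placeholders.
AWS_ACCOUNT_ID=123456789012
AWS_REGION=us-east-1

# 1. Upload the manifest TOML files and the actual data to S3.
aws s3 cp datasets/neime/dataset.zip s3://my-benchmark-bucket/datasets/neime/dataset.zip
aws s3 cp models/esm/manifest.toml s3://my-benchmark-bucket/models/esm/manifest.toml

# 2. Build and push the Docker image to ECR.
aws ecr get-login-password --region "$AWS_REGION" \
  | docker login --username AWS --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com"
docker build -t pg2-model-esm models/esm
docker tag pg2-model-esm "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/pg2-model-esm:latest"
docker push "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com/pg2-model-esm:latest"

# 3. A SageMaker training job (launched via dvc repro) then trains or scores the model.
```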

## Generate dummy data

You can generate dummy data by the following command:
496 changes: 0 additions & 496 deletions datasets/dummy/charge_ladder.csv

This file was deleted.

11 changes: 0 additions & 11 deletions datasets/dummy/charge_ladder.toml

This file was deleted.

Binary file added datasets/dummy/dataset.zip
923 changes: 0 additions & 923 deletions datasets/neime/A0A1I9GEU1_NEIME_Kennouche_2019.csv

This file was deleted.

Binary file added datasets/neime/dataset.zip
9 changes: 0 additions & 9 deletions datasets/neime/neime.toml

This file was deleted.

4,997 changes: 0 additions & 4,997 deletions datasets/ranganathan/BLAT_ECOLX_Ranganathan2015.csv

This file was deleted.

Binary file added datasets/ranganathan/dataset.zip
10 changes: 0 additions & 10 deletions datasets/ranganathan/ranganathan.toml

This file was deleted.

6 changes: 3 additions & 3 deletions models/esm/esm.toml → models/esm/manifest.toml
@@ -1,6 +1,6 @@
[hyper_params]
name = "esm"
offset_idx = 24
location = "esm2_t30_150M_UR50D"
scoring_strategy = "wt-marginals"

[hyper_params]
offset_idx = 24
nogpu = false
57 changes: 34 additions & 23 deletions models/esm/src/pg2_model_esm/__main__.py
@@ -1,11 +1,12 @@
import torch
import typer
from pathlib import Path
from rich.console import Console
from pg2_dataset.dataset import Manifest
from pg2_dataset.dataset import Dataset
from tqdm import tqdm
from esm import pretrained
from pg2_model_esm.utils import compute_pppl, label_row
from pg2_model_esm.manifest import Manifest as ModelManifest
from pg2_model_esm.manifest import Manifest


app = typer.Typer(
err_console = Console(stderr=True)
console = Console()

prefix = Path("/opt/ml")
training_data_path = prefix / "input" / "data" / "training" / "dataset.zip"
manifest_path = prefix / "input" / "data" / "manifest" / "manifest.toml"
params_path = prefix / "input" / "config" / "hyperparameters.json"
output_path = prefix / "model"

model_path = Path("/model.pkl")


@app.command()
def train(
dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
model_toml_file: str = typer.Option(help="Path to the model TOML file"),
nogpu: bool = typer.Option(False, help="GPUs available"),
dataset_zip_file: str = typer.Option(
default="", help="Path to the dataset ZIP file"
> **JCZuurmond (Contributor, Jul 29, 2025):** This option is required, right? Also, could you update the syntax to the annotated version where the option is on the left side of the equals? And update both types to `Path`?
> Suggested change: `default="", help="Path to the dataset ZIP file"` → `help="Path to the dataset ZIP file"`
> **tintinrevient (Author):** The option is not required, because in AWS there are no paths passed from user input to use a local file path.
),
model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
):
console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")
console.print(f"Loading {dataset_zip_file} and {model_toml_file}...")

manifest = Manifest.from_path(dataset_toml_file)
dataset_name = manifest.name
dataset = manifest.ingest()
dataset_zip_file = dataset_zip_file or training_data_path
> **JCZuurmond (Contributor):** Why introduce this `or`?
> **tintinrevient (Author):** Because in the AWS environment there is no `dataset_file` passed by the user; the SageMaker training job automatically mounts the S3 path at a fixed location inside the container.

dataset = Dataset.from_path(dataset_zip_file)
dataset_name = dataset.name

model_toml_file = model_toml_file or manifest_path
> **JCZuurmond (Contributor):** Similar question about the `or` statement.

hyper_params = Manifest.from_path(model_toml_file).hyper_params
model_name = hyper_params["name"]

assays = dataset.assays.meta.assays
targets = list(dataset.assays.meta.assays.keys())

console.print(f"Loaded {len(df)} records.")

model_manifest = ModelManifest.from_path(model_toml_file)

model_name = model_manifest.name
location = model_manifest.location
scoring_strategy = model_manifest.scoring_strategy
hyper_params = model_manifest.hyper_params

model, alphabet = pretrained.load_model_and_alphabet(location)
model, alphabet = pretrained.load_model_and_alphabet(hyper_params["location"])
model.eval()

console.print(
f"Loaded the model from {location} with scoring strategy {scoring_strategy}."
f"Loaded the model from {hyper_params['location']} with scoring strategy {hyper_params['scoring_strategy']}."
)

if torch.cuda.is_available() and not nogpu:
if torch.cuda.is_available() and not hyper_params["nogpu"]:
model = model.cuda()
print("Transferred model to GPU")


batch_labels, batch_strs, batch_tokens = batch_converter(data)

match scoring_strategy:
match hyper_params["scoring_strategy"]:
case "wt-marginals":
with torch.no_grad():
token_probs = torch.log_softmax(model(batch_tokens)["logits"], dim=-1)
)

case _:
err_console.print(f"Error: Invalid scoring strategy: {scoring_strategy}")
err_console.print(
f"Error: Invalid scoring strategy: {hyper_params['scoring_strategy']}"
)

df.rename(columns={targets[0]: "test"}, inplace=True)
df.to_csv(f"/output/{dataset_name}_{model_name}.csv", index=False)
df.to_csv(f"{output_path}/{dataset_name}_{model_name}.csv", index=False)
> **JCZuurmond (Contributor):** Move path to variable.

console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
console.print(
f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
)
console.print("Done.")


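The core pattern this diff introduces is a CLI option that falls back to SageMaker's fixed mount points when left empty. A minimal sketch follows: the `/opt/ml` paths are copied from the diff, while the `resolve` helper is a hypothetical name introduced here for illustration.

```python
from pathlib import Path

# SageMaker mounts input channels and the model output directory under /opt/ml,
# so inside the container the code works without any user-supplied paths.
PREFIX = Path("/opt/ml")
TRAINING_DATA_PATH = PREFIX / "input" / "data" / "training" / "dataset.zip"
MANIFEST_PATH = PREFIX / "input" / "data" / "manifest" / "manifest.toml"
OUTPUT_PATH = PREFIX / "model"

def resolve(cli_value: str, sagemaker_default: Path) -> Path:
    """Prefer an explicit CLI path (local runs); else fall back to the mounted default (AWS runs)."""
    return Path(cli_value) if cli_value else sagemaker_default

print(resolve("", TRAINING_DATA_PATH))                            # AWS run
print(resolve("datasets/neime/dataset.zip", TRAINING_DATA_PATH))  # local run
```

This mirrors the `dataset_zip_file = dataset_zip_file or training_data_path` lines in the diff, just factored into one helper.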
8 changes: 3 additions & 5 deletions models/esm/src/pg2_model_esm/manifest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, ConfigDict
from pathlib import Path
from typing import Self, Any
import toml


class Manifest(BaseModel):
name: str = ""
hyper_params: dict[str, Any] = Field(default_factory=dict)
model_config = ConfigDict(extra="allow")

location: str = ""
scoring_strategy: str = ""
hyper_params: dict[str, Any] = Field(default_factory=dict)

@classmethod
def from_path(cls, toml_file: Path) -> Self:
3 changes: 1 addition & 2 deletions models/pls/pls.toml → models/pls/manifest.toml
@@ -1,6 +1,5 @@
name = "pls"

[hyper_params]
name = "pls"
n_components = 2
aa_alphabet = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
aa_alphabet_length = 20
45 changes: 30 additions & 15 deletions models/pls/src/pg2_model_pls/__main__.py
@@ -1,10 +1,10 @@
import polars as pl
> **JCZuurmond (Contributor):** Similar comments as for the other script.

from pathlib import Path
from rich.console import Console
from pg2_dataset.dataset import Manifest
from pg2_dataset.dataset import Dataset
from pg2_dataset.splits.abstract_split_strategy import TrainTestValid
from pg2_model_pls.manifest import Manifest as ModelManifest
from pg2_model_pls.manifest import Manifest
from pg2_model_pls.utils import load_x_and_y, train_model, predict_model

import typer

app = typer.Typer(

console = Console()

prefix = Path("/opt/ml")
training_data_path = prefix / "input" / "data" / "training" / "dataset.zip"
manifest_path = prefix / "input" / "data" / "manifest" / "manifest.toml"
params_path = prefix / "input" / "config" / "hyperparameters.json"
output_path = prefix / "model"

model_path = Path("/model.pkl")


@app.command()
def train(
dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
model_toml_file: str = typer.Option(help="Path to the model TOML file"),
dataset_zip_file: str = typer.Option(
default="", help="Path to the dataset ZIP file"
),
model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
):
console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")
console.print(f"Loading {dataset_zip_file} and {model_toml_file}...")

dataset_name = Manifest.from_path(dataset_toml_file).name
dataset_zip_file = dataset_zip_file or training_data_path
dataset = Dataset.from_path(dataset_zip_file)
dataset_name = dataset.name

model_path = "/model.pkl"
model_name = ModelManifest.from_path(model_toml_file).name
model_toml_file = model_toml_file or manifest_path
hyper_params = Manifest.from_path(model_toml_file).hyper_params
model_name = hyper_params["name"]

train_X, train_Y = load_x_and_y(
dataset_toml_file=dataset_toml_file,
dataset=dataset,
split=TrainTestValid.train,
)

train_model(
train_X=train_X,
train_Y=train_Y,
model_toml_file=model_toml_file,
model_path=model_path,
hyper_params=hyper_params,
)

console.print("Finished the training...")

valid_X, valid_Y = load_x_and_y(
dataset_toml_file=dataset_toml_file,
dataset=dataset,
split=TrainTestValid.valid,
)


pred_y = predict_model(
test_X=valid_X,
model_toml_file=model_toml_file,
model_path=model_path,
hyper_params=hyper_params,
)

console.print("Finished the scoring...")
}
)

df.write_csv(f"/output/{dataset_name}_{model_name}.csv")
console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
df.write_csv(f"{output_path}/{dataset_name}_{model_name}.csv")
console.print(
f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
)

console.print("Done.")

5 changes: 3 additions & 2 deletions models/pls/src/pg2_model_pls/manifest.py
@@ -1,11 +1,12 @@
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, ConfigDict
from pathlib import Path
from typing import Self, Any
import toml


class Manifest(BaseModel):
> **JCZuurmond (Contributor):** The manifest should probably go into the pg2-benchmark package.
> **tintinrevient (Author):** I've put them into pg2-benchmark! Good point: in the future we will extend it with model cards, so it is sensible to keep it there.

name: str = ""
model_config = ConfigDict(extra="allow")

hyper_params: dict[str, Any] = Field(default_factory=dict)

@classmethod