-
Notifications
You must be signed in to change notification settings - Fork 0
Add AWS DVC file #74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add AWS DVC file #74
Changes from all commits
42cc7b3
357c0b3
23c57fb
62210eb
45f854a
ce5c5d1
aecf723
6a39599
f64c2c6
ffac1d0
0fc7c2c
f01ad68
d8837e0
f992326
464c6d5
03d9c43
02d629f
a203f93
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| [hyper_params] | ||
| name = "esm" | ||
| offset_idx = 24 | ||
| location = "esm2_t30_150M_UR50D" | ||
| scoring_strategy = "wt-marginals" | ||
|
|
||
| [hyper_params] | ||
| offset_idx = 24 | ||
| nogpu = false | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,11 +1,12 @@ | ||||||
| import torch | ||||||
| import typer | ||||||
| from pathlib import Path | ||||||
| from rich.console import Console | ||||||
| from pg2_dataset.dataset import Manifest | ||||||
| from pg2_dataset.dataset import Dataset | ||||||
| from tqdm import tqdm | ||||||
| from esm import pretrained | ||||||
| from pg2_model_esm.utils import compute_pppl, label_row | ||||||
| from pg2_model_esm.manifest import Manifest as ModelManifest | ||||||
| from pg2_model_esm.manifest import Manifest | ||||||
|
|
||||||
|
|
||||||
| app = typer.Typer( | ||||||
|
|
@@ -16,18 +17,31 @@ | |||||
| err_console = Console(stderr=True) | ||||||
| console = Console() | ||||||
|
|
||||||
| prefix = Path("/opt/ml") | ||||||
| training_data_path = prefix / "input" / "data" / "training" / "dataset.zip" | ||||||
| manifest_path = prefix / "input" / "data" / "manifest" / "manifest.toml" | ||||||
| params_path = prefix / "input" / "config" / "hyperparameters.json" | ||||||
| output_path = prefix / "model" | ||||||
|
|
||||||
| model_path = Path("/model.pkl") | ||||||
|
|
||||||
|
|
||||||
| @app.command() | ||||||
| def train( | ||||||
| dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"), | ||||||
| model_toml_file: str = typer.Option(help="Path to the model TOML file"), | ||||||
| nogpu: bool = typer.Option(False, help="GPUs available"), | ||||||
| dataset_zip_file: str = typer.Option( | ||||||
|
tintinrevient marked this conversation as resolved.
|
||||||
| default="", help="Path to the dataset ZIP file" | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This option is required, right? Also, could you update the syntax to the annotated version, where the option is on the left side of the equals sign? And update both types to
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The option is not required, because in the AWS environment no paths are passed from user input, so there is no local file path to use. |
||||||
| ), | ||||||
| model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"), | ||||||
| ): | ||||||
| console.print(f"Loading {dataset_toml_file} and {model_toml_file}...") | ||||||
| console.print(f"Loading {dataset_zip_file} and {model_toml_file}...") | ||||||
|
|
||||||
| manifest = Manifest.from_path(dataset_toml_file) | ||||||
| dataset_name = manifest.name | ||||||
| dataset = manifest.ingest() | ||||||
| dataset_zip_file = dataset_zip_file or training_data_path | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why introduce this
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because in the AWS environment, there is no |
||||||
| dataset = Dataset.from_path(dataset_zip_file) | ||||||
| dataset_name = dataset.name | ||||||
|
tintinrevient marked this conversation as resolved.
|
||||||
|
|
||||||
| model_toml_file = model_toml_file or manifest_path | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar question about the |
||||||
| hyper_params = Manifest.from_path(model_toml_file).hyper_params | ||||||
|
tintinrevient marked this conversation as resolved.
|
||||||
| model_name = hyper_params["name"] | ||||||
|
tintinrevient marked this conversation as resolved.
|
||||||
|
|
||||||
| assays = dataset.assays.meta.assays | ||||||
| targets = list(dataset.assays.meta.assays.keys()) | ||||||
|
|
@@ -39,21 +53,14 @@ def train( | |||||
|
|
||||||
| console.print(f"Loaded {len(df)} records.") | ||||||
|
|
||||||
| model_manifest = ModelManifest.from_path(model_toml_file) | ||||||
|
|
||||||
| model_name = model_manifest.name | ||||||
| location = model_manifest.location | ||||||
| scoring_strategy = model_manifest.scoring_strategy | ||||||
| hyper_params = model_manifest.hyper_params | ||||||
|
|
||||||
| model, alphabet = pretrained.load_model_and_alphabet(location) | ||||||
| model, alphabet = pretrained.load_model_and_alphabet(hyper_params["location"]) | ||||||
| model.eval() | ||||||
|
|
||||||
| console.print( | ||||||
| f"Loaded the model from {location} with scoring strategy {scoring_strategy}." | ||||||
| f"Loaded the model from {hyper_params['location']} with scoring strategy {hyper_params['scoring_strategy']}." | ||||||
| ) | ||||||
|
|
||||||
| if torch.cuda.is_available() and not nogpu: | ||||||
| if torch.cuda.is_available() and not hyper_params["nogpu"]: | ||||||
| model = model.cuda() | ||||||
| print("Transferred model to GPU") | ||||||
|
|
||||||
|
|
@@ -65,7 +72,7 @@ def train( | |||||
|
|
||||||
| batch_labels, batch_strs, batch_tokens = batch_converter(data) | ||||||
|
|
||||||
| match scoring_strategy: | ||||||
| match hyper_params["scoring_strategy"]: | ||||||
| case "wt-marginals": | ||||||
| with torch.no_grad(): | ||||||
| token_probs = torch.log_softmax(model(batch_tokens)["logits"], dim=-1) | ||||||
|
|
@@ -123,12 +130,16 @@ def train( | |||||
| ) | ||||||
|
|
||||||
| case _: | ||||||
| err_console.print(f"Error: Invalid scoring strategy: {scoring_strategy}") | ||||||
| err_console.print( | ||||||
| f"Error: Invalid scoring strategy: {hyper_params['scoring_strategy']}" | ||||||
| ) | ||||||
|
|
||||||
| df.rename(columns={targets[0]: "test"}, inplace=True) | ||||||
| df.to_csv(f"/output/{dataset_name}_{model_name}.csv", index=False) | ||||||
| df.to_csv(f"{output_path}/{dataset_name}_{model_name}.csv", index=False) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move path to variable |
||||||
|
|
||||||
| console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv") | ||||||
| console.print( | ||||||
| f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv" | ||||||
| ) | ||||||
| console.print("Done.") | ||||||
|
|
||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,5 @@ | ||
| name = "pls" | ||
|
|
||
| [hyper_params] | ||
| name = "pls" | ||
| n_components = 2 | ||
| aa_alphabet = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"] | ||
| aa_alphabet_length = 20 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,10 @@ | ||
| import polars as pl | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar comments as for the other script |
||
| from pathlib import Path | ||
| from rich.console import Console | ||
| from pg2_dataset.dataset import Manifest | ||
| from pg2_dataset.dataset import Dataset | ||
| from pg2_dataset.splits.abstract_split_strategy import TrainTestValid | ||
| from pg2_model_pls.manifest import Manifest as ModelManifest | ||
| from pg2_model_pls.manifest import Manifest | ||
| from pg2_model_pls.utils import load_x_and_y, train_model, predict_model | ||
|
|
||
| import typer | ||
|
|
||
| app = typer.Typer( | ||
|
|
@@ -14,21 +14,34 @@ | |
|
|
||
| console = Console() | ||
|
|
||
| prefix = Path("/opt/ml") | ||
| training_data_path = prefix / "input" / "data" / "training" / "dataset.zip" | ||
| manifest_path = prefix / "input" / "data" / "manifest" / "manifest.toml" | ||
| params_path = prefix / "input" / "config" / "hyperparameters.json" | ||
| output_path = prefix / "model" | ||
|
|
||
| model_path = Path("/model.pkl") | ||
|
|
||
|
|
||
| @app.command() | ||
| def train( | ||
| dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"), | ||
| model_toml_file: str = typer.Option(help="Path to the model TOML file"), | ||
| dataset_zip_file: str = typer.Option( | ||
| default="", help="Path to the dataset ZIP file" | ||
| ), | ||
| model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"), | ||
| ): | ||
| console.print(f"Loading {dataset_toml_file} and {model_toml_file}...") | ||
| console.print(f"Loading {dataset_zip_file} and {model_toml_file}...") | ||
|
|
||
| dataset_name = Manifest.from_path(dataset_toml_file).name | ||
| dataset_zip_file = dataset_zip_file or training_data_path | ||
| dataset = Dataset.from_path(dataset_zip_file) | ||
| dataset_name = dataset.name | ||
|
|
||
| model_path = "/model.pkl" | ||
| model_name = ModelManifest.from_path(model_toml_file).name | ||
| model_toml_file = model_toml_file or manifest_path | ||
| hyper_params = Manifest.from_path(model_toml_file).hyper_params | ||
| model_name = hyper_params["name"] | ||
|
|
||
| train_X, train_Y = load_x_and_y( | ||
| dataset_toml_file=dataset_toml_file, | ||
| dataset=dataset, | ||
| split=TrainTestValid.train, | ||
| ) | ||
|
|
||
|
|
@@ -39,14 +52,14 @@ def train( | |
| train_model( | ||
| train_X=train_X, | ||
| train_Y=train_Y, | ||
| model_toml_file=model_toml_file, | ||
| model_path=model_path, | ||
| hyper_params=hyper_params, | ||
| ) | ||
|
|
||
| console.print("Finished the training...") | ||
|
|
||
| valid_X, valid_Y = load_x_and_y( | ||
| dataset_toml_file=dataset_toml_file, | ||
| dataset=dataset, | ||
| split=TrainTestValid.valid, | ||
| ) | ||
|
|
||
|
|
@@ -56,8 +69,8 @@ def train( | |
|
|
||
| pred_y = predict_model( | ||
| test_X=valid_X, | ||
| model_toml_file=model_toml_file, | ||
| model_path=model_path, | ||
| hyper_params=hyper_params, | ||
| ) | ||
|
|
||
| console.print("Finished the scoring...") | ||
|
|
@@ -70,8 +83,10 @@ def train( | |
| } | ||
| ) | ||
|
|
||
| df.write_csv(f"/output/{dataset_name}_{model_name}.csv") | ||
| console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv") | ||
| df.write_csv(f"{output_path}/{dataset_name}_{model_name}.csv") | ||
| console.print( | ||
| f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv" | ||
| ) | ||
|
|
||
| console.print("Done.") | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,12 @@ | ||
| from pydantic import BaseModel, Field | ||
| from pydantic import BaseModel, Field, ConfigDict | ||
| from pathlib import Path | ||
| from typing import Self, Any | ||
| import toml | ||
|
|
||
|
|
||
| class Manifest(BaseModel): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The manifest should probably go into the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've put them into pg2-benchmark! It is a good point: in the future we will update it with model cards, so it is sensible to put it in pg2-benchmark. 🤔 |
||
| name: str = "" | ||
| model_config = ConfigDict(extra="allow") | ||
|
|
||
| hyper_params: dict[str, Any] = Field(default_factory=dict) | ||
|
|
||
| @classmethod | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.