Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 59 additions & 6 deletions models/esm/src/pg2_model_esm/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import torch
import typer
from pathlib import Path
from typing import Tuple
from rich.console import Console
from pg2_dataset.dataset import Manifest
from tqdm import tqdm
from esm import pretrained
from pg2_model_esm.utils import compute_pppl, label_row
from pg2_model_esm.manifest import Manifest as ModelManifest
import toml
import json


app = typer.Typer(
Expand All @@ -16,13 +20,60 @@
# Console for error messages; writes to stderr.
err_console = Console(stderr=True)
# Console for regular progress messages.
console = Console()

# Root directory under which SageMaker mounts its inputs in the container.
prefix = Path("/opt/ml")
# Directory the dataset and model TOML file names are resolved against.
training_data_path = prefix / "input" / "data" / "training"
# JSON file from which the TOML file names are read when none are passed on
# the command line.
params_path = prefix / "input" / "config" / "hyperparameters.json"
# NOTE(review): presumably where a fitted model is pickled; not referenced in
# the visible part of this module — confirm it is still needed.
model_path = Path("/model.pkl")


def _configure_container_paths(
    dataset_toml_file: str, model_toml_file: str
) -> Tuple[str, str, str]:
    """Resolve the output directory and the two manifest paths for this run.

    When at least one path is supplied, both are passed through unchanged and
    the output directory is ``/output``.  When both are empty, the run is
    treated as a SageMaker training job: the TOML file names are read from
    the hyperparameters JSON, resolved against the training-data directory,
    and the dataset TOML's ``assays_meta.file_path`` is rewritten in place so
    that it points inside that directory.

    Args:
        dataset_toml_file: Path to the dataset TOML file, or ``""``.
        model_toml_file: Path to the model TOML file, or ``""``.

    Returns:
        ``(output_path, dataset_toml_file, model_toml_file)`` as strings.

    Raises:
        KeyError: If the hyperparameters JSON lacks ``dataset_toml_file`` or
            ``model_toml_file``.
    """
    if dataset_toml_file or model_toml_file:
        # Explicit paths were given on the command line: use them verbatim.
        # If only one of the two was given, the other stays an empty string.
        return str(Path("/output")), str(dataset_toml_file), str(model_toml_file)

    typer.echo(
        "Configuring the paths to where SageMaker mounts interesting things in the container."
    )

    output_path = prefix / "model"

    with open(params_path, "r", encoding="utf-8") as f:
        training_params = json.load(f)

    # Index instead of .get(): a missing key should name itself in a KeyError
    # rather than surface later as an opaque TypeError from `Path / None`.
    dataset_path = training_data_path / training_params["dataset_toml_file"]
    model_path_in_container = training_data_path / training_params["model_toml_file"]

    with open(dataset_path, "r", encoding="utf-8") as f:
        data = toml.load(f)

    assay_file = str(data["assays_meta"]["file_path"])

    # Only rewrite when the path is not already inside the training-data
    # directory; otherwise a second run would prefix it twice.
    if training_data_path not in Path(assay_file).parents:
        # Strip leading slashes so the join nests the file under the
        # training-data directory: an absolute right-hand side would replace
        # the left-hand path, and plain string concatenation would fuse the
        # two when the original value has no leading slash.
        data["assays_meta"]["file_path"] = str(
            training_data_path / assay_file.lstrip("/")
        )

        with open(dataset_path, "w", encoding="utf-8") as f:
            toml.dump(data, f)

    return str(output_path), str(dataset_path), str(model_path_in_container)


@app.command()
def predict(
dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
Comment thread
tintinrevient marked this conversation as resolved.
model_toml_file: str = typer.Option(help="Path to the model TOML file"),
nogpu: bool = typer.Option(False, help="GPUs available"),
def train(
dataset_toml_file: str = typer.Option(
default="", help="Path to the dataset TOML file"
),
model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
nogpu: bool = typer.Option(default=False, help="GPUs available or not"),
):
output_path, dataset_toml_file, model_toml_file = _configure_container_paths(
dataset_toml_file=dataset_toml_file,
model_toml_file=model_toml_file,
)

console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")

manifest = Manifest.from_path(dataset_toml_file)
Expand Down Expand Up @@ -126,9 +177,11 @@ def predict(
err_console.print(f"Error: Invalid scoring strategy: {scoring_strategy}")

df.rename(columns={targets[0]: "test"}, inplace=True)
df.to_csv(f"/output/{dataset_name}_{model_name}.csv", index=False)
df.to_csv(f"/{output_path}/{dataset_name}_{model_name}.csv", index=False)

console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
console.print(
f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
)
console.print("Done.")


Expand Down
65 changes: 58 additions & 7 deletions models/pls/src/pg2_model_pls/__main__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import polars as pl
from pathlib import Path
from rich.console import Console
from pg2_dataset.dataset import Manifest
from pg2_dataset.splits.abstract_split_strategy import TrainTestValid
from pg2_model_pls.manifest import Manifest as ModelManifest
from pg2_model_pls.utils import load_x_and_y, train_model, predict_model

from typing import Tuple
import json
import toml
import typer

app = typer.Typer(
Expand All @@ -14,17 +17,63 @@

# Console for regular progress messages.
console = Console()

# Root directory under which SageMaker mounts its inputs in the container.
prefix = Path("/opt/ml")
# Directory the dataset and model TOML file names are resolved against.
training_data_path = prefix / "input" / "data" / "training"
# JSON file from which the TOML file names are read when none are passed on
# the command line.
params_path = prefix / "input" / "config" / "hyperparameters.json"
# NOTE(review): presumably where the fitted model is pickled; its use is not
# visible in this part of the module — confirm against the training code.
model_path = Path("/model.pkl")


def _configure_container_paths(
    dataset_toml_file: str, model_toml_file: str
) -> Tuple[str, str, str]:
    """Resolve the output directory and the two manifest paths for this run.

    When at least one path is supplied, both are passed through unchanged and
    the output directory is ``/output``.  When both are empty, the run is
    treated as a SageMaker training job: the TOML file names are read from
    the hyperparameters JSON, resolved against the training-data directory,
    and the dataset TOML's ``assays_meta.file_path`` is rewritten in place so
    that it points inside that directory.

    Args:
        dataset_toml_file: Path to the dataset TOML file, or ``""``.
        model_toml_file: Path to the model TOML file, or ``""``.

    Returns:
        ``(output_path, dataset_toml_file, model_toml_file)`` as strings.

    Raises:
        KeyError: If the hyperparameters JSON lacks ``dataset_toml_file`` or
            ``model_toml_file``.
    """
    if dataset_toml_file or model_toml_file:
        # Explicit paths were given on the command line: use them verbatim.
        # If only one of the two was given, the other stays an empty string.
        return str(Path("/output")), str(dataset_toml_file), str(model_toml_file)

    typer.echo(
        "Configuring the paths to where SageMaker mounts interesting things in the container."
    )

    output_path = prefix / "model"

    with open(params_path, "r", encoding="utf-8") as f:
        training_params = json.load(f)

    # Index instead of .get(): a missing key should name itself in a KeyError
    # rather than surface later as an opaque TypeError from `Path / None`.
    dataset_path = training_data_path / training_params["dataset_toml_file"]
    model_path_in_container = training_data_path / training_params["model_toml_file"]

    with open(dataset_path, "r", encoding="utf-8") as f:
        data = toml.load(f)

    assay_file = str(data["assays_meta"]["file_path"])

    # Only rewrite when the path is not already inside the training-data
    # directory; otherwise a second run would prefix it twice.
    if training_data_path not in Path(assay_file).parents:
        # Strip leading slashes so the join nests the file under the
        # training-data directory: an absolute right-hand side would replace
        # the left-hand path, and plain string concatenation would fuse the
        # two when the original value has no leading slash.
        data["assays_meta"]["file_path"] = str(
            training_data_path / assay_file.lstrip("/")
        )

        with open(dataset_path, "w", encoding="utf-8") as f:
            toml.dump(data, f)

    return str(output_path), str(dataset_path), str(model_path_in_container)


@app.command()
def predict(
dataset_toml_file: str = typer.Option(help="Path to the dataset TOML file"),
model_toml_file: str = typer.Option(help="Path to the model TOML file"),
def train(
dataset_toml_file: str = typer.Option(
default="", help="Path to the dataset TOML file"
),
model_toml_file: str = typer.Option(default="", help="Path to the model TOML file"),
):
output_path, dataset_toml_file, model_toml_file = _configure_container_paths(
dataset_toml_file=dataset_toml_file,
model_toml_file=model_toml_file,
)

console.print(f"Loading {dataset_toml_file} and {model_toml_file}...")

dataset_name = Manifest.from_path(dataset_toml_file).name

model_path = "/model.pkl"
model_name = ModelManifest.from_path(model_toml_file).name

train_X, train_Y = load_x_and_y(
Expand Down Expand Up @@ -70,8 +119,10 @@ def predict(
}
)

df.write_csv(f"/output/{dataset_name}_{model_name}.csv")
console.print(f"Saved the metrics in CSV in output/{dataset_name}_{model_name}.csv")
df.write_csv(f"/{output_path}/{dataset_name}_{model_name}.csv")
console.print(
f"Saved the metrics in CSV in {output_path}/{dataset_name}_{model_name}.csv"
)

console.print("Done.")

Expand Down