Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,6 @@ cython_debug/

# PyPI configuration file
.pypirc

# VS Code workspace files
*.code-workspace
7 changes: 6 additions & 1 deletion files/config.toml.example
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,17 @@ time_format = "seconds" # Format for predicted time output
[model.xgboost]
n_estimators = 1003 # Number of boosting rounds
learning_rate = 0.2415 # Step size shrinkage used to prevent overfitting
max_depth = 5 # Maximum depth of a tree
max_bin = 256 # Max number of bins that feature values will be bucketed in
max_leaves = 64 # Maximum tree leaves for base learners
subsample = 0.5693 # Subsample ratio of the training instances
colsample_bytree = 0.6181 # Subsample ratio of columns when constructing each tree
reg_alpha = 0.0305 # L1 regularization term on weights
reg_lambda = 7.6076 # L2 regularization term on weights
min_child_weight = 1.101 # Minimum sum of instance weight needed in a child
gamma = 1.904 # Minimum loss reduction required to make a further partition on a leaf node
early_stopping_rounds = 10 # Number of rounds for early stopping
size_penalty_enabled = true # Enable penalization of model size in Optuna objective for XGBoost.
size_penalty_lambda = 1 # Penalty added to the objective per 100k nodes in the model.
[model.xgboost.params] # Optional model parameters
objective = "reg:squarederror"
tree_method = "hist"
Expand All @@ -65,6 +68,8 @@ lambda_l1 = 0.0005 # L1 regularization term on weights
lambda_l2 = 0.0001 # L2 regularization term on weights
max_bin = 282 # Max number of bins that feature values will be bucketed in
early_stopping_rounds = 10 # Number of rounds for early stopping
size_penalty_enabled = true # Enable penalization of model size in Optuna objective for LightGBM.
size_penalty_lambda = 1 # Penalty added to the objective per 100k nodes in the model.
[model.lightgbm.params] # Optional model parameters
objective = "reg:squarederror"
tree_method = "hist"
Expand Down
30 changes: 30 additions & 0 deletions rpmeta/cli/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import code
import json
import logging
import sys
from pathlib import Path
from typing import Optional

Expand Down Expand Up @@ -252,3 +254,31 @@ def visualize(ctx: click.Context, dataset: Path):
result_handler.plot_predictions()
result_handler.plot_distribution()
result_handler.save_best_json()


@run.command("shell")
@click.pass_context
def shell(ctx: click.Context):
    """
    Start an interactive Python shell with the model loaded.

    Drops the user into a REPL where the click context, the loaded
    predictor, the active configuration, and the convenience names
    ``InputRecord`` and ``pd`` are available as local variables.
    """
    # Names exposed inside the REPL; insertion order drives the banner listing.
    namespace = {
        "ctx": ctx,
        "predictor": ctx.obj.predictor,
        "config": ctx.obj.config,
        "InputRecord": InputRecord,
        "pd": pd,
    }

    available = "\n".join(f" - {key}: {value!s}" for key, value in namespace.items())
    code.interact(
        banner=(
            "Interactive RPMeta Shell\n"
            f"Python {sys.version}\n"
            "The following variables are available:\n"
            f"{available}"
        ),
        local=namespace,
    )
26 changes: 23 additions & 3 deletions rpmeta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,13 @@ class XGBoostParams(ModelParams):
default=0.2415,
description="Step size shrinkage used to prevent overfitting",
)
max_depth: int = Field(
default=5,
description="Maximum depth of a tree",
max_bin: int = Field(
default=256,
description="Max number of bins that feature values will be bucketed in",
)
max_leaves: int = Field(
default=64,
description="Maximum tree leaves for base learners",
)
subsample: float = Field(
default=0.5693,
Expand Down Expand Up @@ -98,6 +102,14 @@ class XGBoostParams(ModelParams):
description="Number of rounds for early stopping",
examples=[10, 20, 50],
)
size_penalty_enabled: bool = Field(
default=True,
description="Enable penalization of model size in Optuna objective for XGBoost.",
)
size_penalty_lambda: float = Field(
default=1,
description="How much cost 100k of nodes (added penalty).",
)


class LightGBMParams(ModelParams):
Expand Down Expand Up @@ -148,6 +160,14 @@ class LightGBMParams(ModelParams):
description="Number of rounds for early stopping",
examples=[10, 20, 50],
)
size_penalty_enabled: bool = Field(
default=True,
description="Enable penalization of model size in Optuna objective for LightGBM.",
)
size_penalty_lambda: float = Field(
default=1,
description="How much cost 100k of nodes (added penalty).",
)


class ModelBehavior(BaseModel):
Expand Down
18 changes: 15 additions & 3 deletions rpmeta/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,20 +208,32 @@ def to_model_dict(self) -> dict[str, Any]:
"hw_info": self.hw_info.model_dump(),
}

def to_data_frame(self, category_maps: dict[str, list[str]]) -> pd.DataFrame:
def to_data_frame(
self,
category_maps: dict[str, list[str]],
category_dtypes: Optional[dict[str, pd.CategoricalDtype]] = None,
) -> pd.DataFrame:
"""
Convert the record to a pandas DataFrame that the model understands.
This is used for prediction.

Args:
category_maps: Mapping of column name to list of categories.
category_dtypes: Pre-created CategoricalDtype objects.
Falls back to building from category_maps if not provided.
"""
df = pd.json_normalize(self.model_dump())
df["os"] = self.os
df["os_family"] = self.os_family
df["os_version"] = self.os_version
df["os_arch"] = self.os_arch

# preprocess
for col, cat_list in category_maps.items():
dtype = pd.CategoricalDtype(categories=cat_list, ordered=False)
if category_dtypes is None:
dtype = pd.CategoricalDtype(categories=cat_list, ordered=False)
else:
dtype = category_dtypes[col]

df[col] = df[col].astype(dtype)

df["hw_info.ram"] = df["hw_info.ram"].astype("float32")
Expand Down
Loading