Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,6 @@ cython_debug/

# PyPI configuration file
.pypirc

# VS Code workspace files
*.code-workspace
7 changes: 6 additions & 1 deletion files/config.toml.example
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,17 @@ time_format = "seconds" # Format for predicted time output
[model.xgboost]
n_estimators = 1003 # Number of boosting rounds
learning_rate = 0.2415 # Step size shrinkage used to prevent overfitting
max_depth = 5 # Maximum depth of a tree
max_bin = 256 # Max number of bins that feature values will be bucketed in
max_leaves = 64 # Maximum tree leaves for base learners
subsample = 0.5693 # Subsample ratio of the training instances
colsample_bytree = 0.6181 # Subsample ratio of columns when constructing each tree
reg_alpha = 0.0305 # L1 regularization term on weights
reg_lambda = 7.6076 # L2 regularization term on weights
min_child_weight = 1.101 # Minimum sum of instance weight needed in a child
gamma = 1.904 # Minimum loss reduction required to make a further partition on a leaf node
early_stopping_rounds = 10 # Number of rounds for early stopping
size_penalty_enabled = true # Enable penalization of model size in Optuna objective for XGBoost.
size_penalty_lambda = 1 # Penalty added to the objective per 100k nodes in the model.
[model.xgboost.params] # Optional model parameters
objective = "reg:squarederror"
tree_method = "hist"
Expand All @@ -65,6 +68,8 @@ lambda_l1 = 0.0005 # L1 regularization term on weights
lambda_l2 = 0.0001 # L2 regularization term on weights
max_bin = 282 # Max number of bins that feature values will be bucketed in
early_stopping_rounds = 10 # Number of rounds for early stopping
size_penalty_enabled = true # Enable penalization of model size in Optuna objective for LightGBM.
size_penalty_lambda = 1 # Penalty added to the objective per 100k nodes in the model.
[model.lightgbm.params] # Optional model parameters
objective = "reg:squarederror"
tree_method = "hist"
Expand Down
30 changes: 30 additions & 0 deletions rpmeta/cli/run.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import code
import json
import logging
import sys
from pathlib import Path
from typing import Optional

Expand Down Expand Up @@ -252,3 +254,31 @@ def visualize(ctx: click.Context, dataset: Path):
result_handler.plot_predictions()
result_handler.plot_distribution()
result_handler.save_best_json()


@run.command("shell")
@click.pass_context
def shell(ctx: click.Context):
    """
    Start an interactive Python shell with the model loaded.

    Drops the user into a REPL where the click context, the loaded
    predictor, the active configuration, and the convenience names
    ``InputRecord`` and ``pd`` are available as local variables.
    """
    # Names exposed inside the REPL; insertion order drives the banner listing.
    namespace = {
        "ctx": ctx,
        "predictor": ctx.obj.predictor,
        "config": ctx.obj.config,
        "InputRecord": InputRecord,
        "pd": pd,
    }

    available = "\n".join(f" - {key}: {value!s}" for key, value in namespace.items())
    code.interact(
        banner=(
            "Interactive RPMeta Shell\n"
            f"Python {sys.version}\n"
            "The following variables are available:\n"
            f"{available}"
        ),
        local=namespace,
    )
26 changes: 23 additions & 3 deletions rpmeta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,13 @@ class XGBoostParams(ModelParams):
default=0.2415,
description="Step size shrinkage used to prevent overfitting",
)
max_depth: int = Field(
default=5,
description="Maximum depth of a tree",
max_bin: int = Field(
default=256,
description="Max number of bins that feature values will be bucketed in",
)
max_leaves: int = Field(
default=64,
description="Maximum tree leaves for base learners",
)
subsample: float = Field(
default=0.5693,
Expand Down Expand Up @@ -98,6 +102,14 @@ class XGBoostParams(ModelParams):
description="Number of rounds for early stopping",
examples=[10, 20, 50],
)
size_penalty_enabled: bool = Field(
default=True,
description="Enable penalization of model size in Optuna objective for XGBoost.",
)
size_penalty_lambda: float = Field(
default=1,
description="How much cost 100k of nodes (added penalty).",
)


class LightGBMParams(ModelParams):
Expand Down Expand Up @@ -148,6 +160,14 @@ class LightGBMParams(ModelParams):
description="Number of rounds for early stopping",
examples=[10, 20, 50],
)
size_penalty_enabled: bool = Field(
default=True,
description="Enable penalization of model size in Optuna objective for LightGBM.",
)
size_penalty_lambda: float = Field(
default=1,
description="How much cost 100k of nodes (added penalty).",
)


class ModelBehavior(BaseModel):
Expand Down
18 changes: 15 additions & 3 deletions rpmeta/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,20 +208,32 @@ def to_model_dict(self) -> dict[str, Any]:
"hw_info": self.hw_info.model_dump(),
}

def to_data_frame(self, category_maps: dict[str, list[str]]) -> pd.DataFrame:
def to_data_frame(
self,
category_maps: dict[str, list[str]],
category_dtypes: Optional[dict[str, pd.CategoricalDtype]] = None,
) -> pd.DataFrame:
"""
Convert the record to a pandas DataFrame that the model understands.
This is used for prediction.

Args:
category_maps: Mapping of column name to list of categories.
category_dtypes: Pre-created CategoricalDtype objects.
Falls back to building from category_maps if not provided.
"""
df = pd.json_normalize(self.model_dump())
df["os"] = self.os
df["os_family"] = self.os_family
df["os_version"] = self.os_version
df["os_arch"] = self.os_arch

# preprocess
for col, cat_list in category_maps.items():
dtype = pd.CategoricalDtype(categories=cat_list, ordered=False)
if category_dtypes is None:
dtype = pd.CategoricalDtype(categories=cat_list, ordered=False)
else:
dtype = category_dtypes[col]

df[col] = df[col].astype(dtype)

df["hw_info.ram"] = df["hw_info.ram"].astype("float32")
Expand Down
Loading