diff --git a/.gitignore b/.gitignore
index a00be42b73..1d59ccd3fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ http_service/models/*model*
 data/
 sheets/
+wandb/
 
 .mypy_cache/
 .pytest_cache/
 
diff --git a/bugbug/model.py b/bugbug/model.py
index ed73b955e4..d19950a6a7 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -25,8 +25,10 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from tabulate import tabulate
+from wandb.xgboost import WandbCallback
 from xgboost import XGBModel
 
+import wandb
 from bugbug import bugzilla, db, repository
 from bugbug.github import Github
 from bugbug.nlp import SpacyVectorizer
@@ -363,6 +365,9 @@ def get_labels(self) -> tuple[dict[Any, Any], list[Any]]:
         raise NotImplementedError("The model must implement this method")
 
     def train(self, importance_cutoff=0.15, limit=None):
+        run = wandb.init(
+            project="bugbug-john-test", config={"importance_cutoff": importance_cutoff}
+        )
         classes, self.class_names = self.get_labels()
         self.class_names = sort_class_names(self.class_names)
 
@@ -414,7 +419,19 @@ def train(self, importance_cutoff=0.15, limit=None):
         logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
         logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
 
+        if (
+            run.sweep_id
+        ):  # if we are running a grid search from wandb, use the hyperparameter options from there
+            self.clf.named_steps["estimator"].set_params(
+                max_depth=wandb.config.max_depth,
+                colsample_bytree=wandb.config.colsample_bytree,
+            )
+
+        self.clf.named_steps["estimator"].set_params(
+            callbacks=[WandbCallback(log_model=True)]
+        )
         self.clf.fit(X_train, self.le.transform(y_train))
+        logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
 
         logger.info("Model trained")
 
@@ -596,6 +613,12 @@ def train(self, importance_cutoff=0.15, limit=None):
         with open(model_path, "wb") as f:
             pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
 
+        wandb.summary["Model"] = model_directory
+
+        artifact = wandb.Artifact(name=model_directory, type="data")
+        artifact.add_file(model_path)
+        run.log_artifact(artifact)
+
         if self.store_dataset:
             with open(f"{self.__class__.__name__.lower()}_data_X", "wb") as f:
                 pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)
@@ -603,7 +626,8 @@ def train(self, importance_cutoff=0.15, limit=None):
         with open(f"{self.__class__.__name__.lower()}_data_y", "wb") as f:
             pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)
 
-        return tracking_metrics
+        wandb.log(tracking_metrics)
+        return tracking_metrics, run
 
     @staticmethod
     def load(model_directory: str) -> "Model":
diff --git a/requirements.txt b/requirements.txt
index 8cfc4b6b35..80b343228c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,5 +29,6 @@ tabulate==0.9.0
 taskcluster==59.2.0
 tenacity==8.2.3
 tqdm==4.66.1
+wandb==0.16.1
 xgboost==2.0.3
 zstandard==0.22.0
diff --git a/scripts/trainer.py b/scripts/trainer.py
index c7158d0a01..cc3752d223 100644
--- a/scripts/trainer.py
+++ b/scripts/trainer.py
@@ -7,6 +7,7 @@
 import sys
 from logging import INFO, basicConfig, getLogger
 
+import wandb
 from bugbug import db
 from bugbug.models import MODELS, get_model_class
 from bugbug.utils import CustomJsonEncoder, create_tar_zst, zstd_compress
@@ -38,13 +39,16 @@ def go(self, args):
             logger.info("Skipping download of the databases")
 
         logger.info("Training *%s* model", model_name)
-        metrics = model_obj.train(limit=args.limit)
-
+        metrics, wandb_run = model_obj.train(limit=args.limit)
         # Save the metrics as a file that can be uploaded as an artifact.
         metric_file_path = "metrics.json"
         with open(metric_file_path, "w") as metric_file:
             json.dump(metrics, metric_file, cls=CustomJsonEncoder)
 
+        artifact = wandb.Artifact(name="metrics_file", type="data")
+        artifact.add_file("metrics.json")
+        wandb_run.log_artifact(artifact)
+
         logger.info("Training done")
 
         model_directory = f"{model_name}model"