Integrate BugBug with Weights & Biases #3957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft · wants to merge 4 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -16,6 +16,7 @@ http_service/models/*model*
 
 data/
 sheets/
+wandb/
 
 .mypy_cache/
 .pytest_cache/
26 changes: 25 additions & 1 deletion bugbug/model.py
@@ -25,8 +25,10 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from tabulate import tabulate
+from wandb.xgboost import WandbCallback
 from xgboost import XGBModel
 
+import wandb
 from bugbug import bugzilla, db, repository
 from bugbug.github import Github
 from bugbug.nlp import SpacyVectorizer
@@ -363,6 +365,9 @@ def get_labels(self) -> tuple[dict[Any, Any], list[Any]]:
         raise NotImplementedError("The model must implement this method")
 
     def train(self, importance_cutoff=0.15, limit=None):
+        run = wandb.init(
+            project="bugbug-john-test", config={"importance_cutoff": importance_cutoff}
+        )
         classes, self.class_names = self.get_labels()
         self.class_names = sort_class_names(self.class_names)
 
@@ -414,7 +419,19 @@ def train(self, importance_cutoff=0.15, limit=None):
         logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
         logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
 
+        if (
+            run.sweep_id
+        ):  # if we are running a grid search from wandb, use the hyperparameter options from there
+            self.clf.named_steps["estimator"].set_params(
+                max_depth=wandb.config.max_depth,
+                colsample_bytree=wandb.config.colsample_bytree,
+            )
+
+        self.clf.named_steps["estimator"].set_params(
+            callbacks=[WandbCallback(log_model=True)]
+        )
         self.clf.fit(X_train, self.le.transform(y_train))
+
         logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
 
         logger.info("Model trained")
@@ -596,14 +613,21 @@ def train(self, importance_cutoff=0.15, limit=None):
         with open(model_path, "wb") as f:
             pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
 
+        wandb.summary["Model"] = model_directory
+
+        artifact = wandb.Artifact(name=model_directory, type="data")
+        artifact.add_file(model_path)
+        run.log_artifact(artifact)
+
         if self.store_dataset:
             with open(f"{self.__class__.__name__.lower()}_data_X", "wb") as f:
                 pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)
 
             with open(f"{self.__class__.__name__.lower()}_data_y", "wb") as f:
                 pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)
 
-        return tracking_metrics
+        wandb.log(tracking_metrics)
+        return tracking_metrics, run
 
     @staticmethod
     def load(model_directory: str) -> "Model":
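For context: the run.sweep_id branch above only fires when train() is invoked by a wandb sweep agent, which supplies max_depth and colsample_bytree through wandb.config. A minimal sketch of launching such a sweep, assuming illustrative grid values and the "defect" model name (neither is part of this diff):

import wandb

from bugbug.models import get_model_class

# Grid over the two hyperparameters that train() reads from wandb.config.
# The value grids below are illustrative assumptions, not from this PR.
sweep_config = {
    "method": "grid",
    "parameters": {
        "max_depth": {"values": [3, 6, 9]},
        "colsample_bytree": {"values": [0.5, 0.8, 1.0]},
    },
}


def train_once():
    # Inside an agent, the wandb.init() call in Model.train() attaches to
    # the sweep run, so run.sweep_id is set and the sweep's hyperparameter
    # values are applied to the XGBoost estimator.
    model_class = get_model_class("defect")  # "defect" is an example model name
    model_class().train()


sweep_id = wandb.sweep(sweep_config, project="bugbug-john-test")
wandb.agent(sweep_id, function=train_once)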
1 change: 1 addition & 0 deletions requirements.txt
@@ -29,5 +29,6 @@ tabulate==0.9.0
 taskcluster==59.2.0
 tenacity==8.2.3
 tqdm==4.66.1
+wandb==0.16.1
 xgboost==2.0.3
 zstandard==0.22.0
8 changes: 6 additions & 2 deletions scripts/trainer.py
@@ -7,6 +7,7 @@
 import sys
 from logging import INFO, basicConfig, getLogger
 
+import wandb
 from bugbug import db
 from bugbug.models import MODELS, get_model_class
 from bugbug.utils import CustomJsonEncoder, create_tar_zst, zstd_compress
@@ -38,13 +39,16 @@ def go(self, args):
             logger.info("Skipping download of the databases")
 
         logger.info("Training *%s* model", model_name)
-        metrics = model_obj.train(limit=args.limit)
-
+        metrics, wandb_run = model_obj.train(limit=args.limit)
         # Save the metrics as a file that can be uploaded as an artifact.
         metric_file_path = "metrics.json"
         with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)
 
+        artifact = wandb.Artifact(name="metrics_file", type="data")
+        artifact.add_file("metrics.json")
+        wandb_run.log_artifact(artifact)
+
         logger.info("Training done")
 
         model_directory = f"{model_name}model"
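For completeness, the model and metrics artifacts logged above can be retrieved later through the wandb public API. A minimal sketch, assuming the bugbug-john-test project from model.py and the default entity (adjust the artifact path to whichever account the runs were logged under):

import wandb

# Fetch the latest metrics_file artifact logged by scripts/trainer.py.
api = wandb.Api()
artifact = api.artifact("bugbug-john-test/metrics_file:latest", type="data")
local_dir = artifact.download()  # directory containing metrics.json
print(f"metrics downloaded to {local_dir}")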