Integrate BugBug with Weights & Biases #3957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft · wants to merge 4 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -16,6 +16,7 @@ http_service/models/*model*
 
 data/
 sheets/
+wandb/
 
 .mypy_cache/
 .pytest_cache/
26 changes: 25 additions & 1 deletion bugbug/model.py
@@ -25,8 +25,10 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from tabulate import tabulate
+from wandb.xgboost import WandbCallback
 from xgboost import XGBModel
 
+import wandb
 from bugbug import bugzilla, db, repository
 from bugbug.github import Github
 from bugbug.nlp import SpacyVectorizer
@@ -363,6 +365,9 @@ def get_labels(self) -> tuple[dict[Any, Any], list[Any]]:
         raise NotImplementedError("The model must implement this method")
 
     def train(self, importance_cutoff=0.15, limit=None):
+        run = wandb.init(
+            project="bugbug-john-test", config={"importance_cutoff": importance_cutoff}
+        )
         classes, self.class_names = self.get_labels()
         self.class_names = sort_class_names(self.class_names)
 
@@ -414,7 +419,19 @@ def train(self, importance_cutoff=0.15, limit=None):
         logger.info(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
         logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
 
+        if (
+            run.sweep_id
+        ):  # if we are running a grid search from wandb, use the hyperparameter options from there
+            self.clf.named_steps["estimator"].set_params(
+                max_depth=wandb.config.max_depth,
+                colsample_bytree=wandb.config.colsample_bytree,
+            )
+
+        self.clf.named_steps["estimator"].set_params(
+            callbacks=[WandbCallback(log_model=True)]
+        )
         self.clf.fit(X_train, self.le.transform(y_train))
+
         logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
 
         logger.info("Model trained")
@@ -596,14 +613,21 @@ def train(self, importance_cutoff=0.15, limit=None):
         with open(model_path, "wb") as f:
             pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
 
+        wandb.summary["Model"] = model_directory
+
+        artifact = wandb.Artifact(name=model_directory, type="data")
+        artifact.add_file(model_path)
+        run.log_artifact(artifact)
+
         if self.store_dataset:
             with open(f"{self.__class__.__name__.lower()}_data_X", "wb") as f:
                 pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)
 
             with open(f"{self.__class__.__name__.lower()}_data_y", "wb") as f:
                 pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)
 
-        return tracking_metrics
+        wandb.log(tracking_metrics)
+        return tracking_metrics, run
 
     @staticmethod
     def load(model_directory: str) -> "Model":
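For context: the run.sweep_id branch above only fires when train() is invoked by a wandb sweep agent, which supplies max_depth and colsample_bytree through wandb.config. A minimal sketch of launching such a sweep, assuming illustrative grid values and the "defect" model name (neither is part of this diff):

import wandb

from bugbug.models import get_model_class

# Grid over the two hyperparameters that train() reads from wandb.config.
# The value grids below are illustrative assumptions, not from this PR.
sweep_config = {
    "method": "grid",
    "parameters": {
        "max_depth": {"values": [3, 6, 9]},
        "colsample_bytree": {"values": [0.5, 0.8, 1.0]},
    },
}


def train_once():
    # Inside an agent, the wandb.init() call in Model.train() attaches to
    # the sweep run, so run.sweep_id is set and the sweep's hyperparameter
    # values are applied to the XGBoost estimator.
    model_class = get_model_class("defect")  # "defect" is an example model name
    model_class().train()


sweep_id = wandb.sweep(sweep_config, project="bugbug-john-test")
wandb.agent(sweep_id, function=train_once)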
1 change: 1 addition & 0 deletions requirements.txt
@@ -29,5 +29,6 @@ tabulate==0.9.0
 taskcluster==59.2.0
 tenacity==8.2.3
 tqdm==4.66.1
+wandb==0.16.1
 xgboost==2.0.3
 zstandard==0.22.0
8 changes: 6 additions & 2 deletions scripts/trainer.py
@@ -7,6 +7,7 @@
 import sys
 from logging import INFO, basicConfig, getLogger
 
+import wandb
 from bugbug import db
 from bugbug.models import MODELS, get_model_class
 from bugbug.utils import CustomJsonEncoder, create_tar_zst, zstd_compress
@@ -38,13 +39,16 @@ def go(self, args):
             logger.info("Skipping download of the databases")
 
         logger.info("Training *%s* model", model_name)
-        metrics = model_obj.train(limit=args.limit)
-
+        metrics, wandb_run = model_obj.train(limit=args.limit)
         # Save the metrics as a file that can be uploaded as an artifact.
         metric_file_path = "metrics.json"
         with open(metric_file_path, "w") as metric_file:
            json.dump(metrics, metric_file, cls=CustomJsonEncoder)
 
+        artifact = wandb.Artifact(name="metrics_file", type="data")
+        artifact.add_file("metrics.json")
+        wandb_run.log_artifact(artifact)
+
         logger.info("Training done")
 
         model_directory = f"{model_name}model"
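For completeness, the model and metrics artifacts logged above can be retrieved later through the wandb public API. A minimal sketch, assuming the bugbug-john-test project from model.py and the default entity (adjust the artifact path to whichever account the runs were logged under):

import wandb

# Fetch the latest metrics_file artifact logged by scripts/trainer.py.
api = wandb.Api()
artifact = api.artifact("bugbug-john-test/metrics_file:latest", type="data")
local_dir = artifact.download()  # directory containing metrics.json
print(f"metrics downloaded to {local_dir}")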