diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py
index b47085a623..35427acf84 100644
--- a/bugbug/bugzilla.py
+++ b/bugbug/bugzilla.py
@@ -409,6 +409,23 @@ def get_groups_users(group_names: list[str]) -> list[str]:
     ]
 
 
+def get_comment(comment_id: int) -> dict:
+    """Fetch one comment from the Bugzilla REST API.
+
+    Returns the "comments" member of the JSON response; callers treat it as
+    a mapping whose values are the comment objects. An HTTP error status is
+    raised by ``raise_for_status()`` rather than swallowed.
+    """
+    r = utils.get_session("bugzilla").get(
+        f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}",
+        headers={"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"},
+    )
+    # TODO: Do we raise the error or we just ignore the comment?
+    r.raise_for_status()
+
+    return r.json()["comments"]
+
+
 def get_revision_ids(bug: BugDict) -> list[int]:
     revision_ids = []
 
diff --git a/bugbug/model.py b/bugbug/model.py
index 92de0c34f2..1176919e64 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -803,3 +803,25 @@ def items_gen(self, classes):
                 continue
 
             yield issue, classes[issue_number]
+
+
+class CommentModel(Model):
+    """Model base whose items are (bug, comment) pairs from the bugs DB."""
+
+    def __init__(self, lemmatization=False):
+        Model.__init__(self, lemmatization)
+        self.training_dbs = [bugzilla.BUGS_DB]
+
+    def items_gen(self, classes):
+        """Yield ``((bug, comment), label)`` for every labelled comment.
+
+        ``classes`` maps comment IDs to labels; comments whose ID is not in
+        it are skipped.
+        """
+        for bug in bugzilla.get_bugs():
+            for comment in bug["comments"]:
+                comment_id = comment["id"]
+                if comment_id not in classes:
+                    continue
+
+                yield (bug, comment), classes[comment_id]
diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py
index 5f7c8b6ea8..14afe83ef9 100644
--- a/http_service/bugbug_http/models.py
+++ b/http_service/bugbug_http/models.py
@@ -183,6 +183,82 @@ def classify_issue(
     return "OK"
 
 
+def classify_comment(
+    model_name: str, comment_ids: Sequence[int], bugzilla_token: str
+) -> str:
+    """Classify Bugzilla comments and store one result per comment ID.
+
+    Args:
+        model_name: Key of the model to look up in ``MODEL_CACHE``.
+        comment_ids: IDs of the comments to classify.
+        bugzilla_token: API token passed to ``bugzilla.set_token``.
+
+    Returns:
+        "OK" once the results are stored, "NOK" when no comment was fetched
+        or the model is not in the cache.
+    """
+    from bugbug_http.app import JobInfo
+
+    # This should be called in a process worker so it should be safe to set
+    # the token here
+    comment_ids_set = set(map(int, comment_ids))
+    bugzilla.set_token(bugzilla_token)
+
+    # Key every fetched comment by its own ID and keep the comment object
+    # itself (not a ``dict_values`` view), so that the missing-comment check
+    # below is meaningful and the model receives the comments.
+    # NOTE(review): assumes ``comment["id"]`` is an int - confirm.
+    comments = {}
+    for comment_id in comment_ids_set:
+        for comment in bugzilla.get_comment(comment_id).values():
+            comments[comment["id"]] = comment
+
+    missing_comments = comment_ids_set.difference(comments)
+
+    for comment_id in missing_comments:
+        job = JobInfo(classify_comment, model_name, comment_id)
+
+        # TODO: Find a better error format
+        setkey(job.result_key, orjson.dumps({"available": False}))
+
+    if not comments:
+        return "NOK"
+
+    model = MODEL_CACHE.get(model_name)
+
+    if not model:
+        LOGGER.info("Missing model %r, aborting", model_name)
+        return "NOK"
+
+    model_extra_data = model.get_extra_data()
+
+    # TODO: Classify could choke on a single comment which could make the
+    # whole job fail. What should we do here?
+    probs = model.classify(list(comments.values()), True)
+    indexes = probs.argmax(axis=-1)
+    suggestions = model.le.inverse_transform(indexes)
+
+    probs_list = probs.tolist()
+    indexes_list = indexes.tolist()
+    suggestions_list = suggestions.tolist()
+
+    for i, comment_id in enumerate(comments):
+        data = {
+            "prob": probs_list[i],
+            "index": indexes_list[i],
+            "class": suggestions_list[i],
+            "extra_data": model_extra_data,
+        }
+
+        job = JobInfo(classify_comment, model_name, comment_id)
+        setkey(job.result_key, orjson.dumps(data), compress=True)
+
+        # TODO: Save the comment last change
+        # We shall need to update one of the comment keys to show an updated comment
+
+    return "OK"
+
+
 def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str:
     from bugbug_http.app import JobInfo
 
diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py
new file mode 100644
index 0000000000..007a2ac0c1
--- /dev/null
+++ b/scripts/comment_classifier.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+from logging import INFO, basicConfig, getLogger
+from typing import Optional
+
+import numpy as np
+import requests
+
+from bugbug import bugzilla, db
+from bugbug.models import get_model_class
+from bugbug.utils import download_model
+
+basicConfig(level=INFO)
+logger = getLogger(__name__)
+
+
+def classify_comments(model_name: str, comment_id: Optional[int] = None) -> None:
+    """Classify one comment, or every comment of the bugs DB, interactively.
+
+    Args:
+        model_name: Name of the model; it is loaded from ``{model_name}model``
+            and downloaded first when that file does not exist.
+        comment_id: ID of the comment to classify. When ``None``, the bugs
+            database is downloaded and all of its comments are classified.
+
+    Raises:
+        SystemExit: If the model, the requested comment or the bugs database
+            cannot be obtained.
+    """
+    model_file_name = f"{model_name}model"
+
+    if not os.path.exists(model_file_name):
+        logger.info("%s does not exist. Downloading the model....", model_file_name)
+        try:
+            download_model(model_name)
+        except requests.HTTPError:
+            logger.error(
+                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
+            )
+            raise SystemExit(1)
+
+    model_class = get_model_class(model_name)
+    model = model_class.load(model_file_name)
+
+    # Compare against None so that an explicit ID of 0 is not mistaken for
+    # "no ID given".
+    if comment_id is not None:
+        # Get a comment by its id
+        comments = list(bugzilla.get_comment(comment_id).values())
+        # Explicit check instead of ``assert``, which ``python -O`` strips.
+        if not comments:
+            logger.error("A comment with a comment id of %s was not found", comment_id)
+            raise SystemExit(1)
+    else:
+        # The download has to run even under ``python -O``, so it must not
+        # live inside an ``assert``.
+        if not db.download(bugzilla.BUGS_DB):
+            logger.error("Unable to download the bugs database")
+            raise SystemExit(1)
+        bugs = bugzilla.get_bugs()
+        comments = [
+            {**comment, "bug_id": bug["id"]}
+            for bug in bugs
+            for comment in bug["comments"]
+        ]
+
+    for comment in comments:
+        print(
+            f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#c{comment["count"]}'
+        )
+
+        if model.calculate_importance:
+            probas, importance = model.classify(
+                comment, probabilities=True, importances=True
+            )
+
+            model.print_feature_importances(
+                importance["importances"], class_probabilities=probas
+            )
+        else:
+            probas = model.classify(comment, probabilities=True, importances=False)
+
+        probability = probas[0]
+        pred_index = np.argmax(probability)
+        if len(probability) > 2:
+            pred_class = model.le.inverse_transform([pred_index])[0]
+        else:
+            pred_class = "Positive" if pred_index == 1 else "Negative"
+        print(f"{pred_class} {probability}")
+        # Wait for <Enter> before moving on to the next comment.
+        input()
+
+
+def main() -> None:
+    """Parse the command line and classify the requested comment(s)."""
+    description = "Perform evaluation on comments using the specified model"
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument("model", help="Which model to use for evaluation")
+    parser.add_argument("--comment-id", help="Classify the given comment id", type=int)
+
+    args = parser.parse_args()
+
+    classify_comments(args.model, args.comment_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index 22919e5481..d80bc620ea 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ def read_requirements(file_):
             "bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main",
             "bugbug-microannotate-generate = scripts.microannotate_generator:main",
             "bugbug-classify-commit = scripts.commit_classifier:main",
+            "bugbug-classify-comment = scripts.comment_classifier:main",
             "bugbug-classify-bug = scripts.bug_classifier:main",
             "bugbug-regressor-finder = scripts.regressor_finder:main",
             "bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main",