mozilla · jpangas · Mar 13, 2024 · Mar 13, 2024 · Apr 2, 2024 · suhaibmujahid
diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py
@@ -409,6 +409,17 @@ def get_groups_users(group_names: list[str]) -> list[str]:
     ]
 
 
+def get_comment(comment_id) -> dict:
+    """Return the "comments" entry of Bugzilla's response for *comment_id*."""
+    response = utils.get_session("bugzilla").get(
+        f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}",
+        headers={"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"},
+    )
+    # TODO: Do we raise the error or we just ignore the comment?
+    response.raise_for_status()
+    return response.json()["comments"]
+
+
 def get_revision_ids(bug: BugDict) -> list[int]:
     revision_ids = []
 

diff --git a/bugbug/model.py b/bugbug/model.py
@@ -803,3 +803,18 @@ def items_gen(self, classes):
                 continue
 
             yield issue, classes[issue_number]
+
+
+class CommentModel(Model):
+    """Model whose training items are (bug, comment) pairs from the bugs DB."""
+
+    def __init__(self, lemmatization=False):
+        Model.__init__(self, lemmatization)
+        self.training_dbs = [bugzilla.BUGS_DB]
+
+    def items_gen(self, classes):
+        """Yield ``((bug, comment), label)`` for every comment ID in *classes*."""
+        for bug in bugzilla.get_bugs():
+            for comment in bug["comments"]:
+                if comment["id"] in classes:
+                    yield (bug, comment), classes[comment["id"]]
diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py
@@ -183,6 +183,67 @@ def classify_issue(
     return "OK"
 
 
+def classify_comment(
+    model_name: str, comment_ids: Sequence[int], bugzilla_token: str
+) -> str:
+    """Classify Bugzilla comments and store each result under its job key.
+
+    Returns "NOK" if no comment was retrieved or the model is not cached.
+    """
+    from bugbug_http.app import JobInfo
+
+    # This should be called in a process worker so it should be safe to set
+    # the token here
+    comment_ids_set = set(map(int, comment_ids))
+    bugzilla.set_token(bugzilla_token)
+
+    # get_comment() returns a mapping of comments: keep the comment dict itself
+    # and leave out empty responses so they are reported as missing below.
+    comments = {}
+    for comment_id in comment_ids_set:
+        fetched = list(bugzilla.get_comment(comment_id).values())
+        if fetched:
+            comments[comment_id] = fetched[0]
+
+    for comment_id in comment_ids_set.difference(comments):
+        job = JobInfo(classify_comment, model_name, comment_id)
+        # TODO: Find a better error format
+        setkey(job.result_key, orjson.dumps({"available": False}))
+
+    if not comments:
+        return "NOK"
+
+    model = MODEL_CACHE.get(model_name)
+    if not model:
+        LOGGER.info("Missing model %r, aborting", model_name)
+        return "NOK"
+
+    model_extra_data = model.get_extra_data()
+
+    # TODO: Classify could choke on a single comment which could make the
+    # whole job fail. What should we do here?
+    probs = model.classify(list(comments.values()), True)
+    indexes = probs.argmax(axis=-1)
+    suggestions = model.le.inverse_transform(indexes)
+    probs_list = probs.tolist()
+    indexes_list = indexes.tolist()
+    suggestions_list = suggestions.tolist()
+
+    for i, comment_id in enumerate(comments):
+        data = {
+            "prob": probs_list[i],
+            "index": indexes_list[i],
+            "class": suggestions_list[i],
+            "extra_data": model_extra_data,
+        }
+        job = JobInfo(classify_comment, model_name, comment_id)
+        setkey(job.result_key, orjson.dumps(data), compress=True)
+        # TODO: Save the comment last change
+        # We shall need to update one of the comment keys to show an updated comment
+
+    return "OK"
+
+
 def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str:
     from bugbug_http.app import JobInfo
 

diff --git a/scripts/comment_classifier.py b/scripts/comment_classifier.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import os
+from logging import INFO, basicConfig, getLogger
+
+import numpy as np
+import requests
+
+from bugbug import bugzilla, db
+from bugbug.models import get_model_class
+from bugbug.utils import download_model
+
+basicConfig(level=INFO)  # emit INFO-level messages (e.g. the model download notice)
+logger = getLogger(__name__)
+
+
+def classify_comments(model_name: str, comment_id: int) -> None:
+    """Interactively print the model's prediction for each selected comment.
+
+    A falsy ``comment_id`` streams every comment in the bugs database.
+    """
+    model_file_name = f"{model_name}model"
+    if not os.path.exists(model_file_name):
+        logger.info("%s does not exist. Downloading the model....", model_file_name)
+        try:
+            download_model(model_name)
+        except requests.HTTPError:
+            logger.error(
+                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
+            )
+            raise SystemExit(1)
+
+    model = get_model_class(model_name).load(model_file_name)
+    # Explicit checks: `assert` is stripped under -O, skipping the download too.
+    if comment_id:
+        comments = list(bugzilla.get_comment(comment_id).values())
+        if not comments:
+            raise RuntimeError(
+                f"A comment with a comment id of {comment_id} was not found"
+            )
+    else:
+        if not db.download(bugzilla.BUGS_DB):
+            raise RuntimeError("Failed to download the bugs database")
+        comments = (
+            {**comment, "bug_id": bug["id"]}
+            for bug in bugzilla.get_bugs()
+            for comment in bug["comments"]
+        )
+
+    for comment in comments:
+        print(
+            f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#c{comment["count"]}'
+        )
+        if model.calculate_importance:
+            probas, importance = model.classify(
+                comment, probabilities=True, importances=True
+            )
+            model.print_feature_importances(
+                importance["importances"], class_probabilities=probas
+            )
+        else:
+            probas = model.classify(comment, probabilities=True, importances=False)
+        probability = probas[0]
+        pred_index = np.argmax(probability)
+        pred_class = "Positive" if pred_index == 1 else "Negative"
+        if len(probability) > 2:
+            pred_class = model.le.inverse_transform([pred_index])[0]
+        print(f"{pred_class} {probability}")
+        input()
+
+
+def main() -> None:
+    """Parse the command-line arguments and classify the requested comments."""
+    parser = argparse.ArgumentParser(
+        description="Perform evaluation on comments using the specified model"
+    )
+    parser.add_argument("model", help="Which model to use for evaluation")
+    parser.add_argument("--comment-id", help="Classify the given comment id", type=int)
+
+    options = parser.parse_args()
+    classify_comments(options.model, options.comment_id)
+
+
+if __name__ == "__main__":  # allow running this file directly as a script
+    main()
diff --git a/setup.py b/setup.py
@@ -52,6 +52,7 @@ def read_requirements(file_):
             "bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main",
             "bugbug-microannotate-generate = scripts.microannotate_generator:main",
             "bugbug-classify-commit = scripts.commit_classifier:main",
+            "bugbug-classify-comment = scripts.comment_classifier:main",
             "bugbug-classify-bug = scripts.bug_classifier:main",
             "bugbug-regressor-finder = scripts.regressor_finder:main",
             "bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main",