Skip to content

Introduce Base Comment Model and functionality to classify comments #4097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions bugbug/bugzilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,17 @@ def get_groups_users(group_names: list[str]) -> list[str]:
]


def get_comment(comment_id) -> dict:
    """Fetch a single comment from the Bugzilla REST API by its id.

    The request is sent through the shared "bugzilla" session with the
    configured API key. An HTTP error status is raised via
    ``raise_for_status``.

    Returns:
        The ``"comments"`` member of the decoded JSON response.
    """
    session = utils.get_session("bugzilla")
    url = f"https://bugzilla.mozilla.org/rest/bug/comment/{comment_id}"
    request_headers = {"X-Bugzilla-API-Key": Bugzilla.TOKEN, "User-Agent": "bugbug"}

    response = session.get(url, headers=request_headers)
    # TODO: Do we raise the error or we just ignore the comment?
    response.raise_for_status()

    payload = response.json()
    return payload["comments"]


def get_revision_ids(bug: BugDict) -> list[int]:
revision_ids = []

Expand Down
15 changes: 15 additions & 0 deletions bugbug/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,3 +803,18 @@ def items_gen(self, classes):
continue

yield issue, classes[issue_number]


class CommentModel(Model):
    """Base model whose training items are individual bug comments."""

    def __init__(self, lemmatization=False):
        Model.__init__(self, lemmatization)
        # Comments are read from the bugs stored in the bugs DB.
        self.training_dbs = [bugzilla.BUGS_DB]

    def items_gen(self, classes):
        """Yield ``((bug, comment), label)`` for every labelled comment.

        ``classes`` maps comment ids to labels; comments whose id is not a
        key of ``classes`` are skipped.
        """
        for bug in bugzilla.get_bugs():
            for comment in bug["comments"]:
                label_key = comment["id"]
                if label_key in classes:
                    yield (bug, comment), classes[label_key]
61 changes: 61 additions & 0 deletions http_service/bugbug_http/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,67 @@ def classify_issue(
return "OK"


def classify_comment(
    model_name: str, comment_ids: Sequence[int], bugzilla_token: str
) -> str:
    """Classify Bugzilla comments with a cached model and store the results.

    Each requested comment is fetched from Bugzilla, classified in a single
    batch, and the outcome is stored under the result key of the matching
    ``JobInfo``. Comments that could not be retrieved get an
    ``{"available": False}`` result instead.

    Args:
        model_name: Name used to look the model up in ``MODEL_CACHE``.
        comment_ids: Ids of the comments to classify.
        bugzilla_token: API token set on the ``bugzilla`` module before any
            comment is fetched.

    Returns:
        ``"OK"`` once every classification has been stored, ``"NOK"`` when no
        comment could be retrieved or the model is not in the cache.
    """
    from bugbug_http.app import JobInfo

    # This should be called in a process worker so it should be safe to set
    # the token here
    comment_ids_set = set(map(int, comment_ids))
    bugzilla.set_token(bugzilla_token)

    # Key the fetched comments by the same int-normalized ids used in
    # comment_ids_set, so the missing-comment check compares like with like.
    comments = {}
    for comment_id in map(int, comment_ids):
        # get_comment returns a mapping of comments; store the comment itself
        # rather than the mapping's values view. An empty mapping leaves the
        # id absent, so it is reported as missing below.
        for comment in bugzilla.get_comment(comment_id).values():
            comments[comment_id] = comment

    missing_comments = comment_ids_set.difference(comments)

    for comment_id in missing_comments:
        job = JobInfo(classify_comment, model_name, comment_id)

        # TODO: Find a better error format
        setkey(job.result_key, orjson.dumps({"available": False}))

    if not comments:
        return "NOK"

    model = MODEL_CACHE.get(model_name)

    if not model:
        LOGGER.info("Missing model %r, aborting", model_name)
        return "NOK"

    model_extra_data = model.get_extra_data()

    # TODO: Classify could choke on a single comment, which would make the
    # whole job fail. What should we do here?
    # NOTE(review): a reviewer asked that the bug be passed here as well,
    # i.e. (bug, comment) pairs rather than bare comments.
    probs = model.classify(list(comments.values()), True)
    indexes = probs.argmax(axis=-1)
    suggestions = model.le.inverse_transform(indexes)

    probs_list = probs.tolist()
    indexes_list = indexes.tolist()
    suggestions_list = suggestions.tolist()

    # Iterating the dict yields ids in the same order as comments.values(),
    # so position i lines up with row i of the classifier output.
    for i, comment_id in enumerate(comments):
        data = {
            "prob": probs_list[i],
            "index": indexes_list[i],
            "class": suggestions_list[i],
            "extra_data": model_extra_data,
        }

        job = JobInfo(classify_comment, model_name, comment_id)
        setkey(job.result_key, orjson.dumps(data), compress=True)

        # TODO: Save the comment last change
        # We shall need to update one of the comment keys to show an updated comment

    return "OK"


def classify_broken_site_report(model_name: str, reports_data: list[dict]) -> str:
from bugbug_http.app import JobInfo

Expand Down
86 changes: 86 additions & 0 deletions scripts/comment_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-

import argparse
import os
from logging import INFO, basicConfig, getLogger

import numpy as np
import requests

from bugbug import bugzilla, db
from bugbug.models import get_model_class
from bugbug.utils import download_model

basicConfig(level=INFO)
logger = getLogger(__name__)


def classify_comments(model_name: str, comment_id: int) -> None:
    """Classify one comment, or every comment of the bugs DB, interactively.

    The model file is downloaded first when it is not present locally. When
    ``comment_id`` is truthy only that comment is fetched from Bugzilla;
    otherwise the bugs DB is downloaded and all comments of all bugs are
    classified. For each comment its Bugzilla URL and the prediction are
    printed, then the function waits for input before moving on.

    Raises:
        SystemExit: if no pre-trained model could be downloaded.
    """
    model_file_name = f"{model_name}model"

    model_is_local = os.path.exists(model_file_name)
    if not model_is_local:
        logger.info("%s does not exist. Downloading the model....", model_file_name)
        try:
            download_model(model_name)
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

    model = get_model_class(model_name).load(model_file_name)

    if comment_id:
        # Get a comment by its id
        fetched = bugzilla.get_comment(comment_id)
        comments = list(fetched.values())
        assert comments, f"A comment with a comment id of {comment_id} was not found"
    else:
        assert db.download(bugzilla.BUGS_DB)
        # Flatten every bug's comments, tagging each with its bug id so the
        # URL below can be built.
        comments = []
        for bug in bugzilla.get_bugs():
            for bug_comment in bug["comments"]:
                comments.append({**bug_comment, "bug_id": bug["id"]})

    for comment in comments:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={comment["bug_id"]}#c{comment["count"]}'
        )

        if model.calculate_importance:
            class_probs, importance = model.classify(
                comment, probabilities=True, importances=True
            )
            model.print_feature_importances(
                importance["importances"], class_probabilities=class_probs
            )
        else:
            class_probs = model.classify(
                comment, probabilities=True, importances=False
            )

        first_row = class_probs[0]
        best_index = np.argmax(first_row)
        if len(first_row) <= 2:
            # Two-class output: report it as Positive/Negative.
            if best_index == 1:
                label = "Positive"
            else:
                label = "Negative"
        else:
            label = model.le.inverse_transform([best_index])[0]

        print(f"{label} {first_row}")
        # Pause until the user presses Enter before showing the next comment.
        input()


def main() -> None:
    """Parse the command line and run the comment classifier."""
    arg_parser = argparse.ArgumentParser(
        description="Perform evaluation on comments using the specified model"
    )
    arg_parser.add_argument("model", help="Which model to use for evaluation")
    arg_parser.add_argument(
        "--comment-id", type=int, help="Classify the given comment id"
    )

    cli_args = arg_parser.parse_args()
    classify_comments(cli_args.model, cli_args.comment_id)


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def read_requirements(file_):
"bugbug-maintenance-effectiveness-indicator = scripts.maintenance_effectiveness_indicator:main",
"bugbug-microannotate-generate = scripts.microannotate_generator:main",
"bugbug-classify-commit = scripts.commit_classifier:main",
"bugbug-classify-comment = scripts.comment_classifier:main",
"bugbug-classify-bug = scripts.bug_classifier:main",
"bugbug-regressor-finder = scripts.regressor_finder:main",
"bugbug-retrieve-training-metrics = scripts.retrieve_training_metrics:main",
Expand Down