
Dataset creation for backout commits #4159

Draft: benjaminmah wants to merge 38 commits into master.
Changes shown are from 9 of the 38 commits.

Commits
eabf9bb
Created base script to construct dataset for backout commits
benjaminmah May 2, 2024
aaf8386
Created new directory to store dataset, added comments to script
benjaminmah May 2, 2024
c096468
Cleaned up code, restructured dataset to include the inducing, backou…
benjaminmah May 3, 2024
24046fd
Sample dataset (count_limit = 500)
benjaminmah May 3, 2024
3eb6605
Removed old datasets
benjaminmah May 3, 2024
2db5029
Skip 'fixing commits' that are actually backout commits
benjaminmah May 3, 2024
3516c09
Sample dataset (num_count = 500)
benjaminmah May 3, 2024
0544b27
Deleted dataset
benjaminmah May 6, 2024
49570ac
Added cache for processed dictionaries, removed unused fields, simpli…
benjaminmah May 6, 2024
fc37940
Split up function `filter_commits` to handle saving to directory and …
benjaminmah May 6, 2024
10314dd
Replaced list with generator, stylized code to match standard coding …
benjaminmah May 6, 2024
943eb40
Removed commented out code
benjaminmah May 6, 2024
8ed0784
Added new file to log commits that do not have a fix commit, used `bu…
benjaminmah May 7, 2024
39ab450
Added metric collection for number of fixes found, number of no fixes…
benjaminmah May 8, 2024
fe8114b
Added condition to only append to dataset if the number of non backed…
benjaminmah May 8, 2024
74939f2
Added the diff between the original commit and the fixing commit in t…
benjaminmah May 10, 2024
be10d51
Removed separating by `added_lines` and `removed_lines`, storing raw …
benjaminmah May 10, 2024
3a406ef
Added threshold for number of changes and separated diffs by file.
benjaminmah May 13, 2024
bc23a22
Added support for hglib grafting from `repository.py`
benjaminmah May 14, 2024
6058305
Added grafting support to apply original commit to parent commit of t…
benjaminmah May 14, 2024
e666c2e
Cleaned up code
benjaminmah May 15, 2024
40bbe1b
Removed storing bugs without fixes, limited bugs to be within the las…
benjaminmah May 15, 2024
a4c5bff
Reverted to storing the raw diff as a utf-8 encoded string.
benjaminmah May 15, 2024
f133041
Removed unnecessary fields when populating dataset, extract correct d…
benjaminmah May 21, 2024
d202b0b
Fixed type hinting
benjaminmah May 22, 2024
79152a3
Added `hg merge-tool` for automatically resolving conflicts when graf…
benjaminmah May 22, 2024
4740196
Fixed docstring for function `graft`
benjaminmah May 22, 2024
38d6cf8
Added check to omit any diff containing conflicts
benjaminmah May 23, 2024
9fc018c
Made code more Pythonic
benjaminmah May 27, 2024
846210f
Changed standard collections to generic types
benjaminmah Jun 3, 2024
ae28dcf
Implemented logging error when shelving changes
benjaminmah Jun 3, 2024
c6f6a8f
Implemented logging error when grafting
benjaminmah Jun 3, 2024
37c51b6
Renamed `bug_dict` and `bug_info` to `bug_resolution_map` and `bug_re…
benjaminmah Jun 3, 2024
fad6df6
Removed `commit_dict`
benjaminmah Jun 3, 2024
fb7a17d
Changed `logger.info` to `logger.warning` when error encountered whil…
benjaminmah Jun 4, 2024
bfc77e4
Reverted importing standard collections
benjaminmah Jun 4, 2024
66108ad
Added raise-from when shelving
benjaminmah Jun 4, 2024
0d83fa7
Removed try-except when grafting
benjaminmah Jun 4, 2024
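The grafting-related commits above (bc23a22, 6058305, 79152a3) add hglib grafting support and an `hg merge-tool` for automatically resolving conflicts. As a rough, hedged sketch only: this is how such a graft might be driven through python-hglib's `rawcommand`, since hglib has no dedicated graft wrapper. The helper name `graft_with_merge_tool` and the choice of Mercurial's internal `:merge` tool are assumptions for illustration, not the code added in this PR.

import hglib


def graft_with_merge_tool(repo_path: str, rev: str) -> bool:
    # Hypothetical helper: graft revision `rev` onto the current working
    # directory parent, letting Mercurial's internal ":merge" tool handle
    # trivial conflicts. Returns False if the graft command fails.
    with hglib.open(repo_path) as client:
        try:
            # python-hglib has no graft wrapper, so the arguments are
            # forwarded to the `hg` CLI via rawcommand.
            client.rawcommand([b"graft", b"--tool", b":merge", rev.encode("ascii")])
            return True
        except hglib.error.CommandError:
            # a non-zero exit typically means unresolved conflicts or an
            # already-grafted revision
            return False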
198 changes: 198 additions & 0 deletions scripts/backout_data_collection.py
@@ -0,0 +1,198 @@
import json
import logging
import os
import sys

from tqdm import tqdm

sys.path.append("../bugbug")
from bugbug import bugzilla, db, repository

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def download_databases() -> None:
logger.info("Downloading bugs database...")
assert db.download(bugzilla.BUGS_DB)

logger.info("Downloading commits database...")
assert db.download(repository.COMMITS_DB, support_files_too=True)


def save_dict_to_file(data_dict: dict, file_path: str) -> None:
    with open(file_path, "w") as file:
        json.dump(data_dict, file, indent=4)


def load_dict_from_file(file_path: str) -> dict:
    with open(file_path, "r") as file:
        return json.load(file)


def preprocess_commits_and_bugs() -> tuple[dict, dict, dict]:
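    # Returns (commit_dict, bug_to_commit_dict, bug_dict): commit metadata keyed by
    # node, commits grouped by bug id, and bug resolutions keyed by bug id. The three
    # dicts are cached as JSON under dataset/ so subsequent runs can skip this pass.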
cache_path_commit = "dataset/cache_commit_dict.json"
cache_path_bug_to_commit = "dataset/cache_bug_to_commit_dict.json"
cache_path_bug = "dataset/cache_bug_dict.json"

if (
os.path.exists(cache_path_commit)
and os.path.exists(cache_path_bug_to_commit)
and os.path.exists(cache_path_bug)
):
logger.info("Loading cached data...")
commit_dict = load_dict_from_file(cache_path_commit)
bug_to_commit_dict = load_dict_from_file(cache_path_bug_to_commit)
bug_dict = load_dict_from_file(cache_path_bug)

else:
logger.info("Preprocessing commits and bugs...")
commit_dict = {}
bug_to_commit_dict = {}

# store commits with their hashes and bug IDs as keys
for commit in tqdm(
repository.get_commits(
include_no_bug=True, include_backouts=True, include_ignored=True
),
desc="Processing commits",
):
commit_dict[commit["node"]] = {
"node": commit["node"],
"bug_id": commit["bug_id"],
"desc": commit["desc"],
"pushdate": commit["pushdate"],
"backedoutby": commit["backedoutby"],
"backsout": commit["backsout"],
}

            # key by the bug id as a string so lookups behave the same whether the
            # dicts are freshly built or loaded back from the JSON cache
            bug_to_commit_dict.setdefault(str(commit["bug_id"]), []).append(
                commit_dict[commit["node"]]
            )

logger.info("Preprocessing bugs")
bug_dict = {}

        with open(bugzilla.BUGS_DB, "r") as f:
            num_lines = sum(1 for _ in f)

        # store bugs with their bug IDs (as strings, matching the JSON cache) as keys
        with open(bugzilla.BUGS_DB, "r") as f:
            for line in tqdm(f, total=num_lines, desc="Processing bugs"):
                bug = json.loads(line)
                bug_dict[str(bug["id"])] = bug["resolution"]

save_dict_to_file(commit_dict, cache_path_commit)
save_dict_to_file(bug_to_commit_dict, cache_path_bug_to_commit)
save_dict_to_file(bug_dict, cache_path_bug)

return commit_dict, bug_to_commit_dict, bug_dict


def filter_commits(
directory_path: str,
destination_filepath: str,
count_limit: int,
bug_dict: dict,
commit_dict: dict,
bug_to_commit_dict: dict,
) -> None:
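    # Scan commits for backed-out landings on FIXED bugs, pair each with its
    # backout commit and the follow-up fixing commit, and write the resulting
    # entries to `destination_filepath` as JSON; scanning stops after
    # `count_limit` backed-out candidates have been examined.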
filtered_list = []
counter = 0

pbar = tqdm(total=count_limit, desc="Filtering commits")

for commit in repository.get_commits(
include_no_bug=False, include_backouts=True, include_ignored=False
):
# add commit if it was backed out and the bug is fixed
if (
bug_dict.get(str(commit["bug_id"]))
and commit["backedoutby"]
and bug_dict[str(commit["bug_id"])] == "FIXED"
):
fixing_commit = find_next_commit(
commit["bug_id"], bug_to_commit_dict, commit["node"]
)

            # skip if the "fixing" commit found is actually the backout commit
            # itself, or if it backs out other commits
if (
fixing_commit["node"] == commit["backedoutby"]
or len(fixing_commit["backsout"]) > 0
):
counter += 1
continue

# add the hashes of the bug-inducing commit, the back out commit, and the fixing commit
# include metadata such as push date and description for further context
list_entry = {
"bug_id": commit["bug_id"],
"inducing_commit": {
"node": commit["node"],
"pushdate": commit["pushdate"],
"desc": commit["desc"],
},
"backout_commit": {
"node": commit["backedoutby"],
"pushdate": commit_dict[commit["backedoutby"]]["pushdate"],
"desc": commit_dict[commit["backedoutby"]]["desc"],
},
"fixing_commit": {
"node": fixing_commit["node"],
"pushdate": fixing_commit["pushdate"],
"desc": fixing_commit["desc"],
},
}

filtered_list.append(list_entry)
counter += 1
pbar.update(1)

if counter >= count_limit:
break

    pbar.close()

    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
        logger.info(f"Directory {directory_path} created")

    with open(destination_filepath, "w") as file:
        json.dump(filtered_list, file, indent=4)

    logger.info(f"Data successfully saved to {destination_filepath}")


def find_next_commit(bug_id: int, bug_to_commit_dict: dict, inducing_node: str) -> dict:
inducing_commit_found = False
for commit in bug_to_commit_dict[str(bug_id)]:
# if the inducing commit has been found, find next commit that has not been backed out
if inducing_commit_found:
if len(commit["backedoutby"]) == 0:
return commit

if commit["node"] == inducing_node:
inducing_commit_found = True

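    # fall back to the last commit recorded for this bug when no later,
    # non-backed-out commit follows the inducing one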
return commit


def main():
download_databases()

commit_dict, bug_to_commit_dict, bug_dict = preprocess_commits_and_bugs()
Review comment (Member):
We may want to consider the space complexity when iterating over the whole dataset.

Reply (benjaminmah, Contributor Author):
Removed unused keys when constructing the dictionaries, and implemented a cache that reuses the dictionaries generated by previous runs by saving them as JSON files. Let me know if this needs additional changes/fixes!


filter_commits(
directory_path="dataset",
destination_filepath="dataset/backout_dataset.json",
count_limit=500,
bug_dict=bug_dict,
commit_dict=commit_dict,
bug_to_commit_dict=bug_to_commit_dict,
)


if __name__ == "__main__":
main()
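One hedged follow-up to the review note above about space complexity: instead of materializing a full metadata entry for every commit, the two lookup tables the filtering step actually reads can be built in a single streaming pass. The sketch below assumes `bugzilla.get_bugs()` is available as a generator over the bugs database; the function name and structure are illustrative, not the PR's implementation.

from collections import defaultdict

from bugbug import bugzilla, repository


def build_minimal_maps() -> tuple[dict, dict]:
    # bug id (as a string) -> resolution
    bug_resolution_map = {
        str(bug["id"]): bug.get("resolution", "") for bug in bugzilla.get_bugs()
    }

    # bug id (as a string) -> ordered list of minimal commit records,
    # keeping only the fields the filtering step reads later
    bug_to_commits = defaultdict(list)
    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        bug_to_commits[str(commit["bug_id"])].append(
            {
                "node": commit["node"],
                "pushdate": commit["pushdate"],
                "desc": commit["desc"],
                "backedoutby": commit["backedoutby"],
                "backsout": commit["backsout"],
            }
        )

    return bug_resolution_map, dict(bug_to_commits)

This keeps memory roughly proportional to the trimmed per-commit records, which is in line with the "removed unused keys" change described in the reply above.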