Commit 6f39cd7

[Refactor:Plagiarism] Add dedicated ranking step (#79)
* Only print warning once
* Add number of times it was truncated
* Add progress bars for most of pipeline
* Add compare_hashes progress bar
* Finish the Python portion
* Remove unnecessary code from compare_hashes
* Lint
* Fix off-by-1: there was a very minor off-by-1 in the original codebase which necessitated the updates to the tests
1 parent 47a4e2d commit 6f39cd7

File tree

13 files changed: +198 −212 lines


bin/process_all.sh

+4-3
@@ -86,9 +86,10 @@ mkdir -p "${BASEPATH}/users"
 ############################################################################
 # Run Lichen
 { # We still want to unzip files if an error occurs when running Lichen here
-    ./tokenize_all.py "$tmp_location" &&
-    ./hash_all.py "$tmp_location" &&
-    ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}";
+    ./tokenize_all.py "$tmp_location" &&
+    ./hash_all.py "$tmp_location" &&
+    ./compare_hashes.out "$tmp_location" || echo "${KILL_ERROR_MESSAGE}" &&
+    ./similarity_ranking.py "$tmp_location";
 }

 ############################################################################
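The change above appends the new ranking step so the per-gradeable pipeline is now tokenize, hash, compare, rank. For illustration only, the same ordering is sketched below in Python; the run_lichen_pipeline wrapper and the subprocess calls are assumptions made for this sketch (the actual driver is bin/process_all.sh, whose && / || error handling is simplified here to stopping at the first failing stage).

# Illustrative sketch of the stage ordering added by this commit
# (tokenize -> hash -> compare -> rank). The wrapper name and subprocess
# calls are assumptions for this sketch; the real driver is bin/process_all.sh.
import subprocess
import sys


def run_lichen_pipeline(tmp_location):
    stages = [
        ["./tokenize_all.py", tmp_location],
        ["./hash_all.py", tmp_location],
        ["./compare_hashes.out", tmp_location],
        ["./similarity_ranking.py", tmp_location],  # new dedicated ranking step
    ]
    for stage in stages:
        if subprocess.run(stage).returncode != 0:
            # Placeholder for the script's "${KILL_ERROR_MESSAGE}" handling
            print("Lichen stage failed:", stage[0], file=sys.stderr)
            return False
    return True


if __name__ == "__main__":
    run_lichen_pipeline(sys.argv[1])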

bin/similarity_ranking.py

+180
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+"""
+Ranks the submissions in order of plagiarism likelihood
+"""
+
+import argparse
+import os
+import json
+import humanize
+import datetime
+from pathlib import Path
+
+
+# This is a helper class which is used to store, and ultimately sort, data about submissions
+class Submission:
+    def __init__(self, user_id, version):
+        self.user_id = user_id
+        self.version = version
+
+        # The percent of this submission which matches other submissions
+        self.percent_match = 0
+
+        # The absolute number of hashes matched
+        self.total_hashes_matched = 0
+
+        # The highest number of matches between this user and any other single submission
+        self.highest_match_count = 0
+
+    # We use this for sorting submissions later on. Future adjustments to the
+    # ranking algorithm should modify this function.
+    def __lt__(self, other):
+        return self.highest_match_count < other.highest_match_count
+
+
+class Match:
+    def __init__(self, user_id, version, source_gradeable):
+        self.user_id = user_id
+        self.version = version
+        self.source_gradeable = source_gradeable
+
+        # The number of hashes this match shares with a Submission
+        self.matching_hash_count = 0
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('basepath')
+    return parser.parse_args()
+
+
+# get_submission_stats is passed a user, version, a path to a matches.json, a
+# path to a hashes.txt file, and the hash size and returns a pair of a Submission()
+# object containing a number of statistics about the specified submission, and a
+# list of Match objects which match this submission
+def get_submission_stats(user_id, version, matches_file, hashes_file, hash_size):
+    submission = Submission(user_id, version)
+
+    # Determine how many hashes there are in this submission
+    with open(hashes_file, 'r') as file:
+        token_count = len([0 for _ in file]) + hash_size
+
+    # If this is a blank/empty submission, return now
+    if token_count <= 1:
+        return submission, []
+
+    # It is possible that there are no matches and thus a matches.json file isn't
+    # created. If this is the case, we can simply return now.
+    if not os.path.isfile(matches_file):
+        return submission, []
+
+    with open(matches_file, 'r') as file:
+        matches_json = json.load(file)
+
+    # Calculate the total number of hashes matched, as well as the number of
+    # hashes matched for every other submission with matches
+    matching_submissions = dict()
+    prev_end = 0
+    for match in matches_json:
+        # Common and provided code doesn't have an others list (due to size constraints)
+        if match['type'] != 'match':
+            continue
+
+        for other in match['others']:
+            other_submission = f"{other['username']}_{other['version']}_{other['source_gradeable']}"  # noqa: E501
+            if other_submission not in matching_submissions.keys():
+                matching_submissions[other_submission] = Match(other['username'],
+                                                               other['version'],
+                                                               other['source_gradeable'])
+            matching_submissions[other_submission].matching_hash_count += \
+                match['end'] - max(prev_end, match['start'] - 1)
+        submission.total_hashes_matched += match['end'] - max(prev_end, match['start'] - 1)
+        prev_end = match['end']
+
+    # Actually stored as the fraction of the submission which matches
+    submission.percent_match = submission.total_hashes_matched / token_count
+
+    if len(matching_submissions.values()) > 0:
+        matching_submissions = list(matching_submissions.values())
+
+        matching_submissions.sort(key=lambda x: x.matching_hash_count, reverse=True)
+        submission.highest_match_count = matching_submissions[0].matching_hash_count
+    else:
+        matching_submissions = []
+
+    return submission, matching_submissions
+
+
+def main():
+    start_time = datetime.datetime.now()
+    args = parse_args()
+
+    print("SIMILARITY RANKING:", flush=True)
+    print("[0% 25% 50% 75% 100%]\n[", end="", flush=True)  # noqa: E501
+
+    with open(Path(args.basepath, "config.json")) as lichen_config_file:
+        lichen_config = json.load(lichen_config_file)
+
+    users_dir = Path(args.basepath, 'users')
+    if not os.path.isdir(users_dir):
+        raise SystemExit('ERROR! Unable to find users directory')
+
+    # We'll make a rough estimate of the percentage of ranking output done by
+    # taking the percentage of users which have been done thus far
+    total_users = len(os.listdir(users_dir))
+    users_ranking_output = 0
+    percent_progress = 0
+
+    all_submissions = list()
+
+    for user in sorted(os.listdir(users_dir)):
+        user_dir = Path(users_dir, user)
+        if not os.path.isdir(user_dir):
+            continue
+
+        for version in sorted(os.listdir(user_dir)):
+            version_dir = Path(user_dir, version)
+            if not os.path.isdir(version_dir):
+                continue
+
+            matches_file = Path(version_dir, 'matches.json')
+            hashes_file = Path(version_dir, 'hashes.txt')
+
+            submission, matching_submissions = get_submission_stats(user,
+                                                                    version,
+                                                                    matches_file,
+                                                                    hashes_file,
+                                                                    lichen_config['hash_size'])
+            all_submissions.append(submission)
+
+            # Write the ranking.txt for this submission
+            with open(Path(version_dir, 'ranking.txt'), 'w') as ranking_file:
+                # matching_submissions is already sorted by the absolute number of hashes matched
+                for match in matching_submissions:
+                    ranking_file.write(f"{match.user_id:10} {match.version:3} "
+                                       f"{match.source_gradeable} {match.matching_hash_count:>8}\n")
+
+        users_ranking_output += 1
+        if int((users_ranking_output / total_users) * 100) > percent_progress:
+            new_percent_progress = int((users_ranking_output / total_users) * 100)
+            print("|" * (new_percent_progress - percent_progress), end="", flush=True)
+            percent_progress = new_percent_progress
+
+    all_submissions.sort(reverse=True)
+
+    # A set of all the users we've written lines for thus far (duplicates aren't allowed)
+    users_written = set('foo')
+    with open(Path(args.basepath, 'overall_ranking.txt'), 'w') as ranking_file:
+        for s in all_submissions:
+            if s.user_id in users_written:
+                continue
+            ranking_file.write(f"{s.user_id:10} {s.version:3} "
+                               f"{s.percent_match:4.0%} {s.total_hashes_matched:>8}\n")
+            users_written.add(s.user_id)
+
+    # ==========================================================================
+    print("]\nSimilarity ranking done in", humanize.precisedelta(start_time, format="%1.f"))
+
+
+if __name__ == "__main__":
+    main()
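To make the counting in get_submission_stats concrete: match regions are treated as inclusive [start, end] positions, and match['end'] - max(prev_end, match['start'] - 1) adds only the positions that an earlier, possibly overlapping region has not already counted. This is the kind of boundary arithmetic where the off-by-1 mentioned in the commit message is easy to introduce. The worked example below uses a made-up matches_json fragment, assumes the regions appear in position order, and is not taken from a real matches.json.

# Hypothetical matches.json fragment: two overlapping match regions in one submission.
# Positions are inclusive [start, end], mirroring the accumulation in get_submission_stats.
matches_json = [
    {"type": "match", "start": 3, "end": 7, "others": []},
    {"type": "match", "start": 6, "end": 10, "others": []},
]

total_hashes_matched = 0
prev_end = 0
for match in matches_json:
    if match["type"] != "match":
        continue
    # First region:  7 - max(0, 2)  = 5  (positions 3..7)
    # Second region: 10 - max(7, 5) = 3  (positions 8..10; the overlap 6..7 is not double-counted)
    total_hashes_matched += match["end"] - max(prev_end, match["start"] - 1)
    prev_end = match["end"]

print(total_hashes_matched)  # 8, i.e. positions 3..10 counted exactly once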

compare_hashes/compare_hashes.cpp

+4-129
@@ -17,7 +17,6 @@
 #include "lichen_config.h"
 #include "submission.h"
 #include "hash_location.h"
-#include "score.h"


 // =============================================================================
@@ -29,20 +28,6 @@ typedef std::string user_id;
 typedef unsigned int version_number;


-// =============================================================================
-// helper classes
-
-
-// represents an element in a ranking of students by percent match
-struct StudentRanking {
-  StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {}
-  user_id student;
-  version_number version;
-  std::string source_gradeable;
-  Score score;
-};
-
-
 // =============================================================================
 // helper functions

@@ -89,12 +74,6 @@ void incrementEndPositionsForMatches(nlohmann::json &others) {
 }


-bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
-  return a.score > b.score ||
-         (a.score == b.score && a.student < b.student);
-}
-
-
 // =============================================================================
 // MAIN

@@ -157,10 +136,6 @@ int main(int argc, char* argv[]) {
   std::unordered_set<hash> provided_code;
   // stores all hashes from other gradeables
   std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
-  // stores the matches for every student, used later for generating overall_rankings.txt
-  std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>> highest_matches;
-  // keeps track of max matching hashes across all submissions, used for calculation of ranking score
-  unsigned int max_hashes_matched = 0;
   // a map of "user_id:version" strings to the non-zero number of times their matching positions array was truncated
   std::unordered_map<std::string, int> matching_positions_truncations;

@@ -323,7 +298,7 @@ int main(int argc, char* argv[]) {

       // Note: we DO look for matches across submissions of the same student for self-plagiarism

-      // save the locations of all other occurences from proir term submissions
+      // save the locations of all other occurences from prior term submissions
       std::vector<HashLocation>::iterator itr = other_occurences_itr->second.begin();
       for (; itr != other_occurences_itr->second.end(); ++itr) {
        (*submission_itr)->addSuspiciousMatch(hash_itr->second, *itr, hash_itr->first);
@@ -515,80 +490,14 @@ int main(int argc, char* argv[]) {
    assert(ostr.good());
    ostr << match_data.dump(4) << std::endl;

-    // =========================================================================
-    // create individual ranking file
-    // the file contains all the other students share matches, sorted by decreasing order of the percent match
-
-    // find and sort the other submissions it matches with
-    std::vector<StudentRanking> student_ranking;
-    std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>> matches = (*submission_itr)->getStudentsMatched();
-
-    std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>::const_iterator gradeables_itr = matches.begin();
-    for (; gradeables_itr != matches.end(); ++gradeables_itr) {
-      for (std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>::const_iterator matches_itr = gradeables_itr->second.begin();
-           matches_itr != gradeables_itr->second.end(); ++matches_itr) {
-
-        for (std::unordered_map<version_number, std::unordered_set<hash>>::const_iterator version_itr = matches_itr->second.begin();
-             version_itr != matches_itr->second.end(); ++version_itr) {
-
-          // Calculate the Percent Match:
-          // count the number of unique hashes for the percent match calculation
-          std::vector<std::pair<hash, location_in_submission>> submission_hashes = (*submission_itr)->getHashes();
-          std::unordered_set<hash> unique_hashes;
-          for (std::vector<std::pair<hash, location_in_submission>>::const_iterator itr = submission_hashes.begin();
-               itr != submission_hashes.end(); ++itr) {
-            unique_hashes.insert(itr->first);
-          }
-
-          // the percent match is currently calculated using the number of hashes that match between this
-          // submission and the other submission, over the total number of hashes this submission has.
-          // In other words, the percentage is how much of this submission's code was plgairised from the other.
-          unsigned int num_hashes_matched = version_itr->second.size();
-          float percent = (100.0 * num_hashes_matched) / unique_hashes.size();
-          student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent)));
-          student_ranking.back().score.calculateScore(num_hashes_matched);
-        }
-      }
-    }
-
-    // =========================================================================
-    // Save this submission's highest percent match for later when we generate overall_rankings.txt
-    float percentMatch = (*submission_itr)->getPercentage();
-    unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
-    Score submission_score(totalMatchingHashes, percentMatch);
-    if (max_hashes_matched < totalMatchingHashes) {
-      max_hashes_matched = totalMatchingHashes;
-    }
-
-    std::pair<version_number, Score> new_pair = {(*submission_itr)->version(), submission_score};
-    highest_matches[(*submission_itr)->student()].push_back(new_pair);
-    // =========================================================================
-
-    std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);
-
-    // create the directory and a file to write into
-    boost::filesystem::path ranking_student_dir = users_root_directory / (*submission_itr)->student() / std::to_string((*submission_itr)->version());
-    boost::filesystem::path ranking_student_file = ranking_student_dir / "ranking.txt";
-    boost::filesystem::create_directories(ranking_student_dir);
-    std::ofstream ranking_student_ostr(ranking_student_file.string());
-
-    // finally, write the file of ranking for this submission
-    for (unsigned int i = 0; i < student_ranking.size(); i++) {
-      ranking_student_ostr
-        << std::setw(15) << std::left << student_ranking[i].student << " "
-        << std::setw(3) << std::left << student_ranking[i].version << " "
-        << std::setw(1) << std::right << student_ranking[i].source_gradeable << " "
-        << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl;
-    }
-
    // =========================================================================
    // Cleanup

-    // Done with this submissions. discard the data and clear the memory
+    // Done with this submission. discard the data and clear the memory
    delete (*submission_itr);
    (*submission_itr) = nullptr;

-    // print current progress
+    // Print current progress
    my_counter++;
    if (int((my_counter / float(all_submissions.size())) * 100) > my_percent) {
      int new_my_percent = int((my_counter / float(all_submissions.size())) * 100);
@@ -607,7 +516,7 @@ int main(int argc, char* argv[]) {

  time(&end);
  diff = difftime(end, start);
-  std::cout << "]" << std::endl << "Finished processing submissions in " << diff << " seconds" << std::endl;
+  std::cout << "]" << std::endl;

  // Print out the list of users who had their matching positions array truncated
  if (matching_positions_truncations.size() > 0) {
@@ -618,40 +527,6 @@ int main(int argc, char* argv[]) {
    }
    std::cout << std::endl << " - Try increasing the hash size or adding a regex to fix this problem." << std::endl;
  }
-  fflush(stdout);
-
-
-  // ===========================================================================
-  // Create a general summary of rankings of users by percentage match
-
-  // create a single file of students ranked by highest percentage of code plagiarised
-  boost::filesystem::path ranking_file = lichen_gradeable_path / "overall_ranking.txt";
-  std::ofstream ranking_ostr(ranking_file.string());
-
-  // take the map of highest matches and convert it to a vector so we can sort it
-  // by percent match and then save it to a file
-  std::vector<StudentRanking> ranking;
-  for (std::unordered_map<user_id, std::vector<std::pair<version_number, Score>>>::iterator itr
-         = highest_matches.begin(); itr != highest_matches.end(); ++itr) {
-
-    std::pair<version_number, Score> best_score = itr->second.front();
-    best_score.second.calculateScore(max_hashes_matched);
-    for (unsigned int i=0; i < itr->second.size(); i++) {
-      itr->second[i].second.calculateScore(max_hashes_matched);
-      if (itr->second[i].second > best_score.second) {
-        best_score = itr->second[i];
-      }
-    }
-    ranking.push_back(StudentRanking(itr->first, best_score.first, "", best_score.second));
-  }
-
-  std::sort(ranking.begin(), ranking.end(), ranking_sorter);
-  for (unsigned int i = 0; i < ranking.size(); i++) {
-    ranking_ostr
-      << std::left << std::setw(20) << ranking[i].student << " "
-      << std::setw(3) << ranking[i].version << " "
-      << std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% "
-      << std::setw(5) << ranking[i].score.getHashesMatched() << std::endl;
-  }

  // ===========================================================================
  // Done!