Skip to content

Commit f1b2990

Browse files
[Feature:Plagiarism] Improve overall and individual rankings (#67)
* renaming sequence length to hash size and prior term to other gradeables * suggested edits * linting * 99% implemented (syntax error????) * Fix compilation issues * fix ranking files * Make requested changes * Fix tests Co-authored-by: williamjallen <[email protected]>
1 parent 5897a67 commit f1b2990

File tree

6 files changed

+102
-27
lines changed

6 files changed

+102
-27
lines changed

compare_hashes/compare_hashes.cpp

+34-23
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "lichen_config.h"
1818
#include "submission.h"
1919
#include "hash_location.h"
20+
#include "score.h"
2021

2122

2223
// =============================================================================
@@ -34,11 +35,11 @@ typedef unsigned int version_number;
3435

3536
// represents an element in a ranking of students by percent match
3637
struct StudentRanking {
37-
StudentRanking(const user_id &s, int v, const std::string &sg, float p) : student(s), version(v), source_gradeable(sg), percent(p) {}
38+
StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {}
3839
user_id student;
3940
version_number version;
4041
std::string source_gradeable;
41-
float percent;
42+
Score score;
4243
};
4344

4445

@@ -54,7 +55,8 @@ bool matchingPositionsAreAdjacent(const nlohmann::json &first, const nlohmann::j
5455

5556
nlohmann::json::const_iterator itr1 = first["matchingpositions"].begin();
5657
nlohmann::json::const_iterator itr2 = second["matchingpositions"].begin();
57-
// iterate over each matching submission (first and second are the same length so we don't have to check for the end of second)
58+
// iterate over each matching submission (first and second are the same length
59+
// so we don't have to check for the end of second)
5860
for (; itr1 != first["matchingpositions"].end(); itr1++, itr2++) {
5961
if ((*itr1)["end"].get<int>() + 1 != (*itr2)["end"].get<int>()) {
6062
return false;
@@ -88,8 +90,8 @@ void incrementEndPositionsForMatches(nlohmann::json &others) {
8890

8991

9092
bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
91-
return a.percent > b.percent ||
92-
(a.percent == b.percent && a.student < b.student);
93+
return a.score > b.score ||
94+
(a.score == b.score && a.student < b.student);
9395
}
9496

9597

@@ -156,7 +158,9 @@ int main(int argc, char* argv[]) {
156158
// stores all hashes from other gradeables
157159
std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
158160
// stores the highest match for every student, used later for generating overall_rankings.txt
159-
std::unordered_map<std::string, std::pair<int, float>> highest_matches;
161+
std::unordered_map<user_id, std::pair<int, Score>> highest_matches;
162+
// keeps track of max matching hashes across all submissions, used for calculation of ranking score
163+
unsigned int max_hashes_matched = 0;
160164

161165
time_t start, end;
162166
time(&start);
@@ -331,15 +335,19 @@ int main(int argc, char* argv[]) {
331335

332336
// Save this submission's highest percent match for later when we generate overall_rankings.txt
333337
float percentMatch = (*submission_itr)->getPercentage();
338+
unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
339+
Score submission_score(totalMatchingHashes, percentMatch);
340+
if (max_hashes_matched < totalMatchingHashes) {
341+
max_hashes_matched = totalMatchingHashes;
342+
}
334343

335-
std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
344+
std::unordered_map<user_id, std::pair<int, Score> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
345+
std::pair<int, Score> new_pair = {(*submission_itr)->version(), submission_score};
336346
if (highest_matches_itr == highest_matches.end()) {
337-
highest_matches[(*submission_itr)->student()].first = (*submission_itr)->version();
338-
highest_matches[(*submission_itr)->student()].second = percentMatch;
347+
highest_matches.insert({(*submission_itr)->student(), new_pair});
339348
}
340-
else if (percentMatch > highest_matches_itr->second.second) {
341-
highest_matches_itr->second.first = (*submission_itr)->version();
342-
highest_matches_itr->second.second = percentMatch;
349+
else if (submission_score > highest_matches_itr->second.second) {
350+
highest_matches_itr->second = new_pair;
343351
}
344352

345353
// =========================================================================
@@ -547,8 +555,10 @@ int main(int argc, char* argv[]) {
547555
// the percent match is currently calculated using the number of hashes that match between this
548556
// submission and the other submission, over the total number of hashes this submission has.
549557
// In other words, the percentage is how much of this submission's code was plagiarised from the other.
550-
float percent = (100.0 * version_itr->second.size()) / unique_hashes.size();
551-
student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, percent));
558+
unsigned int num_hashes_matched = version_itr->second.size();
559+
float percent = (100.0 * num_hashes_matched) / unique_hashes.size();
560+
student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent)));
561+
student_ranking.back().score.calculateScore(num_hashes_matched);
552562
}
553563
}
554564
}
@@ -564,10 +574,10 @@ int main(int argc, char* argv[]) {
564574
// finally, write the file of ranking for this submission
565575
for (unsigned int i = 0; i < student_ranking.size(); i++) {
566576
ranking_student_ostr
567-
<< std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].percent << "% "
568-
<< std::setw(15) << std::left << student_ranking[i].student << " "
569-
<< std::setw(3) << std::left << student_ranking[i].version << " "
570-
<< std::setw(1) << std::right << student_ranking[i].source_gradeable << std::endl;
577+
<< std::setw(15) << std::left << student_ranking[i].student << " "
578+
<< std::setw(3) << std::left << student_ranking[i].version << " "
579+
<< std::setw(1) << std::right << student_ranking[i].source_gradeable << " "
580+
<< std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl;
571581
}
572582

573583
// =========================================================================
@@ -599,18 +609,19 @@ int main(int argc, char* argv[]) {
599609
// take the map of highest matches and convert it to a vector so we can sort it
600610
// by percent match and then save it to a file
601611
std::vector<StudentRanking> ranking;
602-
for (std::unordered_map<std::string, std::pair<int, float> >::iterator itr
612+
for (std::unordered_map<user_id, std::pair<int, Score> >::iterator itr
603613
= highest_matches.begin(); itr != highest_matches.end(); ++itr) {
604614
ranking.push_back(StudentRanking(itr->first, itr->second.first, "", itr->second.second));
615+
ranking[ranking.size()-1].score.calculateScore(max_hashes_matched);
605616
}
606617

607618
std::sort(ranking.begin(), ranking.end(), ranking_sorter);
608-
609619
for (unsigned int i = 0; i < ranking.size(); i++) {
610620
ranking_ostr
611-
<< std::setw(6) << std::setprecision(2) << std::fixed << ranking[i].percent << "% "
612-
<< std::setw(15) << std::left << ranking[i].student << " "
613-
<< std::setw(3) << std::right << ranking[i].version << std::endl;
621+
<< std::left << std::setw(20) << ranking[i].student << " "
622+
<< std::setw(3) << ranking[i].version << " "
623+
<< std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "% "
624+
<< std::setw(5) << ranking[i].score.getHashesMatched() << std::endl;
614625
}
615626

616627
// ===========================================================================

compare_hashes/score.h

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#ifndef SCORE_H
2+
#define SCORE_H
3+
4+
#include <cassert>
5+
6+
typedef int location_in_submission;
7+
typedef unsigned int hash;
8+
typedef std::string user_id;
9+
typedef unsigned int version_number;
10+
11+
// Represents the plagiarism score for a given submission, used for the ranking files.
//
// A Score stores the raw inputs (number of matching hashes and percent match)
// plus a composite score that is only valid after calculateScore() has been
// called. Comparing Scores before that trips the assertion in getScore().
class Score {
public:
  // CONSTRUCTOR
  // hashes_matched: number of matching hashes for the submission
  // percent:        percent match on a 0-100 scale (it is divided by 100 in calculateScore)
  // The composite score starts at the sentinel -1, meaning "not yet calculated".
  Score(unsigned int hashes_matched, float percent): hashes_matched(hashes_matched), percent(percent), score(-1) {}
  // Copy construction and copy assignment are intentionally left to the
  // compiler: every member is a plain value, so memberwise copy is correct.

  // GETTERS
  float getPercent() const { return percent; }
  unsigned int getHashesMatched() const { return hashes_matched; }

  // MODIFIER
  // Each submission in the ranking file gets a composite score that weighs both its percentage
  // of suspicious matches, and its number of matched hashes relative to max_hashes_matched.
  void calculateScore(unsigned int max_hashes_matched) {
    // If nothing matched anywhere, max_hashes_matched is 0; dividing would give
    // NaN (0/0), which later fails the range assertion in getScore(). Treat the
    // match component as 0 in that case.
    const float match_fraction = (max_hashes_matched == 0)
        ? 0.0f
        : static_cast<float>(hashes_matched)/max_hashes_matched;
    score = PERCENT_WEIGHT*(percent/100.0) + MATCH_WEIGHT*match_fraction;
  }

  // OPERATORS
  // Both comparisons use the same EPSILON tolerance so that, for any two
  // calculated scores, exactly one of a > b, b > a, a == b holds. This lets
  // callers break near-ties on a secondary key.
  // NOTE: tolerance-based equality is not transitive (a == b and b == c do not
  // imply a == c when the values straddle EPSILON).
  bool operator>(const Score &other_s) const {
    // equivalent to: |difference| > EPSILON && this score is the larger one
    return getScore() - other_s.getScore() > EPSILON;
  }
  bool operator==(const Score &other_s) const {
    const float difference = getScore() - other_s.getScore();
    return difference <= EPSILON && -difference <= EPSILON;
  }


private:
  static constexpr float PERCENT_WEIGHT = 0.5;
  static constexpr float MATCH_WEIGHT = 0.5;
  // just a sanity check to make sure these values are appropriately updated in the future
  static_assert(PERCENT_WEIGHT + MATCH_WEIGHT == 1, "Weights must add to 1");
  // scores closer together than this are considered equal
  static constexpr float EPSILON = 0.0001f;

  unsigned int hashes_matched;
  float percent;
  float score;

  // Returns the composite score; asserts that calculateScore() has been called
  // and produced a value in [0, 1].
  float getScore() const { assert(score >= 0 && score <= 1); return score; }
};
62+
63+
#endif

compare_hashes/submission.h

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class Submission {
3232
const std::set<location_in_submission>& getProvidedMatches() const { return provided_matches; }
3333
const std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>& getStudentsMatched() const { return students_matched; }
3434
const std::vector<std::pair<hash, location_in_submission>> & getHashes() const { return hashes; }
35+
unsigned int getMatchCount() const { return suspicious_matches.size(); }
3536
float getPercentage() const;
3637

3738
// MODIFIERS
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
25.71% bitdiddle 1
2-
19.48% aphacker 1
1+
bitdiddle 1 25.7% 27
2+
aphacker 1 19.5% 15
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
18.42% bitdiddle 1 f21__plagiarism__repeated_sequences
1+
bitdiddle 1 f21__plagiarism__repeated_sequences 18.42%
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
15.22% aphacker 1 f21__plagiarism__repeated_sequences
1+
aphacker 1 f21__plagiarism__repeated_sequences 15.22%

0 commit comments

Comments
 (0)