[Feature:Plagiarism] Improve overall and individual rankings (#67)

sbelsk · williamjallen · web-flow · commit f1b2990eb5a3 · 2021-11-08T15:49:44.000-05:00
* renaming sequence length to hash size and prior term to other gradeables

* suggested edits

* linting

* 99% implemented (syntax error????)

* Fix compilation issues

* fix ranking files

* Make requested changes

* Fix tests

Co-authored-by: williamjallen <william.j.allen24@gmail.com>
diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
@@ -17,6 +17,7 @@
 #include "lichen_config.h"
 #include "submission.h"
 #include "hash_location.h"
+#include "score.h"
 
 
 // =============================================================================
@@ -34,11 +35,11 @@ typedef unsigned int version_number;
 
 // represents an element in a ranking of students by percent match
 struct StudentRanking {
-  StudentRanking(const user_id &s, int v, const std::string &sg, float p) : student(s), version(v), source_gradeable(sg), percent(p) {}
+  StudentRanking(const user_id &id, int v, const std::string &sg, const Score &s) : student(id), version(v), source_gradeable(sg), score(s) {}
   user_id student;
   version_number version;
   std::string source_gradeable;
-  float percent;
+  Score score;
 };
 
 
@@ -54,7 +55,8 @@ bool matchingPositionsAreAdjacent(const nlohmann::json &first, const nlohmann::j
 
   nlohmann::json::const_iterator itr1 = first["matchingpositions"].begin();
   nlohmann::json::const_iterator itr2 = second["matchingpositions"].begin();
-  // iterate over each matching submission (first and second are the same length so we don't have to check for the end of second)
+  // iterate over each matching submission (first and second are the same length
+  // so we don't have to check for the end of second)
   for (; itr1 != first["matchingpositions"].end(); itr1++, itr2++) {
     if ((*itr1)["end"].get<int>() + 1 != (*itr2)["end"].get<int>()) {
       return false;
@@ -88,8 +90,8 @@ void incrementEndPositionsForMatches(nlohmann::json &others) {
 
 
 bool ranking_sorter(const StudentRanking &a, const StudentRanking &b) {
-  return a.percent > b.percent ||
-        (a.percent == b.percent && a.student < b.student);
+  return a.score > b.score ||
+        (a.score == b.score && a.student < b.student);
 }
 
 
@@ -156,7 +158,9 @@ int main(int argc, char* argv[]) {
   // stores all hashes from other gradeables
   std::unordered_map<hash, std::unordered_map<user_id, std::vector<HashLocation>>> other_gradeables;
   // stores the highest match for every student, used later for generating overall_rankings.txt
-  std::unordered_map<std::string, std::pair<int, float>> highest_matches;
+  std::unordered_map<user_id, std::pair<int, Score>> highest_matches;
+  // keeps track of max matching hashes across all submissions, used for calculation of ranking score
+  unsigned int max_hashes_matched = 0;
 
   time_t start, end;
   time(&start);
@@ -331,15 +335,19 @@ int main(int argc, char* argv[]) {
 
     // Save this submission's highest match score for later when we generate overall_rankings.txt
     float percentMatch = (*submission_itr)->getPercentage();
+    unsigned int totalMatchingHashes = (*submission_itr)->getMatchCount();
+    Score submission_score(totalMatchingHashes, percentMatch);
+    if (max_hashes_matched < totalMatchingHashes) {
+      max_hashes_matched = totalMatchingHashes;
+    }
 
-    std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
+    std::unordered_map<user_id, std::pair<int, Score> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
+    std::pair<int, Score> new_pair = {(*submission_itr)->version(), submission_score};
     if (highest_matches_itr == highest_matches.end()) {
-      highest_matches[(*submission_itr)->student()].first = (*submission_itr)->version();
-      highest_matches[(*submission_itr)->student()].second = percentMatch;
+      highest_matches.insert({(*submission_itr)->student(), new_pair});
     }
-    else if (percentMatch > highest_matches_itr->second.second) {
-      highest_matches_itr->second.first = (*submission_itr)->version();
-      highest_matches_itr->second.second = percentMatch;
+    else if (submission_score > highest_matches_itr->second.second) {
+      highest_matches_itr->second = new_pair;
     }
 
     // =========================================================================
@@ -547,8 +555,10 @@ int main(int argc, char* argv[]) {
           // the percent match is currently calculated using the number of hashes that match between this
           // submission and the other submission, over the total number of hashes this submission has.
           // In other words, the percentage is how much of this submission's code was plagiarised from the other.
-          float percent = (100.0 * version_itr->second.size()) / unique_hashes.size();
-          student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, percent));
+          unsigned int num_hashes_matched = version_itr->second.size();
+          float percent = (100.0 * num_hashes_matched) / unique_hashes.size();
+          student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, gradeables_itr->first, Score(num_hashes_matched, percent)));
+          student_ranking.back().score.calculateScore(num_hashes_matched);
         }
       }
     }
@@ -564,10 +574,10 @@ int main(int argc, char* argv[]) {
     // finally, write the file of ranking for this submission
     for (unsigned int i = 0; i < student_ranking.size(); i++) {
       ranking_student_ostr
-        << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].percent << "%   "
-        << std::setw(15) << std::left << student_ranking[i].student << " "
-        << std::setw(3) << std::left << student_ranking[i].version << " "
-        << std::setw(1) << std::right << student_ranking[i].source_gradeable << std::endl;
+        << std::setw(15) << std::left << student_ranking[i].student << "  "
+        << std::setw(3) << std::left << student_ranking[i].version << "  "
+        << std::setw(1) << std::right << student_ranking[i].source_gradeable << "  "
+        << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].score.getPercent() << "%" << std::endl;
     }
 
     // =========================================================================
@@ -599,18 +609,19 @@ int main(int argc, char* argv[]) {
   // take the map of highest matches and convert it to a vector so we can sort it
   // by percent match and then save it to a file
   std::vector<StudentRanking> ranking;
-  for (std::unordered_map<std::string, std::pair<int, float> >::iterator itr
+  for (std::unordered_map<user_id, std::pair<int, Score> >::iterator itr
         = highest_matches.begin(); itr != highest_matches.end(); ++itr) {
     ranking.push_back(StudentRanking(itr->first, itr->second.first, "", itr->second.second));
+    ranking[ranking.size()-1].score.calculateScore(max_hashes_matched);
   }
 
   std::sort(ranking.begin(), ranking.end(), ranking_sorter);
-
   for (unsigned int i = 0; i < ranking.size(); i++) {
     ranking_ostr
-      << std::setw(6) << std::setprecision(2) << std::fixed << ranking[i].percent << "%   "
-      << std::setw(15) << std::left << ranking[i].student << " "
-      << std::setw(3) << std::right << ranking[i].version << std::endl;
+      << std::left << std::setw(20) << ranking[i].student << "  "
+      << std::setw(3) << ranking[i].version << "  "
+      << std::right << std::setw(4) << std::setprecision(1) << std::fixed << ranking[i].score.getPercent() << "%   "
+      << std::setw(5) << ranking[i].score.getHashesMatched() << std::endl;
   }
 
   // ===========================================================================
diff --git a/compare_hashes/score.h b/compare_hashes/score.h
@@ -0,0 +1,63 @@
+#ifndef SCORE_H
+#define SCORE_H
+
+#include <cassert>
+#include <cmath>
+#include <string>
+
+typedef int location_in_submission;
+typedef unsigned int hash;
+typedef std::string user_id;
+typedef unsigned int version_number;
+
+// Represents the plagiarism score of a single submission, used to order the ranking files.
+// A Score stores its raw inputs (number of matching hashes and percent match) plus a
+// composite score that stays unset (-1) until calculateScore() is called. The comparison
+// operators read the composite score, so calculateScore() must have been called on both
+// operands before they are compared (getScore() asserts this).
+class Score {
+public:
+  // CONSTRUCTORS
+  // hashes_matched: number of matching hashes counted for the submission
+  // percent: percent match on a 0-100 scale (calculateScore() divides it by 100)
+  Score(unsigned int hashes_matched, float percent): hashes_matched(hashes_matched), percent(percent), score(-1) {}
+  // All members are plain values, so the compiler-generated member-wise copy is sufficient
+  Score(const Score &other) = default;
+
+  // GETTERS
+  float getPercent() const { return percent; }
+  unsigned int getHashesMatched() const { return hashes_matched; }
+
+  // MODIFIER
+  // Each submission in the ranking file gets a composite score that weighs both its percentage
+  // of suspicious matches, and its number of matched hashes relative to max_hashes_matched
+  // (the largest number of matched hashes among the submissions being ranked).
+  void calculateScore(unsigned int max_hashes_matched) {
+    // When nothing matched anywhere, max_hashes_matched is 0 and the ratio would be 0/0 (NaN),
+    // which would trip the range assertion in getScore(); count the ratio as 0 instead.
+    const float match_ratio = (max_hashes_matched == 0)
+                              ? 0.0f
+                              : static_cast<float>(hashes_matched)/max_hashes_matched;
+    score = PERCENT_WEIGHT*(percent/100.0) + MATCH_WEIGHT*match_ratio;
+  }
+
+  // OPERATORS
+  // Scores closer together than EPSILON are considered equal, so operator> only holds
+  // when this score exceeds the other one by more than EPSILON.
+  bool operator>(const Score &other_s) const {
+    return getScore() - other_s.getScore() > EPSILON;
+  }
+  // Uses the same tolerance as operator> so that, for any two scores, exactly one of
+  // a > b, b > a, or a == b holds.
+  bool operator==(const Score &other_s) const {
+    return std::abs(getScore() - other_s.getScore()) <= EPSILON;
+  }
+  Score& operator=(const Score& other) = default;
+
+
+private:
+  static constexpr float PERCENT_WEIGHT = 0.5;
+  static constexpr float MATCH_WEIGHT = 0.5;
+  // just a sanity check to make sure these values are appropriately updated in the future
+  static_assert(PERCENT_WEIGHT + MATCH_WEIGHT == 1, "Weights must add to 1");
+  // tolerance shared by the comparison operators
+  static constexpr float EPSILON = 0.0001;
+
+  unsigned int hashes_matched;
+  float percent;
+  // composite score in [0, 1]; -1 until calculateScore() has been called
+  float score;
+
+  // Returns the composite score; asserts that calculateScore() produced a value in [0, 1]
+  float getScore() const { assert(score >= 0 && score <= 1); return score; }
+};
+
+#endif
diff --git a/compare_hashes/submission.h b/compare_hashes/submission.h
@@ -32,6 +32,7 @@ class Submission {
   const std::set<location_in_submission>& getProvidedMatches() const { return provided_matches; }
   const std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>& getStudentsMatched() const { return students_matched; }
   const std::vector<std::pair<hash, location_in_submission>> & getHashes() const { return hashes; }
+  unsigned int getMatchCount() const { return suspicious_matches.size(); }
   float getPercentage() const;
 
   // MODIFIERS
diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/overall_ranking.txt
@@ -1,2 +1,2 @@
- 25.71%   bitdiddle         1
- 19.48%   aphacker          1
+bitdiddle             1    25.7%      27
+aphacker              1    19.5%      15
diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/aphacker/1/ranking.txt
@@ -1 +1 @@
- 18.42%   bitdiddle       1   f21__plagiarism__repeated_sequences
+bitdiddle        1    f21__plagiarism__repeated_sequences   18.42%
diff --git a/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt b/tests/data/test_lichen/repeated_sequences/expected_output/users/bitdiddle/1/ranking.txt
@@ -1 +1 @@
- 15.22%   aphacker        1   f21__plagiarism__repeated_sequences
+aphacker         1    f21__plagiarism__repeated_sequences   15.22%

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`- 18.42% bitdiddle 1 f21__plagiarism__repeated_sequences`
	`1`	`+bitdiddle 1 f21__plagiarism__repeated_sequences 18.42%`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`- 15.22% aphacker 1 f21__plagiarism__repeated_sequences`
	`1`	`+aphacker 1 f21__plagiarism__repeated_sequences 15.22%`