Skip to content

Commit 38125ff

Browse files
[Refactor:Plagiarism] Discard overlap with common code (#63)
* initial draft * Fix errors Co-authored-by: sbelsk <[email protected]>
1 parent f29dd01 commit 38125ff

File tree

6 files changed

+105
-33
lines changed

6 files changed

+105
-33
lines changed

compare_hashes/compare_hashes.cpp

+20-17
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "boost/filesystem/path.hpp"
1515
#include "nlohmann/json.hpp"
1616

17+
#include "lichen_config.h"
1718
#include "submission.h"
1819
#include "hash_location.h"
1920

@@ -101,11 +102,13 @@ int main(int argc, char* argv[]) {
101102
time_t overall_start, overall_end;
102103
time(&overall_start);
103104

105+
104106
// ===========================================================================
105107
// load Lichen config data
106108
std::ifstream lichen_config_istr("./lichen_config.json");
107109
assert(lichen_config_istr.good());
108110
nlohmann::json lichen_config = nlohmann::json::parse(lichen_config_istr);
111+
LichenConfig config;
109112

110113
// ===========================================================================
111114
// load config info
@@ -119,11 +122,11 @@ int main(int argc, char* argv[]) {
119122
assert(istr.good());
120123
nlohmann::json config_file_json = nlohmann::json::parse(istr);
121124

122-
std::string semester = config_file_json.value("semester", "ERROR");
123-
std::string course = config_file_json.value("course", "ERROR");
124-
std::string gradeable = config_file_json.value("gradeable", "ERROR");
125-
int sequence_length = config_file_json.value("sequence_length", 1);
126-
int threshold = config_file_json.value("threshold", 5);
125+
config.semester = config_file_json.value("semester", "ERROR");
126+
config.course = config_file_json.value("course", "ERROR");
127+
config.gradeable = config_file_json.value("gradeable", "ERROR");
128+
config.sequence_length = config_file_json.value("sequence_length", 1);
129+
config.threshold = config_file_json.value("threshold", 5);
127130

128131
// error checking, confirm there are hashes to work with
129132
boost::filesystem::path users_root_directory = lichen_gradeable_path / "users";
@@ -136,7 +139,7 @@ int main(int argc, char* argv[]) {
136139
// the file path where we expect to find the hashed instructor provided code file
137140
boost::filesystem::path provided_code_file = lichen_gradeable_path / "provided_code" / "hashes.txt";
138141
// if file exists in that location, the provided code mode is enabled.
139-
bool provided_code_enabled = boost::filesystem::exists(provided_code_file);
142+
config.provided_code_enabled = boost::filesystem::exists(provided_code_file);
140143
// path to prior gradeables' data
141144
boost::filesystem::path prior_terms_dir = lichen_gradeable_path / "other_gradeables";
142145

@@ -158,7 +161,7 @@ int main(int argc, char* argv[]) {
158161
time_t start, end;
159162
time(&start);
160163

161-
if (provided_code_enabled) {
164+
if (config.provided_code_enabled) {
162165
// load the instructor provided code's hashes
163166
std::ifstream istr(provided_code_file.string());
164167
assert(istr.good());
@@ -221,7 +224,7 @@ int main(int argc, char* argv[]) {
221224
assert (version > 0);
222225

223226
// create a submission object and load to the main submissions structure
224-
Submission* curr_submission = new Submission(username, version);
227+
Submission* curr_submission = new Submission(username, version, config);
225228

226229
// load the hashes from this submission
227230
boost::filesystem::path hash_file = version_path;
@@ -233,7 +236,7 @@ int main(int argc, char* argv[]) {
233236
while (istr >> input_hash_str) {
234237
hash input_hash = (unsigned int)(stoul(input_hash_str, 0, 16));
235238
location++;
236-
all_hashes[input_hash][username].push_back(HashLocation(username, version, location, semester+"__"+course+"__"+gradeable));
239+
all_hashes[input_hash][username].push_back(HashLocation(username, version, location, config.semester + "__" + config.course + "__" + config.gradeable));
237240
curr_submission->addHash(input_hash, location);
238241
}
239242

@@ -267,7 +270,7 @@ int main(int argc, char* argv[]) {
267270

268271
// if provided code was enabled, look for the submission hash in the provided code's hashes
269272
bool provided_match_found = false;
270-
if (provided_code_enabled) {
273+
if (config.provided_code_enabled) {
271274
std::unordered_set<hash>::iterator provided_match_itr = provided_code.find(hash_itr->first);
272275
if (provided_match_itr != provided_code.end()) {
273276
provided_match_found = true;
@@ -292,7 +295,7 @@ int main(int argc, char* argv[]) {
292295
std::vector<HashLocation>::iterator itr = occurences_itr->second.begin();
293296
for (; itr != occurences_itr->second.end(); ++itr) {
294297

295-
if (occurences.size() > (unsigned int)threshold) {
298+
if (occurences.size() > (unsigned int) config.threshold) {
296299
// if the number of students with matching code is more
297300
// than the threshold, it is considered common code
298301
(*submission_itr)->addCommonMatch(hash_itr->second);
@@ -368,7 +371,7 @@ int main(int argc, char* argv[]) {
368371
std::vector<nlohmann::json> matchingpositions;
369372
nlohmann::json position;
370373
position["start"] = matching_positions_itr->location;
371-
position["end"] = matching_positions_itr->location + sequence_length - 1;
374+
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
372375
matchingpositions.push_back(position);
373376

374377
// search for all matching positions of the suspicious match in other submissions
@@ -400,7 +403,7 @@ int main(int argc, char* argv[]) {
400403
other["source_gradeable"] = matching_positions_itr->source_gradeable;
401404
}
402405
position["start"] = matching_positions_itr->location;
403-
position["end"] = matching_positions_itr->location + sequence_length - 1;
406+
position["end"] = matching_positions_itr->location + config.sequence_length - 1;
404407
matchingpositions.push_back(position);
405408
}
406409
}
@@ -411,7 +414,7 @@ int main(int argc, char* argv[]) {
411414

412415
nlohmann::json info;
413416
info["start"] = location_itr->first;
414-
info["end"] = location_itr->first + sequence_length - 1;
417+
info["end"] = location_itr->first + config.sequence_length - 1;
415418
info["type"] = "match";
416419
info["others"] = others;
417420

@@ -428,7 +431,7 @@ int main(int argc, char* argv[]) {
428431

429432
nlohmann::json info;
430433
info["start"] = *location_itr;
431-
info["end"] = *location_itr + sequence_length - 1;
434+
info["end"] = *location_itr + config.sequence_length - 1;
432435
info["type"] = "common";
433436

434437
result.push_back(info);
@@ -444,7 +447,7 @@ int main(int argc, char* argv[]) {
444447

445448
nlohmann::json info;
446449
info["start"] = *location_itr;
447-
info["end"] = *location_itr + sequence_length - 1;
450+
info["end"] = *location_itr + config.sequence_length - 1;
448451
info["type"] = "provided";
449452

450453
result.push_back(info);
@@ -456,7 +459,7 @@ int main(int argc, char* argv[]) {
456459
// Done creating the JSON file/objects, now we merge them to shrink them in size
457460

458461
// Merge matching regions:
459-
if (result.size() > 0) { // check to make sure that there are more than 1 positions (if it's 1, we can't merge anyway)
462+
if (!result.empty()) { // check to make sure that there are more than 1 positions (if it's 1, we can't merge anyway)
460463
// loop through all positions
461464
for (unsigned int position = 1; position < result.size(); position++) {
462465
nlohmann::json* prevPosition = &result[position - 1];

compare_hashes/hash_location.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ struct HashLocation {
1919
std::string source_gradeable;
2020
};
2121

22-
bool operator < (const HashLocation &hl1, const HashLocation &hl2) {
22+
// inline keyword is necessary to prevent linker errors when multiple .cpp files include this header and are then linked
23+
inline bool operator < (const HashLocation &hl1, const HashLocation &hl2) {
2324
return hl1.student > hl2.student ||
2425
(hl1.student == hl2.student && hl1.version < hl2.version) ||
2526
(hl1.student == hl2.student && hl1.version == hl2.version && hl1.location < hl2.location);

compare_hashes/lichen_config.h

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#ifndef LICHEN_CONFIG_H
2+
#define LICHEN_CONFIG_H
3+
4+
// std::string is used by the members below; including it here keeps this
// header usable on its own instead of relying on the includer's include order.
#include <string>

// Settings for one plagiarism-detection run, shared between main() and each
// Submission. Every member has a default so a default-constructed
// LichenConfig never holds indeterminate values; the numeric defaults match
// the fallbacks main() uses when a key is missing from the config JSON.
struct LichenConfig {
  // identifiers of the gradeable being processed; main() joins them as
  // semester__course__gradeable to label where a hash came from
  std::string semester;
  std::string course;
  std::string gradeable;

  // a hash starting at position p is reported as covering
  // p .. p + sequence_length - 1
  int sequence_length = 1;

  // when more than this many students share a hash, it is treated as common code
  int threshold = 5;

  // set by main() to whether the instructor-provided hashes file exists
  bool provided_code_enabled = false;
};
12+
13+
#endif

compare_hashes/submission.cpp

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#include <map>
2+
#include <set>
3+
#include <vector>
4+
5+
#include "hash_location.h"
6+
#include "submission.h"
7+
8+
// Shorthand type names used by the Submission member definitions below.
// They name the same underlying types as the typedefs of the same names in
// submission.h, so redeclaring them here is harmless.
using location_in_submission = int;    // position of a hash within a submission
using hash = unsigned int;             // a single hash value
using user_id = std::string;           // identifies a student
using version_number = unsigned int;   // submission version
12+
13+
float Submission::getPercentage() const {
14+
return (100.0 * (suspicious_matches.size())) / hashes.size();
15+
}
16+
17+
// Records that the hash found at `location` in this submission also appears
// at `matching_location` (another student's submission/version).
//
// The match is silently dropped when any of the sequence_length - 1 positions
// immediately before `location` is already recorded as common or provided
// code, because the hashed sequence starting there would overlap this one.
// Positions below 0 are never examined.
void Submission::addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) {
  const int window = config_.sequence_length;

  // walk backwards from the position just before `location`
  for (int offset = 1; offset < window; ++offset) {
    const int earlier = location - offset;
    if (earlier < 0) {
      break;  // ran off the start of the submission
    }

    const bool overlaps_common = common_matches.count(earlier) > 0;
    const bool overlaps_provided = provided_matches.count(earlier) > 0;
    if (overlaps_common || overlaps_provided) {
      return;  // overlaps non-suspicious code: discard this match
    }
  }

  // remember where the match was found...
  suspicious_matches[location].insert(matching_location);

  // ...and which hash was shared with that gradeable/student/version
  auto &shared_hashes = students_matched[matching_location.source_gradeable]
                                        [matching_location.student]
                                        [matching_location.version];
  shared_hashes.insert(matched_hash);
}
31+
32+
// Marks `location` as common code (code shared by more students than the
// configured threshold allows).
//
// Any suspicious match already recorded at one of the sequence_length - 1
// positions immediately before `location` is removed, since its hashed
// sequence would overlap this common one. Positions below 0 are never
// examined.
//
// Caveat: only this submission's suspicious_matches entry is removed.
// students_matched is left untouched, and nothing here notifies the other
// submission that was part of the discarded match.
void Submission::addCommonMatch(location_in_submission location) {
  const int window = config_.sequence_length;

  for (int offset = 1; offset < window; ++offset) {
    const int earlier = location - offset;
    if (earlier < 0) {
      break;  // ran off the start of the submission
    }
    // erasing by key does nothing when no match is stored at `earlier`
    suspicious_matches.erase(earlier);
  }

  common_matches.insert(location);
}
47+
48+
// Marks `location` as matching the instructor-provided code.
//
// Any suspicious match already recorded at one of the sequence_length - 1
// positions immediately before `location` is removed, since its hashed
// sequence would overlap this provided one. Positions below 0 are never
// examined.
//
// Caveat: only this submission's suspicious_matches entry is removed.
// students_matched is left untouched, and nothing here notifies the other
// submission that was part of the discarded match.
void Submission::addProvidedMatch(location_in_submission location) {
  const int window = config_.sequence_length;

  for (int offset = 1; offset < window; ++offset) {
    const int earlier = location - offset;
    if (earlier < 0) {
      break;  // ran off the start of the submission
    }
    // erasing by key does nothing when no match is stored at `earlier`
    suspicious_matches.erase(earlier);
  }

  provided_matches.insert(location);
}

compare_hashes/submission.h

+7-14
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <vector>
1010

1111
#include "hash_location.h"
12+
#include "lichen_config.h"
1213

1314
typedef int location_in_submission;
1415
typedef unsigned int hash;
@@ -20,7 +21,7 @@ typedef unsigned int version_number;
2021
class Submission {
2122
public:
2223
// CONSTRUCTOR
23-
Submission(const user_id &s, version_number v) : student_(s), version_(v) {}
24+
Submission(const user_id &s, version_number v, const LichenConfig &c) : student_(s), version_(v), config_(c) {}
2425

2526
// GETTERS
2627
const user_id& student() const { return student_; }
@@ -31,26 +32,18 @@ class Submission {
3132
const std::set<location_in_submission>& getProvidedMatches() const { return provided_matches; }
3233
const std::unordered_map<std::string, std::unordered_map<user_id, std::unordered_map<version_number, std::unordered_set<hash>>>>& getStudentsMatched() const { return students_matched; }
3334
const std::vector<std::pair<hash, location_in_submission>> & getHashes() const { return hashes; }
34-
float getPercentage() const {
35-
return (100.0 * (suspicious_matches.size())) / hashes.size();
36-
}
35+
float getPercentage() const;
3736

3837
// MODIFIERS
3938
void addHash(const hash &h, location_in_submission l) { hashes.push_back(std::make_pair(h, l)); }
40-
41-
void addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash) {
42-
// save the found match
43-
suspicious_matches[location].insert(matching_location);
44-
// update the students_matched container
45-
students_matched[matching_location.source_gradeable][matching_location.student][matching_location.version].insert(matched_hash);
46-
}
47-
48-
void addCommonMatch(location_in_submission location) { common_matches.insert(location); }
49-
void addProvidedMatch(location_in_submission location) { provided_matches.insert(location); }
39+
void addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location, const hash &matched_hash);
40+
void addCommonMatch(location_in_submission location);
41+
void addProvidedMatch(location_in_submission location);
5042

5143
private:
5244
user_id student_;
5345
version_number version_;
46+
LichenConfig config_;
5447
std::vector<std::pair<hash, location_in_submission> > hashes;
5548
std::map<location_in_submission, std::set<HashLocation> > suspicious_matches;
5649
std::set<location_in_submission> common_matches;

install_lichen.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ mkdir -p ${lichen_installation_dir}/tools/assignments
6262
#-------------------------------------------
6363
# compile & install the hash comparison tool
6464
pushd ${lichen_repository_dir} > /dev/null
65-
clang++ -I ${lichen_vendor_dir} -lboost_system -lboost_filesystem -Wall -Wextra -Werror -g -O3 -flto -funroll-loops -std=c++11 compare_hashes/compare_hashes.cpp -o ${lichen_installation_dir}/bin/compare_hashes.out
65+
clang++ -I ${lichen_vendor_dir} -lboost_system -lboost_filesystem -Wall -Wextra -Werror -g -Ofast -flto -funroll-loops -std=c++11 compare_hashes/compare_hashes.cpp compare_hashes/submission.cpp -o ${lichen_installation_dir}/bin/compare_hashes.out
6666
if [ $? -ne 0 ]; then
6767
echo -e "ERROR: FAILED TO BUILD HASH COMPARISON TOOL\n"
6868
exit 1

0 commit comments

Comments
 (0)