Skip to content

Commit 0f2e364

Browse files
[Feature:Plagiarism] Add reasonable limits to file sizes (#54)
* add error message * Add limits to potentially expensive operations * linter * linter (v2.0) Co-authored-by: sbelsk <[email protected]>
1 parent 36bd51f commit 0f2e364

File tree

5 files changed

+97
-54
lines changed

5 files changed

+97
-54
lines changed

bin/concatenate_all.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
IGNORED_FILES = [
1717
".submit.timestamp"
1818
]
19-
MAX_CONCAT_SIZE = 1e9
19+
20+
with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
21+
LICHEN_CONFIG = json.load(lichen_config_file)
2022

2123

2224
# returns a string containing the contents of the files which match the regex in the specified dir
@@ -45,8 +47,9 @@ def getConcatFilesInDir(input_dir, regex_patterns):
4547

4648

4749
def checkTotalSize(total_concat):
48-
if total_concat > MAX_CONCAT_SIZE:
49-
raise SystemExit(f"ERROR! exceeded {humanize.naturalsize(MAX_CONCAT_SIZE)}"
50+
if total_concat > LICHEN_CONFIG['concat_max_total_bytes']:
51+
raise SystemExit("ERROR! exceeded "
52+
f"{humanize.naturalsize(LICHEN_CONFIG['concat_max_total_bytes'])}"
5053
" of concatenated files allowed")
5154

5255

bin/hash_all.py

+29-21
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import json
1111
import time
1212
import hashlib
13+
from pathlib import Path
1314

1415

1516
def parse_args():
@@ -18,9 +19,9 @@ def parse_args():
1819
return parser.parse_args()
1920

2021

21-
def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
22-
language = lichen_config_data["language"]
23-
sequence_length = int(lichen_config_data["sequence_length"])
22+
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
23+
language = lichen_run_config["language"]
24+
sequence_length = int(lichen_run_config["sequence_length"])
2425

2526
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
2627
with open(data_json_path) as token_data_file:
@@ -39,69 +40,76 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
3940
token_values[x:x+sequence_length]).encode())
4041
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
4142

43+
if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
44+
token_hashed_values = token_hashed_values[slice(0, lichen_config["max_sequences_per_file"])] # noqa E501
45+
print(f"File {my_hashes_file} truncated after exceeding max sequence limit")
46+
4247
my_hf.write('\n'.join(token_hashed_values))
4348

4449

4550
def main():
4651
start_time = time.time()
4752
args = parse_args()
4853

49-
with open(os.path.join(args.basepath, "config.json")) as lichen_config:
50-
lichen_config_data = json.load(lichen_config)
54+
with open(Path(args.basepath, "config.json")) as lichen_run_config_file:
55+
lichen_run_config = json.load(lichen_run_config_file)
56+
57+
with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
58+
lichen_config = json.load(lichen_config_file)
5159

5260
print("HASH ALL...", end="")
5361

5462
# ==========================================================================
5563
# walk the subdirectories of this gradeable
56-
users_dir = os.path.join(args.basepath, "users")
64+
users_dir = Path(args.basepath, "users")
5765
if not os.path.isdir(users_dir):
5866
raise SystemExit("ERROR! Unable to find users directory")
5967

6068
for user in sorted(os.listdir(users_dir)):
61-
user_dir = os.path.join(users_dir, user)
69+
user_dir = Path(users_dir, user)
6270
if not os.path.isdir(user_dir):
6371
continue
6472

6573
for version in sorted(os.listdir(user_dir)):
66-
my_dir = os.path.join(user_dir, version)
74+
my_dir = Path(user_dir, version)
6775
if not os.path.isdir(my_dir):
6876
continue
6977

70-
my_tokenized_file = os.path.join(my_dir, "tokens.json")
71-
my_hashes_file = os.path.join(my_dir, "hashes.txt")
72-
hasher(lichen_config_data, my_tokenized_file, my_hashes_file)
78+
my_tokenized_file = Path(my_dir, "tokens.json")
79+
my_hashes_file = Path(my_dir, "hashes.txt")
80+
hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file)
7381

7482
# ==========================================================================
7583
# walk the subdirectories of the other gradeables
7684

77-
other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
85+
other_gradeables_dir = Path(args.basepath, "other_gradeables")
7886
if not os.path.isdir(other_gradeables_dir):
7987
raise SystemExit("ERROR! Unable to find other gradeables directory")
8088

8189
for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
82-
other_gradeable_dir = os.path.join(other_gradeables_dir, other_gradeable)
90+
other_gradeable_dir = Path(other_gradeables_dir, other_gradeable)
8391
if not os.path.isdir(other_gradeable_dir):
8492
continue
8593

8694
for other_user in sorted(os.listdir(other_gradeable_dir)):
87-
other_user_dir = os.path.join(other_gradeable_dir, other_user)
95+
other_user_dir = Path(other_gradeable_dir, other_user)
8896
if not os.path.isdir(other_user_dir):
8997
continue
9098

9199
for other_version in sorted(os.listdir(other_user_dir)):
92-
other_version_dir = os.path.join(other_user_dir, other_version)
100+
other_version_dir = Path(other_user_dir, other_version)
93101
if not os.path.isdir(other_version_dir):
94102
continue
95103

96-
other_tokenized_file = os.path.join(other_version_dir, "tokens.json")
97-
other_hashes_file = os.path.join(other_version_dir, "hashes.txt")
98-
hasher(lichen_config_data, other_tokenized_file, other_hashes_file)
104+
other_tokenized_file = Path(other_version_dir, "tokens.json")
105+
other_hashes_file = Path(other_version_dir, "hashes.txt")
106+
hasher(lichen_config, lichen_run_config, other_tokenized_file, other_hashes_file)
99107

100108
# ==========================================================================
101109
# hash the provided code
102-
provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json")
103-
provided_code_hashed = os.path.join(args.basepath, "provided_code", "hashes.txt")
104-
hasher(lichen_config_data, provided_code_tokenized, provided_code_hashed)
110+
provided_code_tokenized = Path(args.basepath, "provided_code", "tokens.json")
111+
provided_code_hashed = Path(args.basepath, "provided_code", "hashes.txt")
112+
hasher(lichen_config, lichen_run_config, provided_code_tokenized, provided_code_hashed)
105113

106114
# ==========================================================================
107115
end_time = time.time()

bin/lichen_config.json

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"concat_max_total_bytes": 1000000000,
3+
"max_sequences_per_file": 10000,
4+
"max_matching_positions": 30
5+
}

bin/process_all.sh

+40-26
Original file line numberDiff line numberDiff line change
@@ -7,57 +7,71 @@
77

88
# TODO: Assert permissions, as necessary
99

10-
basepath=$1 # holds the path to a directory containing a config for this gradeable
10+
BASEPATH=$1 # holds the path to a directory containing a config for this gradeable
1111
# (probably .../lichen/gradeable/<unique number>/ on Submitty)
1212

13-
datapath=$2 # holds the path to a directory conatining courses and their data
13+
DATAPATH=$2 # holds the path to a directory containing courses and their data
1414
# (probably /var/local/submitty/courses on Submitty)
1515

16+
KILL_ERROR_MESSAGE="
17+
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
18+
* An error occurred while running Lichen. Your run was probably killed for    *
19+
* exceeding the configured resource limits. Before rerunning, perhaps try any *
20+
* of the following edits to the configuration: *
21+
* - Increasing the sequence length *
22+
* - Using only active version *
23+
* - Decreasing the common code threshold *
24+
* - Selecting fewer files to be compared *
25+
* - Comparing against fewer other gradeables *
26+
* - Uploading provided code files *
27+
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
28+
";
29+
1630
# kill the script if there is no config file
17-
if [ ! -f "${basepath}/config.json" ]; then
31+
if [ ! -f "${BASEPATH}/config.json" ]; then
1832
echo "Unable to find config.json in provided directory"
1933
exit 1
2034
fi
2135

2236

2337
# delete any previous run results
2438
# TODO: determine if any caching should occur
25-
rm -rf "${basepath}/logs"
26-
rm -rf "${basepath}/other_gradeables"
27-
rm -rf "${basepath}/users"
28-
rm -f "${basepath}/overall_ranking.txt"
29-
rm -f "${basepath}/provided_code/submission.concatenated"
30-
rm -f "${basepath}/provided_code/tokens.json"
31-
rm -f "${basepath}/provided_code/hashes.txt"
39+
rm -rf "${BASEPATH}/logs"
40+
rm -rf "${BASEPATH}/other_gradeables"
41+
rm -rf "${BASEPATH}/users"
42+
rm -f "${BASEPATH}/overall_ranking.txt"
43+
rm -f "${BASEPATH}/provided_code/submission.concatenated"
44+
rm -f "${BASEPATH}/provided_code/tokens.json"
45+
rm -f "${BASEPATH}/provided_code/hashes.txt"
3246

3347
# create these directories if they don't already exist
34-
mkdir -p "${basepath}/logs"
35-
mkdir -p "${basepath}/provided_code"
36-
mkdir -p "${basepath}/provided_code/files"
37-
mkdir -p "${basepath}/other_gradeables"
38-
mkdir -p "${basepath}/users"
48+
mkdir -p "${BASEPATH}/logs"
49+
mkdir -p "${BASEPATH}/provided_code"
50+
mkdir -p "${BASEPATH}/provided_code/files"
51+
mkdir -p "${BASEPATH}/other_gradeables"
52+
mkdir -p "${BASEPATH}/users"
3953

4054
# Run Lichen and exit if an error occurs
4155
{
4256
############################################################################
4357
# Finish setting up Lichen run
4458

4559
# The default is r-x and we need PHP to be able to write if edits are made to the provided code
46-
chmod g=rwxs "${basepath}/provided_code/files" || exit 1
60+
chmod g=rwxs "${BASEPATH}/provided_code/files" || exit 1
4761

4862
cd "$(dirname "${0}")" || exit 1
4963

5064
############################################################################
5165
# Do some preprocessing
5266
echo "Beginning Lichen run: $(date +"%Y-%m-%d %H:%M:%S")"
53-
./concatenate_all.py "$basepath" "$datapath" || exit 1
67+
./concatenate_all.py "$BASEPATH" "$DATAPATH" || exit 1
5468

5569
############################################################################
5670
# Move the file somewhere to be processed (eventually this will be a worker machine)
5771

5872
# Tar+zip the file structure and save it to /tmp
59-
cd $basepath || exit 1
60-
archive_name=$(sha1sum "${basepath}/config.json" | awk '{ print $1 }') || exit 1
73+
cd $BASEPATH || exit 1
74+
archive_name=$(sha1sum "${BASEPATH}/config.json" | awk '{ print $1 }') || exit 1
6175
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "config.json" "other_gradeables" "users" "provided_code" || exit 1
6276
cd "$(dirname "${0}")" || exit 1
6377

@@ -71,21 +85,21 @@ mkdir -p "${basepath}/users"
7185

7286
############################################################################
7387
# Run Lichen
74-
./tokenize_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
75-
./hash_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
76-
./compare_hashes.out "$tmp_location" || { rm -rf $tmp_location; exit 1; }
88+
./tokenize_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
89+
./hash_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
90+
./compare_hashes.out "$tmp_location" || { rm -rf "$tmp_location"; echo "${KILL_ERROR_MESSAGE}"; exit 1; }
7791

7892
############################################################################
7993
# Zip the results back up and send them back to the course's lichen directory
8094
cd $tmp_location || exit 1
8195
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "."
82-
rm -rf $tmp_location || exit 1
96+
rm -rf "$tmp_location" || exit 1
8397

8498
# TODO: Move the archive back from worker machine
8599

86100
# Extract archive and restore Lichen file structure
87-
cd $basepath || exit 1
88-
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$basepath"
101+
cd "$BASEPATH" || exit 1
102+
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$BASEPATH"
89103
rm "/tmp/LICHEN_JOB_${archive_name}.tar.gz" || exit 1
90104

91-
} >> "${basepath}/logs/lichen_job_output.txt" 2>&1
105+
} >> "${BASEPATH}/logs/lichen_job_output.txt" 2>&1

compare_hashes/compare_hashes.cpp

+17-4
Original file line numberDiff line numberDiff line change
@@ -101,16 +101,22 @@ int main(int argc, char* argv[]) {
101101
time_t overall_start, overall_end;
102102
time(&overall_start);
103103

104+
// ===========================================================================
105+
// load Lichen config data
106+
std::ifstream lichen_config_istr("./lichen_config.json");
107+
assert(lichen_config_istr.good());
108+
nlohmann::json lichen_config = nlohmann::json::parse(lichen_config_istr);
109+
104110
// ===========================================================================
105111
// load config info
106112

107-
assert (argc == 2);
113+
assert(argc == 2);
108114
std::string lichen_gradeable_path_str = argv[1];
109115
boost::filesystem::path lichen_gradeable_path = boost::filesystem::system_complete(lichen_gradeable_path_str);
110116
boost::filesystem::path config_file_json_path = lichen_gradeable_path / "config.json";
111117

112118
std::ifstream istr(config_file_json_path.string());
113-
assert (istr.good());
119+
assert(istr.good());
114120
nlohmann::json config_file_json = nlohmann::json::parse(istr);
115121

116122
std::string semester = config_file_json.value("semester", "ERROR");
@@ -320,7 +326,7 @@ int main(int argc, char* argv[]) {
320326
continue;
321327
}
322328

323-
// Save this submissions highest percent match for later when we geenrate overall_rankings.txt
329+
// Save this submission's highest percent match for later when we generate overall_rankings.txt
324330
float percentMatch = (*submission_itr)->getPercentage();
325331

326332
std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
@@ -375,12 +381,19 @@ int main(int argc, char* argv[]) {
375381
// keep iterating and editing the same object until a we get to a different submission
376382
if (matching_positions_itr->student != other["username"]
377383
|| matching_positions_itr->version != other["version"]
378-
|| matching_positions_itr->source_gradeable != other["source_gradeable"]) {
384+
|| matching_positions_itr->source_gradeable != other["source_gradeable"]
385+
|| matchingpositions.size() >= lichen_config["max_matching_positions"]) {
379386

380387
// found a different one, we push the old one and start over
381388
other["matchingpositions"] = matchingpositions;
382389
others.push_back(other);
383390

391+
if (matchingpositions.size() >= lichen_config["max_matching_positions"]) {
392+
std::cout << "Matching positions array truncated for user: [" << other["username"] << "] version: " << other["version"] << std::endl;
393+
std::cout << " - Try increasing the sequence length to fix this problem." << std::endl;
394+
break;
395+
}
396+
384397
matchingpositions.clear();
385398
other["username"] = matching_positions_itr->student;
386399
other["version"] = matching_positions_itr->version;

0 commit comments

Comments
 (0)