Skip to content

Commit 0f2e364

Browse files
[Feature:Plagiarism] Add reasonable limits to file sizes (#54)
* add error message * Add limits to potentially expensive operations * linter * linter (v2.0) Co-authored-by: sbelsk <[email protected]>
1 parent 36bd51f commit 0f2e364

File tree

5 files changed

+97
-54
lines changed

5 files changed

+97
-54
lines changed

bin/concatenate_all.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
IGNORED_FILES = [
1717
".submit.timestamp"
1818
]
19-
MAX_CONCAT_SIZE = 1e9
19+
20+
with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
21+
LICHEN_CONFIG = json.load(lichen_config_file)
2022

2123

2224
# returns a string containing the contents of the files which match the regex in the specified dir
@@ -45,8 +47,9 @@ def getConcatFilesInDir(input_dir, regex_patterns):
4547

4648

4749
def checkTotalSize(total_concat):
48-
if total_concat > MAX_CONCAT_SIZE:
49-
raise SystemExit(f"ERROR! exceeded {humanize.naturalsize(MAX_CONCAT_SIZE)}"
50+
if total_concat > LICHEN_CONFIG['concat_max_total_bytes']:
51+
raise SystemExit("ERROR! exceeded "
52+
f"{humanize.naturalsize(LICHEN_CONFIG['concat_max_total_bytes'])}"
5053
" of concatenated files allowed")
5154

5255

bin/hash_all.py

+29-21
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import json
1111
import time
1212
import hashlib
13+
from pathlib import Path
1314

1415

1516
def parse_args():
@@ -18,9 +19,9 @@ def parse_args():
1819
return parser.parse_args()
1920

2021

21-
def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
22-
language = lichen_config_data["language"]
23-
sequence_length = int(lichen_config_data["sequence_length"])
22+
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
23+
language = lichen_run_config["language"]
24+
sequence_length = int(lichen_run_config["sequence_length"])
2425

2526
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
2627
with open(data_json_path) as token_data_file:
@@ -39,69 +40,76 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
3940
token_values[x:x+sequence_length]).encode())
4041
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
4142

43+
if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
44+
token_hashed_values = token_hashed_values[slice(0, lichen_config["max_sequences_per_file"])] # noqa E501
45+
print(f"File {my_hashes_file} truncated after exceeding max sequence limit")
46+
4247
my_hf.write('\n'.join(token_hashed_values))
4348

4449

4550
def main():
4651
start_time = time.time()
4752
args = parse_args()
4853

49-
with open(os.path.join(args.basepath, "config.json")) as lichen_config:
50-
lichen_config_data = json.load(lichen_config)
54+
with open(Path(args.basepath, "config.json")) as lichen_run_config_file:
55+
lichen_run_config = json.load(lichen_run_config_file)
56+
57+
with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
58+
lichen_config = json.load(lichen_config_file)
5159

5260
print("HASH ALL...", end="")
5361

5462
# ==========================================================================
5563
# walk the subdirectories of this gradeable
56-
users_dir = os.path.join(args.basepath, "users")
64+
users_dir = Path(args.basepath, "users")
5765
if not os.path.isdir(users_dir):
5866
raise SystemExit("ERROR! Unable to find users directory")
5967

6068
for user in sorted(os.listdir(users_dir)):
61-
user_dir = os.path.join(users_dir, user)
69+
user_dir = Path(users_dir, user)
6270
if not os.path.isdir(user_dir):
6371
continue
6472

6573
for version in sorted(os.listdir(user_dir)):
66-
my_dir = os.path.join(user_dir, version)
74+
my_dir = Path(user_dir, version)
6775
if not os.path.isdir(my_dir):
6876
continue
6977

70-
my_tokenized_file = os.path.join(my_dir, "tokens.json")
71-
my_hashes_file = os.path.join(my_dir, "hashes.txt")
72-
hasher(lichen_config_data, my_tokenized_file, my_hashes_file)
78+
my_tokenized_file = Path(my_dir, "tokens.json")
79+
my_hashes_file = Path(my_dir, "hashes.txt")
80+
hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file)
7381

7482
# ==========================================================================
7583
# walk the subdirectories of the other gradeables
7684

77-
other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
85+
other_gradeables_dir = Path(args.basepath, "other_gradeables")
7886
if not os.path.isdir(other_gradeables_dir):
7987
raise SystemExit("ERROR! Unable to find other gradeables directory")
8088

8189
for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
82-
other_gradeable_dir = os.path.join(other_gradeables_dir, other_gradeable)
90+
other_gradeable_dir = Path(other_gradeables_dir, other_gradeable)
8391
if not os.path.isdir(other_gradeable_dir):
8492
continue
8593

8694
for other_user in sorted(os.listdir(other_gradeable_dir)):
87-
other_user_dir = os.path.join(other_gradeable_dir, other_user)
95+
other_user_dir = Path(other_gradeable_dir, other_user)
8896
if not os.path.isdir(other_user_dir):
8997
continue
9098

9199
for other_version in sorted(os.listdir(other_user_dir)):
92-
other_version_dir = os.path.join(other_user_dir, other_version)
100+
other_version_dir = Path(other_user_dir, other_version)
93101
if not os.path.isdir(other_version_dir):
94102
continue
95103

96-
other_tokenized_file = os.path.join(other_version_dir, "tokens.json")
97-
other_hashes_file = os.path.join(other_version_dir, "hashes.txt")
98-
hasher(lichen_config_data, other_tokenized_file, other_hashes_file)
104+
other_tokenized_file = Path(other_version_dir, "tokens.json")
105+
other_hashes_file = Path(other_version_dir, "hashes.txt")
106+
hasher(lichen_config, lichen_run_config, other_tokenized_file, other_hashes_file)
99107

100108
# ==========================================================================
101109
# hash the provided code
102-
provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json")
103-
provided_code_hashed = os.path.join(args.basepath, "provided_code", "hashes.txt")
104-
hasher(lichen_config_data, provided_code_tokenized, provided_code_hashed)
110+
provided_code_tokenized = Path(args.basepath, "provided_code", "tokens.json")
111+
provided_code_hashed = Path(args.basepath, "provided_code", "hashes.txt")
112+
hasher(lichen_config, lichen_run_config, provided_code_tokenized, provided_code_hashed)
105113

106114
# ==========================================================================
107115
end_time = time.time()

bin/lichen_config.json

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"concat_max_total_bytes": 1000000000,
3+
"max_sequences_per_file": 10000,
4+
"max_matching_positions": 30
5+
}

bin/process_all.sh

+40-26
Original file line numberDiff line numberDiff line change
@@ -7,57 +7,71 @@
77

88
# TODO: Assert permissions, as necessary
99

10-
basepath=$1 # holds the path to a directory containing a config for this gradeable
10+
BASEPATH=$1 # holds the path to a directory containing a config for this gradeable
1111
# (probably .../lichen/gradeable/<unique number>/ on Submitty)
1212

13-
datapath=$2 # holds the path to a directory conatining courses and their data
13+
DATAPATH=$2 # holds the path to a directory containing courses and their data
1414
# (probably /var/local/submitty/courses on Submitty)
1515

16+
KILL_ERROR_MESSAGE="
17+
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
18+
* An error occurred while running Lichen. Your run was probably killed for    *
19+
* exceeding the configured resource limits. Before rerunning, perhaps try any *
20+
* of the following edits to the configuration: *
21+
* - Increasing the sequence length *
22+
* - Using only active version *
23+
* - Decreasing the common code threshold *
24+
* - Selecting fewer files to be compared *
25+
* - Comparing against fewer other gradeables *
26+
* - Uploading provided code files *
27+
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
28+
";
29+
1630
# kill the script if there is no config file
17-
if [ ! -f "${basepath}/config.json" ]; then
31+
if [ ! -f "${BASEPATH}/config.json" ]; then
1832
echo "Unable to find config.json in provided directory"
1933
exit 1
2034
fi
2135

2236

2337
# delete any previous run results
2438
# TODO: determine if any caching should occur
25-
rm -rf "${basepath}/logs"
26-
rm -rf "${basepath}/other_gradeables"
27-
rm -rf "${basepath}/users"
28-
rm -f "${basepath}/overall_ranking.txt"
29-
rm -f "${basepath}/provided_code/submission.concatenated"
30-
rm -f "${basepath}/provided_code/tokens.json"
31-
rm -f "${basepath}/provided_code/hashes.txt"
39+
rm -rf "${BASEPATH}/logs"
40+
rm -rf "${BASEPATH}/other_gradeables"
41+
rm -rf "${BASEPATH}/users"
42+
rm -f "${BASEPATH}/overall_ranking.txt"
43+
rm -f "${BASEPATH}/provided_code/submission.concatenated"
44+
rm -f "${BASEPATH}/provided_code/tokens.json"
45+
rm -f "${BASEPATH}/provided_code/hashes.txt"
3246

3347
# create these directories if they don't already exist
34-
mkdir -p "${basepath}/logs"
35-
mkdir -p "${basepath}/provided_code"
36-
mkdir -p "${basepath}/provided_code/files"
37-
mkdir -p "${basepath}/other_gradeables"
38-
mkdir -p "${basepath}/users"
48+
mkdir -p "${BASEPATH}/logs"
49+
mkdir -p "${BASEPATH}/provided_code"
50+
mkdir -p "${BASEPATH}/provided_code/files"
51+
mkdir -p "${BASEPATH}/other_gradeables"
52+
mkdir -p "${BASEPATH}/users"
3953

4054
# Run Lichen and exit if an error occurs
4155
{
4256
############################################################################
4357
# Finish setting up Lichen run
4458

4559
# The default is r-x and we need PHP to be able to write if edits are made to the provided code
46-
chmod g=rwxs "${basepath}/provided_code/files" || exit 1
60+
chmod g=rwxs "${BASEPATH}/provided_code/files" || exit 1
4761

4862
cd "$(dirname "${0}")" || exit 1
4963

5064
############################################################################
5165
# Do some preprocessing
5266
echo "Beginning Lichen run: $(date +"%Y-%m-%d %H:%M:%S")"
53-
./concatenate_all.py "$basepath" "$datapath" || exit 1
67+
./concatenate_all.py "$BASEPATH" "$DATAPATH" || exit 1
5468

5569
############################################################################
5670
# Move the file somewhere to be processed (eventually this will be a worker machine)
5771

5872
# Tar+zip the file structure and save it to /tmp
59-
cd $basepath || exit 1
60-
archive_name=$(sha1sum "${basepath}/config.json" | awk '{ print $1 }') || exit 1
73+
cd $BASEPATH || exit 1
74+
archive_name=$(sha1sum "${BASEPATH}/config.json" | awk '{ print $1 }') || exit 1
6175
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "config.json" "other_gradeables" "users" "provided_code" || exit 1
6276
cd "$(dirname "${0}")" || exit 1
6377

@@ -71,21 +85,21 @@ mkdir -p "${basepath}/users"
7185

7286
############################################################################
7387
# Run Lichen
74-
./tokenize_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
75-
./hash_all.py "$tmp_location" || { rm -rf $tmp_location; exit 1; }
76-
./compare_hashes.out "$tmp_location" || { rm -rf $tmp_location; exit 1; }
88+
./tokenize_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
89+
./hash_all.py "$tmp_location" || { rm -rf "$tmp_location"; exit 1; }
90+
./compare_hashes.out "$tmp_location" || { rm -rf "$tmp_location"; echo "${KILL_ERROR_MESSAGE}"; exit 1; }
7791

7892
############################################################################
7993
# Zip the results back up and send them back to the course's lichen directory
8094
cd $tmp_location || exit 1
8195
tar -czf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" "."
82-
rm -rf $tmp_location || exit 1
96+
rm -rf "$tmp_location" || exit 1
8397

8498
# TODO: Move the archive back from worker machine
8599

86100
# Extract archive and restore Lichen file structure
87-
cd $basepath || exit 1
88-
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$basepath"
101+
cd "$BASEPATH" || exit 1
102+
tar --skip-old-files -xzf "/tmp/LICHEN_JOB_${archive_name}.tar.gz" -C "$BASEPATH"
89103
rm "/tmp/LICHEN_JOB_${archive_name}.tar.gz" || exit 1
90104

91-
} >> "${basepath}/logs/lichen_job_output.txt" 2>&1
105+
} >> "${BASEPATH}/logs/lichen_job_output.txt" 2>&1

compare_hashes/compare_hashes.cpp

+17-4
Original file line numberDiff line numberDiff line change
@@ -101,16 +101,22 @@ int main(int argc, char* argv[]) {
101101
time_t overall_start, overall_end;
102102
time(&overall_start);
103103

104+
// ===========================================================================
105+
// load Lichen config data
106+
std::ifstream lichen_config_istr("./lichen_config.json");
107+
assert(lichen_config_istr.good());
108+
nlohmann::json lichen_config = nlohmann::json::parse(lichen_config_istr);
109+
104110
// ===========================================================================
105111
// load config info
106112

107-
assert (argc == 2);
113+
assert(argc == 2);
108114
std::string lichen_gradeable_path_str = argv[1];
109115
boost::filesystem::path lichen_gradeable_path = boost::filesystem::system_complete(lichen_gradeable_path_str);
110116
boost::filesystem::path config_file_json_path = lichen_gradeable_path / "config.json";
111117

112118
std::ifstream istr(config_file_json_path.string());
113-
assert (istr.good());
119+
assert(istr.good());
114120
nlohmann::json config_file_json = nlohmann::json::parse(istr);
115121

116122
std::string semester = config_file_json.value("semester", "ERROR");
@@ -320,7 +326,7 @@ int main(int argc, char* argv[]) {
320326
continue;
321327
}
322328

323-
// Save this submissions highest percent match for later when we geenrate overall_rankings.txt
329+
// Save this submission's highest percent match for later when we generate overall_rankings.txt
324330
float percentMatch = (*submission_itr)->getPercentage();
325331

326332
std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr = highest_matches.find((*submission_itr)->student());
@@ -375,12 +381,19 @@ int main(int argc, char* argv[]) {
375381
// keep iterating and editing the same object until a we get to a different submission
376382
if (matching_positions_itr->student != other["username"]
377383
|| matching_positions_itr->version != other["version"]
378-
|| matching_positions_itr->source_gradeable != other["source_gradeable"]) {
384+
|| matching_positions_itr->source_gradeable != other["source_gradeable"]
385+
|| matchingpositions.size() >= lichen_config["max_matching_positions"]) {
379386

380387
// found a different one, we push the old one and start over
381388
other["matchingpositions"] = matchingpositions;
382389
others.push_back(other);
383390

391+
if (matchingpositions.size() >= lichen_config["max_matching_positions"]) {
392+
std::cout << "Matching positions array truncated for user: [" << other["username"] << "] version: " << other["version"] << std::endl;
393+
std::cout << " - Try increasing the sequence length to fix this problem." << std::endl;
394+
break;
395+
}
396+
384397
matchingpositions.clear();
385398
other["username"] = matching_positions_itr->student;
386399
other["version"] = matching_positions_itr->version;

0 commit comments

Comments
 (0)