Commit d46cef3

Refactor tokenize_all and hash_all python files (#18)
* Fix hash_all ugly
* remove whitespace
* remove spaces and duplicate code adding data.json
* remove hash map
* fix token traversal
* tweaks to json
* tweaks
* fin
* fix json types
1 parent 5e19e2e commit d46cef3

File tree

6 files changed: +70 −80 lines


bin/concatenate_all.py

+1 −1 (whitespace-only change: trailing whitespace removed)

@@ -103,6 +103,6 @@ def main():
         my_cf.write("\n")
 
     print ("done")
-
+
 if __name__ == "__main__":
     main()

bin/hash_all.py

+13 −37

@@ -20,7 +20,6 @@
 SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
 SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
 
-
 def parse_args():
     parser = argparse.ArgumentParser(description="")
     parser.add_argument("config_path")
@@ -34,48 +33,25 @@ def hasher(args,my_tokenized_file,my_hashes_file):
     language = lichen_config_data["language"]
     sequence_length = int(lichen_config_data["sequence_length"])
 
+    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    with open(data_json_path) as token_data_file:
+        token_data = json.load(token_data_file)
+        if not language in token_data:
+            print("\n\nERROR: UNKNOWN HASHER\n\n")
+            exit(1)
+
     if (sequence_length < 1):
         print ("ERROR! sequence_length must be >= 1")
         exit(1)
 
     with open(my_tokenized_file,'r',encoding='ISO-8859-1') as my_tf:
         with open(my_hashes_file,'w') as my_hf:
             tokens = json.load(my_tf)
+            token_values = [str(x.get(token_data[language]["token_value"])) for x in tokens]
             num = len(tokens)
-            for i in range(0,num-sequence_length):
-                foo=""
-                if language == "plaintext":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("value"))
-
-                elif language == "python":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "cpp":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "java":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "mips":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                else:
-                    print("\n\nERROR: UNKNOWN HASHER\n\n")
-                    exit(1)
-
-                hash_object = hashlib.md5(foo.encode())
-                hash_object_string=hash_object.hexdigest()
-                #FIXME: this truncation should be adjusted after more full-scale testing
-                #hash_object_string_truncated=hash_object_string[0:4]
-                hash_object_string_truncated=hash_object_string[0:8]
-                #my_hf.write(hash_object_string+"\n")
-                my_hf.write(hash_object_string_truncated+"\n")
-
+            #FIXME: this truncation should be adjusted after more full-scale testing
+            token_hashed_values = [ (hashlib.md5(''.join(token_values[x:x+sequence_length]).encode()).hexdigest())[0:8] for x in range(0, num-sequence_length)]
+            my_hf.write('\n'.join(token_hashed_values))
 
 def main():
     args = parse_args()
@@ -88,7 +64,7 @@ def main():
 
     sys.stdout.write("HASH ALL...")
     sys.stdout.flush()
-
+
     # ===========================================================================
     # error checking
     course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
@@ -118,6 +94,6 @@ def main():
         hasher(args,my_tokenized_file,my_hashes_file)
 
     print("done")
-
+
 if __name__ == "__main__":
     main()
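
The key change above: hash_all.py no longer branches on the language to decide which token field to concatenate. It reads the field name ("value" for plaintext, "type" for the other languages) from data.json's "token_value" entry, then hashes each sliding window of those values with MD5 and keeps the first 8 hex characters. A minimal standalone sketch of that hashing step, assuming tokens is the list loaded from the tokenized JSON file (the function name and sample data are illustrative, not part of the commit):

import hashlib
import json

def hash_token_windows(tokens, token_key, sequence_length):
    # Pull the field named by data.json's "token_value" out of every token.
    token_values = [str(t.get(token_key)) for t in tokens]
    # Hash each window of sequence_length consecutive values with MD5 and
    # keep only the first 8 hex characters, matching the truncation above.
    return [
        hashlib.md5(''.join(token_values[i:i + sequence_length]).encode()).hexdigest()[0:8]
        for i in range(0, len(token_values) - sequence_length)
    ]

# Illustrative input: a few python-style tokens, hashed with a window of 2.
sample_tokens = json.loads('[{"type": "NAME"}, {"type": "OP"}, {"type": "NUMBER"}, {"type": "NEWLINE"}]')
print(hash_token_windows(sample_tokens, "type", 2))

Like the original loop, the range stops at num - sequence_length, so the hashing behavior (including the 8-character truncation flagged by the carried-over FIXME) is unchanged; only the per-language branching moved into data.json.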

bin/process_all.sh

+2 −2 (whitespace-only changes: trailing whitespace removed)

@@ -36,10 +36,10 @@ do
             ignore_submissions+=("$argument")
             ;;
     esac
-    fi
+    fi
 done
 
-/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
+/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
 /usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --${language}
 /usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --${language}

bin/tokenize_all.py

+18 −40

@@ -28,51 +28,30 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
     lichen_config_data = json.load(lichen_config)
     language = lichen_config_data["language"]
 
-    if language == "plaintext":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
-
-    elif language == "python":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "cpp":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "java":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","java_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "mips":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","mips_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    else:
-        print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
-        exit(1)
-
+    language_token_data = dict()
+
+    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    with open(data_json_path, 'r') as token_data_file:
+        token_data = json.load(token_data_file)
+        if not language in token_data:
+            print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
+            exit(1)
+        else:
+            language_token_data = token_data[language]
+
+    tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin", language_token_data["tokenizer"])
+    if not language_token_data.get("input_as_argument"):
+        my_concatenated_file = f'< {my_concatenated_file}'
+    cli_args = ' '.join(language_token_data["command_args"]) if "command_args" in language_token_data else ''
+    command = f'{language_token_data["command_executable"]} {tokenizer} {cli_args} {my_concatenated_file} > {my_tokenized_file}'.strip()
+    os.system(command)
 
 def main():
     args = parse_args()
 
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
-
+
     with open(args.config_path) as lichen_config:
         lichen_config_data = json.load(lichen_config)
         semester = lichen_config_data["semester"]
@@ -108,6 +87,5 @@ def main():
 
     print ("done")
 
-
 if __name__ == "__main__":
     main()
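
The rewritten tokenize() above drives every language from its data.json entry: command_executable picks the interpreter (empty for the compiled plaintext tokenizer), command_args supplies flags such as --ignore_newlines, and input_as_argument decides whether the concatenated file is passed as an argument or shell-redirected with <. A hedged sketch of that command construction in isolation (the helper name and example paths are hypothetical; the real script builds the string inline and runs it with os.system):

import os

def build_tokenizer_command(entry, bin_dir, concatenated_file, tokenized_file):
    # Tokenizer script or binary installed under Lichen/bin.
    tokenizer = os.path.join(bin_dir, entry["tokenizer"])
    # Tokenizers that read stdin get the input file via shell redirection.
    input_part = concatenated_file if entry.get("input_as_argument") else '< ' + concatenated_file
    cli_args = ' '.join(entry.get("command_args", []))
    parts = [entry["command_executable"], tokenizer, cli_args, input_part, '> ' + tokenized_file]
    # Drop empty pieces (no interpreter, no extra args) before joining.
    return ' '.join(p for p in parts if p)

# Example entries mirroring tokenizer/data.json; paths are illustrative.
plaintext = {"tokenizer": "plaintext_tokenizer.out", "command_executable": "",
             "input_as_argument": False, "command_args": ["--ignore_newlines"]}
python_entry = {"tokenizer": "python_tokenizer.py", "command_executable": "python3",
                "input_as_argument": True}
print(build_tokenizer_command(plaintext, "/usr/local/submitty/Lichen/bin", "in.concat", "out.json"))
print(build_tokenizer_command(python_entry, "/usr/local/submitty/Lichen/bin", "in.concat", "out.json"))

One behavioral note grounded in the diff: plaintext now goes through os.system like the other languages, so the shell handles the < redirection that subprocess.call previously did via stdin.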

install_lichen.sh

+1 −0

@@ -62,6 +62,7 @@ cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir
 cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/mips/mips_tokenizer.py ${lichen_installation_dir}/bin/mips_tokenizer.py
+cp ${lichen_repository_dir}/tokenizer/data.json ${lichen_installation_dir}/bin/data.json
 
 
 ########################################################################################################################

tokenizer/data.json

+35 −0

@@ -0,0 +1,35 @@
+{
+    "plaintext": {
+        "tokenizer": "plaintext_tokenizer.out",
+        "command_executable": "",
+        "input_as_argument": false,
+        "command_args": [
+            "--ignore_newlines"
+        ],
+        "token_value": "value"
+    },
+    "python": {
+        "tokenizer": "python_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "cpp": {
+        "tokenizer": "c_tokenizer.py",
+        "command_executable": "python",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "java": {
+        "tokenizer": "java_tokenizer.py",
+        "command_executable": "python",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "mips": {
+        "tokenizer": "mips_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
+        "token_value": "type"
+    }
+}
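
Each data.json entry carries everything the two scripts look up: tokenizer (the file under Lichen/bin), command_executable, input_as_argument, optional command_args, and token_value (the token field that hash_all.py hashes). A small illustrative check, not part of the commit, that a loaded data.json provides those keys for every language:

import json

REQUIRED_KEYS = {"tokenizer", "command_executable", "input_as_argument", "token_value"}

def missing_keys(data_json_path):
    # Report any language entry missing a key that tokenize_all.py or
    # hash_all.py will later read; "command_args" stays optional.
    with open(data_json_path) as f:
        token_data = json.load(f)
    return {language: sorted(REQUIRED_KEYS - entry.keys())
            for language, entry in token_data.items()
            if REQUIRED_KEYS - entry.keys()}

# Hypothetical usage after install_lichen.sh has copied the file:
# print(missing_keys("/usr/local/submitty/Lichen/bin/data.json"))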
