Commit d46cef3

Refactor tokenize_all and hash_all python files (#18)
* Fix hash_all ugly
* remove whitespace
* remove spaces and duplicate code adding data.json
* remove hash map
* fix token traversal
* tweaks to json
* tweaks
* fin
* fix json types
1 parent 5e19e2e commit d46cef3

File tree

6 files changed: +70 −80 lines


bin/concatenate_all.py

+1 −1 (whitespace-only change: trailing whitespace removed)

@@ -103,6 +103,6 @@ def main():
         my_cf.write("\n")
 
     print ("done")
-
+
 if __name__ == "__main__":
     main()

bin/hash_all.py

+13 −37

@@ -20,7 +20,6 @@
 SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
 SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
 
-
 def parse_args():
     parser = argparse.ArgumentParser(description="")
     parser.add_argument("config_path")
@@ -34,48 +33,25 @@ def hasher(args,my_tokenized_file,my_hashes_file):
     language = lichen_config_data["language"]
     sequence_length = int(lichen_config_data["sequence_length"])
 
+    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    with open(data_json_path) as token_data_file:
+        token_data = json.load(token_data_file)
+        if not language in token_data:
+            print("\n\nERROR: UNKNOWN HASHER\n\n")
+            exit(1)
+
     if (sequence_length < 1):
         print ("ERROR! sequence_length must be >= 1")
         exit(1)
 
     with open(my_tokenized_file,'r',encoding='ISO-8859-1') as my_tf:
         with open(my_hashes_file,'w') as my_hf:
             tokens = json.load(my_tf)
+            token_values = [str(x.get(token_data[language]["token_value"])) for x in tokens]
             num = len(tokens)
-            for i in range(0,num-sequence_length):
-                foo=""
-                if language == "plaintext":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("value"))
-
-                elif language == "python":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "cpp":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "java":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                elif language == "mips":
-                    for j in range(0,sequence_length):
-                        foo+=str(tokens[i+j].get("type"))
-
-                else:
-                    print("\n\nERROR: UNKNOWN HASHER\n\n")
-                    exit(1)
-
-                hash_object = hashlib.md5(foo.encode())
-                hash_object_string=hash_object.hexdigest()
-                #FIXME: this truncation should be adjusted after more full-scale testing
-                #hash_object_string_truncated=hash_object_string[0:4]
-                hash_object_string_truncated=hash_object_string[0:8]
-                #my_hf.write(hash_object_string+"\n")
-                my_hf.write(hash_object_string_truncated+"\n")
-
+            #FIXME: this truncation should be adjusted after more full-scale testing
+            token_hashed_values = [ (hashlib.md5(''.join(token_values[x:x+sequence_length]).encode()).hexdigest())[0:8] for x in range(0, num-sequence_length)]
+            my_hf.write('\n'.join(token_hashed_values))
 
 def main():
     args = parse_args()
@@ -88,7 +64,7 @@ def main():
 
     sys.stdout.write("HASH ALL...")
     sys.stdout.flush()
-
+
     # ===========================================================================
     # error checking
     course_dir=os.path.join(SUBMITTY_DATA_DIR,"courses",semester,course)
@@ -118,6 +94,6 @@ def main():
         hasher(args,my_tokenized_file,my_hashes_file)
 
     print("done")
-
+
 if __name__ == "__main__":
     main()
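
The key change above: hash_all.py no longer branches on the language to decide which token field to concatenate. It reads the field name ("value" for plaintext, "type" for the other languages) from data.json's "token_value" entry, then hashes each sliding window of those values with MD5 and keeps the first 8 hex characters. A minimal standalone sketch of that hashing step, assuming tokens is the list loaded from the tokenized JSON file (the function name and sample data are illustrative, not part of the commit):

import hashlib
import json

def hash_token_windows(tokens, token_key, sequence_length):
    # Pull the field named by data.json's "token_value" out of every token.
    token_values = [str(t.get(token_key)) for t in tokens]
    # Hash each window of sequence_length consecutive values with MD5 and
    # keep only the first 8 hex characters, matching the truncation above.
    return [
        hashlib.md5(''.join(token_values[i:i + sequence_length]).encode()).hexdigest()[0:8]
        for i in range(0, len(token_values) - sequence_length)
    ]

# Illustrative input: a few python-style tokens, hashed with a window of 2.
sample_tokens = json.loads('[{"type": "NAME"}, {"type": "OP"}, {"type": "NUMBER"}, {"type": "NEWLINE"}]')
print(hash_token_windows(sample_tokens, "type", 2))

Like the original loop, the range stops at num - sequence_length, so the hashing behavior (including the 8-character truncation flagged by the carried-over FIXME) is unchanged; only the per-language branching moved into data.json.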

bin/process_all.sh

+2 −2 (whitespace-only changes: trailing whitespace removed)

@@ -36,10 +36,10 @@ do
             ignore_submissions+=("$argument")
             ;;
     esac
-    fi
+    fi
 done
 
-/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
+/usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
 /usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --${language}
 /usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --${language}

bin/tokenize_all.py

+18 −40

@@ -28,51 +28,30 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
     lichen_config_data = json.load(lichen_config)
     language = lichen_config_data["language"]
 
-    if language == "plaintext":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
-
-    elif language == "python":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "cpp":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "java":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","java_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    elif language == "mips":
-        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","mips_tokenizer.py")
-        with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w') as outfile:
-                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
-                os.system(command)
-
-    else:
-        print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
-        exit(1)
-
+    language_token_data = dict()
+
+    data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
+    with open(data_json_path, 'r') as token_data_file:
+        token_data = json.load(token_data_file)
+        if not language in token_data:
+            print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
+            exit(1)
+        else:
+            language_token_data = token_data[language]
+
+    tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin", language_token_data["tokenizer"])
+    if not language_token_data.get("input_as_argument"):
+        my_concatenated_file = f'< {my_concatenated_file}'
+    cli_args = ' '.join(language_token_data["command_args"]) if "command_args" in language_token_data else ''
+    command = f'{language_token_data["command_executable"]} {tokenizer} {cli_args} {my_concatenated_file} > {my_tokenized_file}'.strip()
+    os.system(command)
 
 def main():
     args = parse_args()
 
     sys.stdout.write("TOKENIZE ALL...")
     sys.stdout.flush()
-
+
     with open(args.config_path) as lichen_config:
         lichen_config_data = json.load(lichen_config)
         semester = lichen_config_data["semester"]
@@ -108,6 +87,5 @@ def main():
 
     print ("done")
 
-
 if __name__ == "__main__":
     main()
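
The rewritten tokenize() above drives every language from its data.json entry: command_executable picks the interpreter (empty for the compiled plaintext tokenizer), command_args supplies flags such as --ignore_newlines, and input_as_argument decides whether the concatenated file is passed as an argument or shell-redirected with <. A hedged sketch of that command construction in isolation (the helper name and example paths are hypothetical; the real script builds the string inline and runs it with os.system):

import os

def build_tokenizer_command(entry, bin_dir, concatenated_file, tokenized_file):
    # Tokenizer script or binary installed under Lichen/bin.
    tokenizer = os.path.join(bin_dir, entry["tokenizer"])
    # Tokenizers that read stdin get the input file via shell redirection.
    input_part = concatenated_file if entry.get("input_as_argument") else '< ' + concatenated_file
    cli_args = ' '.join(entry.get("command_args", []))
    parts = [entry["command_executable"], tokenizer, cli_args, input_part, '> ' + tokenized_file]
    # Drop empty pieces (no interpreter, no extra args) before joining.
    return ' '.join(p for p in parts if p)

# Example entries mirroring tokenizer/data.json; paths are illustrative.
plaintext = {"tokenizer": "plaintext_tokenizer.out", "command_executable": "",
             "input_as_argument": False, "command_args": ["--ignore_newlines"]}
python_entry = {"tokenizer": "python_tokenizer.py", "command_executable": "python3",
                "input_as_argument": True}
print(build_tokenizer_command(plaintext, "/usr/local/submitty/Lichen/bin", "in.concat", "out.json"))
print(build_tokenizer_command(python_entry, "/usr/local/submitty/Lichen/bin", "in.concat", "out.json"))

One behavioral note grounded in the diff: plaintext now goes through os.system like the other languages, so the shell handles the < redirection that subprocess.call previously did via stdin.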

install_lichen.sh

+1 −0

@@ -62,6 +62,7 @@ cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir
 cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/mips/mips_tokenizer.py ${lichen_installation_dir}/bin/mips_tokenizer.py
+cp ${lichen_repository_dir}/tokenizer/data.json ${lichen_installation_dir}/bin/data.json
 
 
 ########################################################################################################################

tokenizer/data.json

+35 −0

@@ -0,0 +1,35 @@
+{
+    "plaintext": {
+        "tokenizer": "plaintext_tokenizer.out",
+        "command_executable": "",
+        "input_as_argument": false,
+        "command_args": [
+            "--ignore_newlines"
+        ],
+        "token_value": "value"
+    },
+    "python": {
+        "tokenizer": "python_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "cpp": {
+        "tokenizer": "c_tokenizer.py",
+        "command_executable": "python",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "java": {
+        "tokenizer": "java_tokenizer.py",
+        "command_executable": "python",
+        "input_as_argument": true,
+        "token_value": "type"
+    },
+    "mips": {
+        "tokenizer": "mips_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
+        "token_value": "type"
+    }
+}
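
Each data.json entry carries everything the two scripts look up: tokenizer (the file under Lichen/bin), command_executable, input_as_argument, optional command_args, and token_value (the token field that hash_all.py hashes). A small illustrative check, not part of the commit, that a loaded data.json provides those keys for every language:

import json

REQUIRED_KEYS = {"tokenizer", "command_executable", "input_as_argument", "token_value"}

def missing_keys(data_json_path):
    # Report any language entry missing a key that tokenize_all.py or
    # hash_all.py will later read; "command_args" stays optional.
    with open(data_json_path) as f:
        token_data = json.load(f)
    return {language: sorted(REQUIRED_KEYS - entry.keys())
            for language, entry in token_data.items()
            if REQUIRED_KEYS - entry.keys()}

# Hypothetical usage after install_lichen.sh has copied the file:
# print(missing_keys("/usr/local/submitty/Lichen/bin/data.json"))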
