
Commit 0034850

tushargrb authored and bmcutler committed
Add python and c/cpp tokenizers (#3)
* add python and c tokenizers. Required packages for tokenizers not installed
* add python and c tokenizers. Add instruction for install and usage. Some minor modifications to c and python tokenizers
* progress on install/integrate c & python tokenizers
* finish integration
1 parent 8238f69 commit 0034850

File tree

12 files changed: +1748 −11 lines changed


bin/hash_all.py

+10 −3
@@ -52,12 +52,19 @@ def hasher(args,my_tokenized_file,my_hashes_file):
         if args.plaintext:
             for j in range(0,args.window):
                 foo+=str(tokens[i+j].get("value"))
+
         elif args.python:
-            print("NEED A PYTHON HASHER")
+            for j in range(0,args.window):
+                foo+=str(tokens[i+j].get("type"))
+
         elif args.cpp:
-            print("NEED A C++ HASHER")
+            for j in range(0,args.window):
+                foo+=str(tokens[i+j].get("type"))
+
         else:
-            print("UNKNOWN HASHER")
+            print("\n\nERROR: UNKNOWN HASHER\n\n")
+            exit(1)
+
         hash_object = hashlib.md5(foo.encode())
         hash_object_string=hash_object.hexdigest()
         #FIXME: this truncation should be adjusted after more full-scale testing
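
To make the windowed hashing concrete, here is a small standalone sketch (not part of the commit; the window contents are invented) of how one fingerprint is computed in the new python/cpp branches, which hash the token types rather than the token values used by plaintext mode:

```python
import hashlib

# hypothetical window of 3 tokens from a tokenized C file
window = [{"type": "KEYWORD",     "value": "int"},
          {"type": "IDENTIFIER",  "value": "main"},
          {"type": "PUNCTUATION", "value": "("}]

# mirror the hasher above: concatenate the "type" field of every token in the window...
foo = "".join(str(t.get("type")) for t in window)   # "KEYWORDIDENTIFIERPUNCTUATION"

# ...and fingerprint the window with MD5
print(hashlib.md5(foo.encode()).hexdigest())
```

Hashing types instead of values means a renamed variable yields the same fingerprint, while plaintext mode remains sensitive to the literal text.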

bin/process_all.sh

+5 −3
@@ -3,11 +3,13 @@
 semester=$1
 course=$2
 gradeable=$3
-window=$4
+language=$4
+window=$5
+
 
 /usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
-/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --plaintext
-/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --plaintext
+/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --${language}
+/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --${language}
 
 /usr/local/submitty/Lichen/bin/compare_hashes.out $semester $course $gradeable --window $window
 
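
With the extra argument, a pipeline run now names the language explicitly. For example (the semester, course, and gradeable values below are hypothetical):

```bash
# hypothetical invocation: tokenize and hash gradeable "hw1" as python with a window of 10
/usr/local/submitty/Lichen/bin/process_all.sh s19 csci1100 hw1 python 10
```

The language string is substituted directly into --${language}, so it must be one of the flags tokenize_all.py and hash_all.py accept (plaintext, python, or cpp).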

bin/tokenize_all.py

+17 −4
@@ -34,14 +34,27 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
     if args.plaintext:
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
         with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w')as outfile:
+            with open (my_tokenized_file,'w') as outfile:
                 subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
+
     elif args.python:
-        print("NEED A PYTHON TOKENIZER")
+        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
+        with open(my_concatenated_file,'r') as infile:
+            with open (my_tokenized_file,'w') as outfile:
+                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                os.system(command)
+
     elif args.cpp:
-        print("NEED A C++ TOKENIZER")
+        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
+        with open(my_concatenated_file,'r') as infile:
+            with open (my_tokenized_file,'w') as outfile:
+                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                os.system(command)
+
     else:
-        print("UNKNOWN TOKENIZER")
+        print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
+        exit(1)
+
 
 def main():
     args = parse_args()
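
The python_tokenizer.py invoked here is added elsewhere in this commit and its contents are not shown in this excerpt. Purely as an illustration, a tokenizer that emits the JSON shape the hasher expects (a list of objects with line, char, type, and value fields) could be built on Python's standard tokenize module roughly like this:

```python
import json
import sys
import tokenize

# tokenize the file named on the command line and print one JSON object per token
tokens = []
with open(sys.argv[1], "rb") as f:                    # tokenize.tokenize() wants a bytes readline
    for tok in tokenize.tokenize(f.readline):
        tokens.append({
            "line":  tok.start[0],                    # 1-based line number
            "char":  tok.start[1] + 1,                # column, shifted to 1-based
            "type":  tokenize.tok_name[tok.type],     # e.g. "NAME", "OP", "NUMBER"
            "value": tok.string,
        })

print(json.dumps(tokens, indent=4, sort_keys=True))
```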

install_lichen.sh

+8 −1
@@ -27,10 +27,13 @@ fi
 
 
 ########################################################################################################################
-# compile & install the tokenizers
+# compile & install the tools
 
 mkdir -p ${lichen_installation_dir}/bin
 
+
+#--------------------
+# plaintext tool
 pushd ${lichen_repository_dir} > /dev/null
 clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o ${lichen_installation_dir}/bin/plaintext_tokenizer.out
 if [ $? -ne 0 ]; then
@@ -40,6 +43,7 @@ fi
 popd > /dev/null
 
 
+#-------------------------------------------
 # compile & install the hash comparison tool
 pushd ${lichen_repository_dir} > /dev/null
 clang++ -I ${nlohmann_dir}/include/ -lboost_system -lboost_filesystem -Wall -g -std=c++11 -Wall compare_hashes/compare_hashes.cpp -o ${lichen_installation_dir}/bin/compare_hashes.out
@@ -54,6 +58,9 @@ popd > /dev/null
 
 cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/
 
+cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
+cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
+
 
 ########################################################################################################################
 # fix permissions
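
After the install script completes, the installation bin directory should hold the two compiled tools plus the copied scripts; a quick, purely illustrative check:

```bash
# illustrative only; lichen_installation_dir is whatever the install script is configured with
ls ${lichen_installation_dir}/bin/
# expected to include: plaintext_tokenizer.out, compare_hashes.out,
# c_tokenizer.py, python_tokenizer.py, and everything copied from the repository's bin/ directory
```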

tokenizer/c/README.md

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Installation Instruction:-
2+
3+
sudo apt-get install python-clang-3.8
4+
5+
Usage:-
6+
7+
python c_tokenizer.py path/to/inputfile
8+
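
Assuming the python-clang-3.8 package above is installed, a quick sanity check that the bindings can load the libclang shared library c_tokenizer.py points at (the path is taken from that script) might look like:

```python
# sanity check only: confirm the clang bindings import and libclang loads from the expected path
import clang.cindex

clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")
index = clang.cindex.Index.create()   # raises if the shared library cannot be loaded
print("libclang loaded:", index)
```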

tokenizer/c/c_tokenizer.py

+36
@@ -0,0 +1,36 @@
+import clang.cindex
+import json
+import sys
+import shutil
+import tempfile
+import os
+
+
+# apparently, the file name must end in .cpp (or some standard
+# c/c++ suffix) to be successfully tokenized
+
+# make a temporary filename
+tmp_cpp_file_handle,tmp_cpp_file_name=tempfile.mkstemp(suffix=".cpp")
+# copy the concatenated file to the temporary file location
+shutil.copy(sys.argv[1],tmp_cpp_file_name)
+
+clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")
+idx = clang.cindex.Index.create()
+
+# parse the input file
+parsed_data = idx.parse(tmp_cpp_file_name)
+
+# remove the temporary file
+os.remove(tmp_cpp_file_name)
+
+tokens = []
+
+for token in parsed_data.get_tokens(extent = parsed_data.cursor.extent):
+    tmp = dict()
+    tmp["line"]=int(token.location.line)
+    tmp["char"]=int(token.location.column)
+    tmp["type"]=(str(token.kind))[10:]
+    tmp["value"]=str(token.spelling)
+    tokens.append(tmp)
+
+print ( json.dumps(tokens, indent=4, sort_keys=True) )
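
The [10:] slice on str(token.kind) strips the "TokenKind." prefix, so the emitted types are names like KEYWORD, IDENTIFIER, PUNCTUATION, and LITERAL. For a hypothetical one-line input such as int main(){return 0;}, the output would begin roughly as follows (illustrative, not captured from a real run):

```json
[
    {
        "char": 1,
        "line": 1,
        "type": "KEYWORD",
        "value": "int"
    },
    {
        "char": 5,
        "line": 1,
        "type": "IDENTIFIER",
        "value": "main"
    },
    {
        "char": 9,
        "line": 1,
        "type": "PUNCTUATION",
        "value": "("
    }
]
```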
