
Commit 5e19e2e

benjaminrsherman authored and bmcutler committed
[Feature:Lichen] Add MIPS tokenizer (#17)
* Add MIPS tokenizer

  The tokenizer identifies string literals, labels, dot-types, registers, immediate values, instructions, addresses, and comments. If two instructions are placed on the same line, it identifies the first as an instruction and the second as a label; that is acceptable, however, since MIPS syntax only allows one instruction per line (see the example after this list).

* Add installation
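For illustration, line 11 of the sample input added in this commit (tokenizer/mips/input.s), `la $a0, hello_world # argument: string`, produces the following entries (abridged and compacted from the expected-output file below):

    {"char": 2,  "line": 11, "type": "INSTRUCTION", "value": "la"}
    {"char": 5,  "line": 11, "type": "REGISTER",    "value": "$a0"}
    {"char": 10, "line": 11, "type": "ADDRESS",     "value": "hello_world"}
    {"char": 22, "line": 11, "type": "COMMENT",     "value": "# argument: string"}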
1 parent 2fd6ca5 commit 5e19e2e

File tree

6 files changed  +250  -0 lines changed


bin/hash_all.py

+8
@@ -56,6 +56,14 @@ def hasher(args,my_tokenized_file,my_hashes_file):
              for j in range(0,sequence_length):
                  foo+=str(tokens[i+j].get("type"))

+         elif language == "java":
+             for j in range(0,sequence_length):
+                 foo+=str(tokens[i+j].get("type"))
+
+         elif language == "mips":
+             for j in range(0,sequence_length):
+                 foo+=str(tokens[i+j].get("type"))
+
          else:
              print("\n\nERROR: UNKNOWN HASHER\n\n")
              exit(1)
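Not part of the commit: a minimal sketch of the windowed fingerprinting this hunk extends, assuming the enclosing loop in hash_all.py walks one window of sequence_length tokens at a time; the fingerprint helper name and the md5 digest are illustrative assumptions, not code from the repository.

    import hashlib

    def fingerprint(tokens, sequence_length):
        # Slide a window over the token stream; for "java" and "mips" each
        # window is reduced to the concatenation of its token *types*.
        hashes = []
        for i in range(0, len(tokens) - sequence_length):
            foo = ""
            for j in range(0, sequence_length):
                foo += str(tokens[i + j].get("type"))
            # Digest choice is an assumption for illustration only.
            hashes.append(hashlib.md5(foo.encode("utf-8")).hexdigest())
        return hashes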

bin/tokenize_all.py

+14
@@ -48,6 +48,20 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
                      command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                      os.system(command)

+         elif language == "java":
+             tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","java_tokenizer.py")
+             with open(my_concatenated_file,'r') as infile:
+                 with open (my_tokenized_file,'w') as outfile:
+                     command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                     os.system(command)
+
+         elif language == "mips":
+             tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","mips_tokenizer.py")
+             with open(my_concatenated_file,'r') as infile:
+                 with open (my_tokenized_file,'w') as outfile:
+                     command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                     os.system(command)
+
          else:
              print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
              exit(1)
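Not part of the commit: in the hunk above the infile/outfile handles are opened but never referenced; the tokenizer's output reaches my_tokenized_file through the shell redirection built into command. A sketch of the same step using subprocess instead (the run_tokenizer helper name is an illustrative assumption):

    import subprocess

    def run_tokenizer(tokenizer_path, concatenated_file, tokenized_file):
        # Equivalent to: python3 <tokenizer> <concatenated_file> > <tokenized_file>
        with open(tokenized_file, "w") as outfile:
            subprocess.run(["python3", tokenizer_path, concatenated_file],
                           stdout=outfile, check=True)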

install_lichen.sh

+2
@@ -60,6 +60,8 @@ cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/

  cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
  cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
+ cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
+ cp ${lichen_repository_dir}/tokenizer/mips/mips_tokenizer.py ${lichen_installation_dir}/bin/mips_tokenizer.py


  ########################################################################################################################
+128
@@ -0,0 +1,128 @@
[
    {
        "char": 1,
        "line": 1,
        "type": "COMMENT",
        "value": "# Example Hello World program"
    },
    {
        "char": 2,
        "line": 3,
        "type": "DOT_TYPE",
        "value": ".data"
    },
    {
        "char": 1,
        "line": 4,
        "type": "LABEL",
        "value": "hello_world"
    },
    {
        "char": 14,
        "line": 4,
        "type": "DOT_TYPE",
        "value": ".asciiz"
    },
    {
        "char": 22,
        "line": 4,
        "type": "STRING_LITERAL",
        "value": "\"Hello, World!\""
    },
    {
        "char": 1,
        "line": 6,
        "type": "COMMENT",
        "value": "#############################################################################"
    },
    {
        "char": 2,
        "line": 8,
        "type": "DOT_TYPE",
        "value": ".text"
    },
    {
        "char": 1,
        "line": 9,
        "type": "LABEL",
        "value": "main"
    },
    {
        "char": 2,
        "line": 10,
        "type": "INSTRUCTION",
        "value": "li"
    },
    {
        "char": 5,
        "line": 10,
        "type": "REGISTER",
        "value": "$v0"
    },
    {
        "char": 10,
        "line": 10,
        "type": "IMMEDIATE",
        "value": "4"
    },
    {
        "char": 13,
        "line": 10,
        "type": "COMMENT",
        "value": "# syscall 4 (print_str)"
    },
    {
        "char": 2,
        "line": 11,
        "type": "INSTRUCTION",
        "value": "la"
    },
    {
        "char": 5,
        "line": 11,
        "type": "REGISTER",
        "value": "$a0"
    },
    {
        "char": 10,
        "line": 11,
        "type": "ADDRESS",
        "value": "hello_world"
    },
    {
        "char": 22,
        "line": 11,
        "type": "COMMENT",
        "value": "# argument: string"
    },
    {
        "char": 2,
        "line": 12,
        "type": "INSTRUCTION",
        "value": "syscall"
    },
    {
        "char": 11,
        "line": 12,
        "type": "COMMENT",
        "value": "# print the string"
    },
    {
        "char": 2,
        "line": 14,
        "type": "INSTRUCTION",
        "value": "jr"
    },
    {
        "char": 5,
        "line": 14,
        "type": "REGISTER",
        "value": "$ra"
    },
    {
        "char": 11,
        "line": 14,
        "type": "COMMENT",
        "value": "# return to caller"
    }
]

tokenizer/mips/input.s

+14
@@ -0,0 +1,14 @@
# Example Hello World program

 .data
hello_world: .asciiz "Hello, World!"

#############################################################################

 .text
main:
 li $v0, 4  # syscall 4 (print_str)
 la $a0, hello_world # argument: string
 syscall  # print the string

 jr $ra   # return to caller

tokenizer/mips/mips_tokenizer.py

+84
@@ -0,0 +1,84 @@
import json
import re
import sys

if len(sys.argv) < 2:
    print(f"USAGE: python3 {sys.argv[0]} path/to/input/file")
    sys.exit(1)

with open(sys.argv[1], "r") as file:
    file_lines = file.readlines()

separators = ' #\n\$",'  # these always signify the end of a non-string token
string_re = re.compile('".+?"')
label_re = re.compile('^[^#"]+?:')
dot_type_re = re.compile(f"^\.[^{separators}]+?[{separators}]")
register_re = re.compile(f'^\$[^#"]+?[{separators}]')
immediate_re = re.compile(f"^\d+?[{separators}]")
# instructions and addresses are combined since they match the same regex
instruction_and_address_re = re.compile(f"^[^#]+?[{separators}]")

token_types = [
    ("STRING_LITERAL", string_re),
    ("LABEL", label_re),
    ("DOT_TYPE", dot_type_re),
    ("REGISTER", register_re),
    ("IMMEDIATE", immediate_re),
    ("INSTRUCTION_ADDRESS", instruction_and_address_re),
]

tokens = []

for line_num, line in enumerate(file_lines):
    col_num = 0
    found_instruction = False
    while col_num < len(line):
        if line[col_num] == "#":
            token = {
                "line": line_num + 1,
                "char": col_num + 1,
                "type": "COMMENT",
                "value": line[col_num:-1],
            }
            tokens.append(token)
            break
        elif str.isspace(line[col_num]):
            col_num += 1
            continue

        # Attempt to match any token type. Tokens are ordered by specificity,
        # so the first match is always correct
        for token_type, token_re in token_types:
            token_match = token_re.match(line[col_num:])
            if token_match:
                if token_type == "INSTRUCTION_ADDRESS" and not found_instruction:
                    token_type = "INSTRUCTION"
                    found_instruction = True
                elif token_type == "INSTRUCTION_ADDRESS":
                    token_type = "ADDRESS"

                token = {}
                token["line"] = line_num + 1
                token["char"] = col_num + 1
                token["type"] = token_type

                col_num += len(token_match[0]) - 1

                token_val = token_match[0].strip()

                # Correct stray characters
                if token_type == "LABEL" or token_val[-1] == ",":
                    token_val = token_val[:-1]
                elif token_val[-1] == "$" or token_val[-1] == "#":
                    token_val = token_val[:-1]
                    col_num -= 1

                token["value"] = token_val

                tokens.append(token)

                break

        col_num += 1

print(json.dumps(tokens, indent=4, sort_keys=True))
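Not part of the diff: a short trace of the specificity ordering above on the sample line ` la $a0, hello_world # argument: string`. "la" fails the string, label, dot-type, register, and immediate patterns and falls through to instruction_and_address_re, so it is emitted as INSTRUCTION (first match on the line); "$a0," matches register_re and the stray trailing comma is stripped; "hello_world" again reaches instruction_and_address_re, but found_instruction is already set, so it becomes ADDRESS; the trailing comment is handled by the "#" branch before any regex runs. The two shared regexes behave as follows (patterns copied from the code above):

    >>> import re
    >>> separators = ' #\n\$",'
    >>> re.compile(f"^[^#]+?[{separators}]").match("la $a0, hello_world")[0]
    'la '
    >>> re.compile(f'^\$[^#"]+?[{separators}]').match("$a0, hello_world")[0]
    '$a0,'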

0 commit comments
