
Commit 5e19e2e

benjaminrsherman authored and bmcutler committed
[Feature:Lichen] Add MIPS tokenizer (#17)
* Add MIPS tokenizer

  The tokenizer identifies string literals, labels, dot-types, registers, immediate values, instructions, addresses, and comments. If two instructions are placed on the same line, it identifies the first as an instruction and the second as a label; that is acceptable, however, since MIPS syntax only allows one instruction per line (see the example after this list).

* Add installation
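For illustration, line 11 of the sample input added in this commit (tokenizer/mips/input.s), `la $a0, hello_world # argument: string`, produces the following entries (abridged and compacted from the expected-output file below):

    {"char": 2,  "line": 11, "type": "INSTRUCTION", "value": "la"}
    {"char": 5,  "line": 11, "type": "REGISTER",    "value": "$a0"}
    {"char": 10, "line": 11, "type": "ADDRESS",     "value": "hello_world"}
    {"char": 22, "line": 11, "type": "COMMENT",     "value": "# argument: string"}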
1 parent 2fd6ca5 commit 5e19e2e

File tree

6 files changed  +250  -0 lines changed


bin/hash_all.py

+8
@@ -56,6 +56,14 @@ def hasher(args,my_tokenized_file,my_hashes_file):
              for j in range(0,sequence_length):
                  foo+=str(tokens[i+j].get("type"))

+         elif language == "java":
+             for j in range(0,sequence_length):
+                 foo+=str(tokens[i+j].get("type"))
+
+         elif language == "mips":
+             for j in range(0,sequence_length):
+                 foo+=str(tokens[i+j].get("type"))
+
          else:
              print("\n\nERROR: UNKNOWN HASHER\n\n")
              exit(1)
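Not part of the commit: a minimal sketch of the windowed fingerprinting this hunk extends, assuming the enclosing loop in hash_all.py walks one window of sequence_length tokens at a time; the fingerprint helper name and the md5 digest are illustrative assumptions, not code from the repository.

    import hashlib

    def fingerprint(tokens, sequence_length):
        # Slide a window over the token stream; for "java" and "mips" each
        # window is reduced to the concatenation of its token *types*.
        hashes = []
        for i in range(0, len(tokens) - sequence_length):
            foo = ""
            for j in range(0, sequence_length):
                foo += str(tokens[i + j].get("type"))
            # Digest choice is an assumption for illustration only.
            hashes.append(hashlib.md5(foo.encode("utf-8")).hexdigest())
        return hashes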

bin/tokenize_all.py

+14
@@ -48,6 +48,20 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
                      command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
                      os.system(command)

+         elif language == "java":
+             tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","java_tokenizer.py")
+             with open(my_concatenated_file,'r') as infile:
+                 with open (my_tokenized_file,'w') as outfile:
+                     command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                     os.system(command)
+
+         elif language == "mips":
+             tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","mips_tokenizer.py")
+             with open(my_concatenated_file,'r') as infile:
+                 with open (my_tokenized_file,'w') as outfile:
+                     command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                     os.system(command)
+
          else:
              print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
              exit(1)
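Not part of the commit: in the hunk above the infile/outfile handles are opened but never referenced; the tokenizer's output reaches my_tokenized_file through the shell redirection built into command. A sketch of the same step using subprocess instead (the run_tokenizer helper name is an illustrative assumption):

    import subprocess

    def run_tokenizer(tokenizer_path, concatenated_file, tokenized_file):
        # Equivalent to: python3 <tokenizer> <concatenated_file> > <tokenized_file>
        with open(tokenized_file, "w") as outfile:
            subprocess.run(["python3", tokenizer_path, concatenated_file],
                           stdout=outfile, check=True)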

install_lichen.sh

+2
@@ -60,6 +60,8 @@ cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/

  cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
  cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
+ cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
+ cp ${lichen_repository_dir}/tokenizer/mips/mips_tokenizer.py ${lichen_installation_dir}/bin/mips_tokenizer.py


  ########################################################################################################################
+128
@@ -0,0 +1,128 @@
[
    {
        "char": 1,
        "line": 1,
        "type": "COMMENT",
        "value": "# Example Hello World program"
    },
    {
        "char": 2,
        "line": 3,
        "type": "DOT_TYPE",
        "value": ".data"
    },
    {
        "char": 1,
        "line": 4,
        "type": "LABEL",
        "value": "hello_world"
    },
    {
        "char": 14,
        "line": 4,
        "type": "DOT_TYPE",
        "value": ".asciiz"
    },
    {
        "char": 22,
        "line": 4,
        "type": "STRING_LITERAL",
        "value": "\"Hello, World!\""
    },
    {
        "char": 1,
        "line": 6,
        "type": "COMMENT",
        "value": "#############################################################################"
    },
    {
        "char": 2,
        "line": 8,
        "type": "DOT_TYPE",
        "value": ".text"
    },
    {
        "char": 1,
        "line": 9,
        "type": "LABEL",
        "value": "main"
    },
    {
        "char": 2,
        "line": 10,
        "type": "INSTRUCTION",
        "value": "li"
    },
    {
        "char": 5,
        "line": 10,
        "type": "REGISTER",
        "value": "$v0"
    },
    {
        "char": 10,
        "line": 10,
        "type": "IMMEDIATE",
        "value": "4"
    },
    {
        "char": 13,
        "line": 10,
        "type": "COMMENT",
        "value": "# syscall 4 (print_str)"
    },
    {
        "char": 2,
        "line": 11,
        "type": "INSTRUCTION",
        "value": "la"
    },
    {
        "char": 5,
        "line": 11,
        "type": "REGISTER",
        "value": "$a0"
    },
    {
        "char": 10,
        "line": 11,
        "type": "ADDRESS",
        "value": "hello_world"
    },
    {
        "char": 22,
        "line": 11,
        "type": "COMMENT",
        "value": "# argument: string"
    },
    {
        "char": 2,
        "line": 12,
        "type": "INSTRUCTION",
        "value": "syscall"
    },
    {
        "char": 11,
        "line": 12,
        "type": "COMMENT",
        "value": "# print the string"
    },
    {
        "char": 2,
        "line": 14,
        "type": "INSTRUCTION",
        "value": "jr"
    },
    {
        "char": 5,
        "line": 14,
        "type": "REGISTER",
        "value": "$ra"
    },
    {
        "char": 11,
        "line": 14,
        "type": "COMMENT",
        "value": "# return to caller"
    }
]

tokenizer/mips/input.s

+14
@@ -0,0 +1,14 @@
# Example Hello World program

 .data
hello_world: .asciiz "Hello, World!"

#############################################################################

 .text
main:
 li $v0, 4  # syscall 4 (print_str)
 la $a0, hello_world # argument: string
 syscall  # print the string

 jr $ra   # return to caller

tokenizer/mips/mips_tokenizer.py

+84
@@ -0,0 +1,84 @@
import json
import re
import sys

if len(sys.argv) < 2:
    print(f"USAGE: python3 {sys.argv[0]} path/to/input/file")
    sys.exit(1)

with open(sys.argv[1], "r") as file:
    file_lines = file.readlines()

separators = ' #\n\$",'  # these always signify the end of a non-string token
string_re = re.compile('".+?"')
label_re = re.compile('^[^#"]+?:')
dot_type_re = re.compile(f"^\.[^{separators}]+?[{separators}]")
register_re = re.compile(f'^\$[^#"]+?[{separators}]')
immediate_re = re.compile(f"^\d+?[{separators}]")
# instructions and addresses are combined since they match the same regex
instruction_and_address_re = re.compile(f"^[^#]+?[{separators}]")

token_types = [
    ("STRING_LITERAL", string_re),
    ("LABEL", label_re),
    ("DOT_TYPE", dot_type_re),
    ("REGISTER", register_re),
    ("IMMEDIATE", immediate_re),
    ("INSTRUCTION_ADDRESS", instruction_and_address_re),
]

tokens = []

for line_num, line in enumerate(file_lines):
    col_num = 0
    found_instruction = False
    while col_num < len(line):
        if line[col_num] == "#":
            token = {
                "line": line_num + 1,
                "char": col_num + 1,
                "type": "COMMENT",
                "value": line[col_num:-1],
            }
            tokens.append(token)
            break
        elif str.isspace(line[col_num]):
            col_num += 1
            continue

        # Attempt to match any token type. Tokens are ordered by specificity,
        # so the first match is always correct
        for token_type, token_re in token_types:
            token_match = token_re.match(line[col_num:])
            if token_match:
                if token_type == "INSTRUCTION_ADDRESS" and not found_instruction:
                    token_type = "INSTRUCTION"
                    found_instruction = True
                elif token_type == "INSTRUCTION_ADDRESS":
                    token_type = "ADDRESS"

                token = {}
                token["line"] = line_num + 1
                token["char"] = col_num + 1
                token["type"] = token_type

                col_num += len(token_match[0]) - 1

                token_val = token_match[0].strip()

                # Correct stray characters
                if token_type == "LABEL" or token_val[-1] == ",":
                    token_val = token_val[:-1]
                elif token_val[-1] == "$" or token_val[-1] == "#":
                    token_val = token_val[:-1]
                    col_num -= 1

                token["value"] = token_val

                tokens.append(token)

                break

        col_num += 1

print(json.dumps(tokens, indent=4, sort_keys=True))
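Not part of the diff: a short trace of the specificity ordering above on the sample line ` la $a0, hello_world # argument: string`. "la" fails the string, label, dot-type, register, and immediate patterns and falls through to instruction_and_address_re, so it is emitted as INSTRUCTION (first match on the line); "$a0," matches register_re and the stray trailing comma is stripped; "hello_world" again reaches instruction_and_address_re, but found_instruction is already set, so it becomes ADDRESS; the trailing comment is handled by the "#" branch before any regex runs. The two shared regexes behave as follows (patterns copied from the code above):

    >>> import re
    >>> separators = ' #\n\$",'
    >>> re.compile(f"^[^#]+?[{separators}]").match("la $a0, hello_world")[0]
    'la '
    >>> re.compile(f'^\$[^#"]+?[{separators}]').match("$a0, hello_world")[0]
    '$a0,'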

0 commit comments
