"""Tokenize an assembly source file and print the tokens as JSON.

The input is scanned line by line. Each token is a dict with the keys
``line`` and ``char`` (both 1-based), ``type`` and ``value``. Token types are
COMMENT (from ``#`` to end of line), STRING_LITERAL, LABEL, DOT_TYPE,
REGISTER, IMMEDIATE, INSTRUCTION and ADDRESS.

Usage: python3 <this script> path/to/input/file
"""
import json
import re
import sys

# These always signify the end of a non-string token. The text is spliced
# into regex character classes below, which is why "$" is escaped and the
# newline is spelled as the regex escape "\n".
separators = r' #\n\$",'
# ".*?" (rather than ".+?") so that the empty literal "" is a string too.
# NOTE: backslash-escaped quotes inside a literal are not recognised.
string_re = re.compile(r'".*?"')
label_re = re.compile(r'^[^#"]+?:')
dot_type_re = re.compile(rf"^\.[^{separators}]+?[{separators}]")
register_re = re.compile(rf'^\$[^#"]+?[{separators}]')
# An optional leading "-" keeps negative numbers from falling through to the
# catch-all instruction/address pattern.
immediate_re = re.compile(rf"^-?\d+?[{separators}]")
# instructions and addresses are combined since they match the same regex
instruction_and_address_re = re.compile(rf"^[^#]+?[{separators}]")

# Ordered by specificity: the first pattern that matches wins.
token_types = [
    ("STRING_LITERAL", string_re),
    ("LABEL", label_re),
    ("DOT_TYPE", dot_type_re),
    ("REGISTER", register_re),
    ("IMMEDIATE", immediate_re),
    ("INSTRUCTION_ADDRESS", instruction_and_address_re),
]


def tokenize_line(line, line_num):
    """Return the list of token dicts found in a single source line.

    ``line`` may or may not end with a newline. ``line_num`` is the 1-based
    line number recorded in every token of this line.
    """
    # Every non-string pattern needs a trailing separator, and the comment
    # slice below drops the final character. Guaranteeing a terminating
    # newline keeps the last line of a file without one from losing tokens.
    if not line.endswith("\n"):
        line += "\n"

    line_tokens = []
    col_num = 0
    found_instruction = False
    while col_num < len(line):
        current = line[col_num]
        if current == "#":
            # A comment runs to the end of the line (newline excluded).
            line_tokens.append(
                {
                    "line": line_num,
                    "char": col_num + 1,
                    "type": "COMMENT",
                    "value": line[col_num:-1],
                }
            )
            break
        if current.isspace():
            col_num += 1
            continue

        # The patterns are anchored with "^", so match against the remainder.
        remainder = line[col_num:]
        for token_type, token_re in token_types:
            token_match = token_re.match(remainder)
            if not token_match:
                continue

            # The first catch-all token on a line is the instruction; any
            # later one is treated as an address.
            if token_type == "INSTRUCTION_ADDRESS":
                token_type = "ADDRESS" if found_instruction else "INSTRUCTION"
                found_instruction = True

            token = {
                "line": line_num,
                "char": col_num + 1,
                "type": token_type,
            }

            matched_text = token_match[0]
            # Land on the last matched character; the shared "+= 1" at the
            # bottom of the loop then steps past it.
            col_num += len(matched_text) - 1

            token_val = matched_text.strip()

            # Correct stray characters: the match includes its terminator.
            if token_type == "LABEL" or token_val[-1] == ",":
                # The ":" of a label or a trailing "," is consumed.
                token_val = token_val[:-1]
            elif token_type != "STRING_LITERAL" and token_val[-1] in '$#"':
                # These terminators begin the next token (register, comment
                # or string), so hand the character back to the scanner.
                token_val = token_val[:-1]
                col_num -= 1

            token["value"] = token_val
            line_tokens.append(token)
            break

        col_num += 1

    return line_tokens


def tokenize(lines):
    """Tokenize an iterable of source lines; line numbers start at 1."""
    tokens = []
    for line_num, line in enumerate(lines, start=1):
        tokens.extend(tokenize_line(line, line_num))
    return tokens


def main(argv=None):
    """Read the file named by ``argv[1]`` and print its tokens as JSON.

    ``argv`` defaults to ``sys.argv``. Prints a usage message and exits with
    status 1 when no input path is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print(f"USAGE: python3 {argv[0]} path/to/input/file")
        sys.exit(1)

    with open(argv[1], "r") as file:
        file_lines = file.readlines()

    print(json.dumps(tokenize(file_lines), indent=4, sort_keys=True))


if __name__ == "__main__":
    main()