use anyhow::{bail, Result};
use once_cell::sync::Lazy;
use regex::{bytes::RegexSet, Regex};

use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};

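/// A single lexical unit of LC-3 assembly source.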
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token {
    INSTR(Op),
    REGISTER(RegAddr),
    META(PseudoOp),
    STRING(String),
    NUM(LC3Word),
    COMMENT(String),
    QUOTES,
    SEMICOLON,
    COMMA,
}

// This follows the same ordering as defs.rs > pub enum Op
const INSTR_PATTERN: [&str; 23] = [
    r"^ADD$",
    r"^AND$",
    r"^BR[nN]?[zZ]?[pP]?$",
    r"^JMP$",
    r"^JSR$",
    r"^JSRR$",
    r"^LD$",
    r"^LDI$",
    r"^LDR$",
    r"^LEA$",
    r"^NOT$",
    r"^RET$",
    r"^RTI$",
    r"^ST$",
    r"^STI$",
    r"^STR$",
    r"^TRAP$",
    r"^GETC$",
    r"^OUT$",
    r"^PUTS$",
    r"^IN$",
    r"^PUTSP$",
    r"^HALT$",
];

const META_PATTERN: [&str; 5] = [
    r"^\.ORIG$",
    r"^\.FILL$",
    r"^\.BLKW$",
    r"^\.STRINGZ$",
    r"^\.END$",
];
// Numeric literals: x = hex, # = decimal, b = binary, with an optional sign
const NUM_PATTERN: &str = r"^[x#b]-?[0-9A-F]+$";
const REG_PATTERN: &str = r"^R[0-7],?$";
const COMMENT_PATTERN: &str = r"^;.*$";
const STRING_PATTERN: &str = r"^[0-9a-zA-Z[:punct:]]+$";

// Regexes are lazily compiled on first use, then cached for reuse
static RE_REGISTER: Lazy<Regex> = Lazy::new(|| Regex::new(REG_PATTERN).unwrap());
static RE_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(COMMENT_PATTERN).unwrap());
static RE_INSTR: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(INSTR_PATTERN).unwrap());
static RE_META: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(META_PATTERN).unwrap());
static RE_NUM: Lazy<Regex> = Lazy::new(|| Regex::new(NUM_PATTERN).unwrap());
static RE_STRING: Lazy<Regex> = Lazy::new(|| Regex::new(STRING_PATTERN).unwrap());

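/// Maps a `RegexSet` match index from `INSTR_PATTERN` onto its `Op` variant,
/// decoding the BR condition flags from the original text.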
fn match_op(line: &str, target: Vec<usize>) -> Result<Op> {
    let mut instr_type: Op = Op::ILLEGAL;
    for item in target {
        // this should be fine because there should only ever be 1 item in the vec
        // if there isn't then we just match the last
        instr_type = match item {
            0 => Op::ADD,
            1 => Op::AND,
            2 => {
                // this was written before this returned a vector of tokens
                // it might be better to turn these into separate tokens
                let n: bool = line.contains(['n', 'N']);
                let z: bool = line.contains(['z', 'Z']);
                let p: bool = line.contains(['p', 'P']);
                Op::BR(n, z, p)
            }
            3 => Op::JMP,
            4 => Op::JSR,
            5 => Op::JSRR,
            6 => Op::LD,
            7 => Op::LDI,
            8 => Op::LDR,
            9 => Op::LEA,
            10 => Op::NOT,
            11 => Op::RET,
            12 => Op::RTI,
            13 => Op::ST,
            14 => Op::STI,
            15 => Op::STR,
            16 => Op::TRAP,
            17 => Op::GETC,
            18 => Op::OUT,
            19 => Op::PUTS,
            20 => Op::IN,
            21 => Op::PUTSP,
            22 => Op::HALT,
            _ => bail!("Could not match with an operation. Likely an illegal op!"),
        };
    }
    Ok(instr_type)
}

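/// Maps a `RegexSet` match index from `META_PATTERN` onto its `PseudoOp` variant.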
fn match_pseudo_op(target: Vec<usize>) -> Result<PseudoOp> {
    let mut pseudo_instr_type: PseudoOp = PseudoOp::ILLEGAL;

    for item in target {
        // this should be fine as there should only ever be 1 item in the vec
        // if there isn't then we just match the last
        pseudo_instr_type = match item {
            0 => PseudoOp::ORIG,
            1 => PseudoOp::FILL,
            2 => PseudoOp::BLKW,
            3 => PseudoOp::STRINGZ,
            4 => PseudoOp::END,
            _ => bail!("Could not match with any pseudo operation. Likely an illegal pseudo-op!"),
        };
    }
    Ok(pseudo_instr_type)
}

/// Take in a `&str`, returning a `Vec<Token>` that contains all syntax morphemes in the str.
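///
/// For example, `tokenize("R3,")` yields `[REGISTER(Three), COMMA]`.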
pub fn tokenize(line: &str) -> Result<Vec<Token>> {
    let mut token: Vec<Token> = Vec::new(); // this value is ultimately returned

    if RE_REGISTER.is_match(line) {
        let reg_num_char: char = line.chars().nth(1).unwrap();
        let reg_num_int: u8 = reg_num_char.to_digit(10).unwrap() as u8;
        token.push(Token::REGISTER(RegAddr::try_from(reg_num_int)?));
        if line.ends_with(',') {
            token.push(Token::COMMA)
        }
        Ok(token)
    } else if RE_COMMENT.is_match(line) {
        token.push(Token::COMMENT(line.to_string()));
        Ok(token)
    } else if RE_INSTR.is_match(line.as_bytes()) {
        let matches: Vec<usize> = RE_INSTR.matches(line.as_bytes()).into_iter().collect();
        token.push(Token::INSTR(match_op(line, matches)?));
        Ok(token)
    } else if RE_META.is_match(line.as_bytes()) {
        let matches: Vec<usize> = RE_META.matches(line.as_bytes()).into_iter().collect();
        token.push(Token::META(match_pseudo_op(matches)?));
        Ok(token)
    } else if RE_NUM.is_match(line) {
        // `?` propagates malformed or out-of-range digits as errors instead of panicking
        let num = if let Some(hex) = line.strip_prefix('x') {
            u16::from_str_radix(hex, 16)?
        } else if let Some(dec) = line.strip_prefix('#') {
            dec.parse()?
        } else if let Some(bin) = line.strip_prefix('b') {
            // NUM_PATTERN accepts binary (b) literals, so parse them too
            u16::from_str_radix(bin, 2)?
        } else {
            bail!("Found invalid number declaration!")
        };
        token.push(Token::NUM(num));
        Ok(token)
    } else if RE_STRING.is_match(line.trim_matches('"')) {
        // Strings and labels are functionally the same but one has quotes.
        // Therefore they aren't differentiated by token here, and should be dealt with
        // during lexing
        let string = line.trim_matches('"').to_string();
        if line.starts_with('"') {
            token.push(Token::QUOTES)
        }
        token.push(Token::STRING(string));
        if line.ends_with('"') {
            token.push(Token::QUOTES)
        }
        Ok(token)
    } else {
        bail!("Could not match with a token");
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn tokenize_register() {
        // The number of registers is small enough that checking that all of them parse manually is fine
        let test_str: &str = "R0";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::REGISTER(RegAddr::Zero));
    }

    #[test]
    fn tokenize_register_comma() {
        // The number of registers is small enough that checking that all of them parse manually is fine
        let test_str: &str = "R3,";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::REGISTER(RegAddr::Three));
        assert_eq!(result[1], Token::COMMA);
    }

    #[test]
    #[should_panic]
    fn tokenize_unclean_register() {
        let test_str: &str = "R0, A_LABEL";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_ne!(result[0], Token::REGISTER(RegAddr::Zero));
    }

    #[test]
    fn tokenize_comment() {
        let test_str: &str = "; Put return addr in R7";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(
            result[0],
            Token::COMMENT("; Put return addr in R7".to_string())
        );
    }

    #[test]
    fn tokenize_instr() {
        let test_str: &str = "ADD";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::INSTR(Op::ADD));
    }

    #[test]
    fn tokenize_num_dec() {
        let test_str: &str = "#32";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::NUM(32));
    }

    #[test]
    fn tokenize_num_hex() {
        let test_str: &str = "x20";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::NUM(32));
    }

    #[test]
    fn tokenize_meta_orig() {
        let test_str: &str = ".ORIG";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::META(PseudoOp::ORIG));
    }

    #[test]
    #[should_panic]
    fn tokenize_meta_missing_dot() {
        let test_str: &str = "END";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::META(PseudoOp::END))
    }

    #[test]
    fn tokenize_string_section() {
        let test_str: &str = "Strings!";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::STRING("Strings!".to_string()));
    }

    #[test]
    fn tokenize_string_start() {
        let test_str: &str = "\"String?";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::QUOTES);
        assert_eq!(result[1], Token::STRING("String?".to_string()));
    }

    #[test]
    fn tokenize_string_end() {
        let test_str: &str = "String.\"";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::STRING("String.".to_string()));
        assert_eq!(result[1], Token::QUOTES);
    }
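
    // A few extra regression checks, added during editing rather than in the
    // original commit: they exercise the BR flag decoding, the NOT mapping,
    // binary (b-prefixed) literals, and the .BLKW pseudo-op, using only the
    // `tokenize` function and the `Op`/`PseudoOp` variants referenced above.
    #[test]
    fn tokenize_instr_br_flags() {
        let test_str: &str = "BRnz";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::INSTR(Op::BR(true, true, false)));
    }

    #[test]
    fn tokenize_instr_not() {
        let test_str: &str = "NOT";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::INSTR(Op::NOT));
    }

    #[test]
    fn tokenize_num_bin() {
        let test_str: &str = "b100000";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::NUM(32));
    }

    #[test]
    fn tokenize_meta_blkw() {
        let test_str: &str = ".BLKW";
        let result: Vec<Token> = tokenize(test_str).unwrap();
        assert_eq!(result[0], Token::META(PseudoOp::BLKW));
    }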
}