Skip to content
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ name = "lc3sim"
path = "src/cli.rs"

[dependencies]
anyhow = "1.0.95"
once_cell = "1.20.2"
regex = "1.11.1"
thiserror = "2"
17 changes: 17 additions & 0 deletions src/assembler/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use crate::defs::LC3Word;

pub mod tokenizer;

pub struct MaybeUnresolvedInstr {
value: LC3Word,
///Label, Start offset, End offset
bindings: Option<(String, u8, u8)>,
}

pub fn translate_line(line: &str) -> MaybeUnresolvedInstr {
todo!()
}

pub fn resolve_instr(instr: MaybeUnresolvedInstr) -> String {
todo!()
}
268 changes: 268 additions & 0 deletions src/assembler/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
use anyhow::{bail, Result};
use once_cell::sync::Lazy;
use regex::{bytes::RegexSet, Regex};

use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};

#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token {
INSTR(Op),
REGISTER(RegAddr),
META(PseudoOp),
STRING(String),
NUM(LC3Word),
COMMENT(String),
QUOTES,
SEMICOLON,
COMMA,
}

// This follows the same ordering as defs.rs > pub enum Op
const INSTR_PATTERN: [&str; 23] = [
r"^ADD$",
r"^AND$",
r"^BR[nN]?[zZ]?[pP]?$",
r"^JMP$",
r"^JSR$",
r"^JSRR$",
r"^LD$",
r"^LDI$",
r"^LDR$",
r"^LEA$",
r"^NOT$",
r"^RET$",
r"^RTI$",
r"^ST$",
r"^STI$",
r"^STR$",
r"^TRAP$",
r"^GETC$",
r"^OUT$",
r"^PUTS$",
r"^IN$",
r"^PUTSP$",
r"^HALT$",
];

const META_PATTERN: [&str; 5] = [r"^.ORIG$", r"^.FILL$", r"^BLKW$", r"^.STRINGZ$", r"^.END$"];
const NUM_PATTERN: &str = r"^[x|#|b]-?[0-9A-F]*$";
const REG_PATTERN: &str = r"^R[0-7],?$";
const COMMENT_PATTERN: &str = r"^;.*$";
const STRING_PATTERN: &str = r"^[0-9a-zA-Z[:punct:]]+$";

// Regexes get lazy compiled then stored for reuse
static RE_REGISTER: Lazy<Regex> = Lazy::new(|| Regex::new(REG_PATTERN).unwrap());
static RE_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(COMMENT_PATTERN).unwrap());
static RE_INSTR: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(INSTR_PATTERN).unwrap());
static RE_META: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(META_PATTERN).unwrap());
static RE_NUM: Lazy<Regex> = Lazy::new(|| Regex::new(NUM_PATTERN).unwrap());
static RE_STRING: Lazy<Regex> = Lazy::new(|| Regex::new(STRING_PATTERN).unwrap());

fn match_op(line: &str, target: Vec<usize>) -> Result<Op> {
let mut instr_type: Op = Op::ILLEGAL;
for item in target {
// this should be fine because there should only ever be 1 item in the vec
// if there isn't then we just match the last
instr_type = match item {
0 => Op::ADD,
1 => Op::AND,
2 => {
// this was written before this returned a vector of tokens
// it might be better to turn these into separate tokens
let n: bool = line.contains(['n', 'N']);
let z: bool = line.contains(['z', 'Z']);
let p: bool = line.contains(['p', 'P']);
Op::BR(n, z, p)
}
3 => Op::JMP,
4 => Op::JSR,
5 => Op::JSRR,
6 => Op::LD,
7 => Op::LDI,
8 => Op::LDR,
9 => Op::LEA,
10 => Op::RET,
11 => Op::RTI,
12 => Op::ST,
13 => Op::STI,
14 => Op::STR,
15 => Op::TRAP,
16 => Op::GETC,
17 => Op::OUT,
18 => Op::PUTS,
19 => Op::IN,
20 => Op::PUTSP,
21 => Op::HALT,
_ => bail!("Could not match with an operation. Likely an illegal op!"),
};
}
Ok(instr_type)
}

fn match_pseudo_op(target: Vec<usize>) -> Result<PseudoOp> {
let mut pseudo_instr_type: PseudoOp = PseudoOp::ILLEGAL;

for item in target {
// this should be fine as there should only ever be 1 item in the vec
// if there isn't then we just match the last
pseudo_instr_type = match item {
0 => PseudoOp::ORIG,
1 => PseudoOp::FILL,
2 => PseudoOp::BLKW,
3 => PseudoOp::STRINGZ,
4 => PseudoOp::END,
_ => bail!("Could not match with any pseudo operation. Likely an illegal psuedo-op!"),
};
}
Ok(pseudo_instr_type)
}

/// Take in a `&str`, returning a `Vec<Token>` that contains all syntax morphemes in the str.
pub fn tokenize(line: &str) -> Result<Vec<Token>> {
let mut token: Vec<Token> = Vec::new(); // this value is ultimately returned

if RE_REGISTER.is_match(line) {
let reg_num_char: char = line.chars().nth(1).unwrap();
let reg_num_int: u8 = reg_num_char.to_digit(10).unwrap() as u8;
token.push(Token::REGISTER(RegAddr::try_from(reg_num_int)?));
if line.ends_with(',') {
token.push(Token::COMMA)
}
Ok(token)
} else if RE_COMMENT.is_match(line) {
token.push(Token::COMMENT(line.to_string()));
Ok(token)
} else if RE_INSTR.is_match(line.as_bytes()) {
let matches: Vec<usize> = RE_INSTR.matches(line.as_bytes()).into_iter().collect();
token.push(Token::INSTR(match_op(line, matches)?));
Ok(token)
} else if RE_META.is_match(line.as_bytes()) {
let matches: Vec<usize> = RE_META.matches(line.as_bytes()).into_iter().collect();
token.push(Token::META(match_pseudo_op(matches)?));
Ok(token)
} else if RE_NUM.is_match(line) {
let num = if line.starts_with('x') {
u16::from_str_radix(line.strip_prefix('x').unwrap(), 16).unwrap()
} else if line.starts_with('#') {
line.strip_prefix('#').unwrap().parse().unwrap()
} else {
bail!("Found invalid number declaration!")
};
token.push(Token::NUM(num));
Ok(token)
} else if RE_STRING.is_match(line.trim_matches('"')) {
// Strings and labels are functionally the same but one has quotes.
// Therefore they aren't differentiated by token here, and should be dealt with
// during lexing
let string = line.trim_matches('"').to_string();
if line.starts_with('"') {
token.push(Token::QUOTES)
}
token.push(Token::STRING(string));
if line.ends_with('"') {
token.push(Token::QUOTES)
}
Ok(token)
} else {
bail!("Could not match with a token");
}
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn tokenize_register() {
// The number of registers is small enough that checking that all of them parse manually is fine
let test_str: &str = "R0";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::REGISTER(RegAddr::Zero));
}

#[test]
fn tokenize_register_comma() {
// The number of registers is small enough that checking that all of them parse manually is fine
let test_str: &str = "R3,";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::REGISTER(RegAddr::Three));
assert_eq!(result[1], Token::COMMA);
}

#[test]
#[should_panic]
fn tokenize_unclean_register() {
let test_str: &str = "R0, A_LABEL";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_ne!(result[0], Token::REGISTER(RegAddr::Zero));
}

#[test]
fn tokenize_comment() {
let test_str: &str = "; Put return addr in R7";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(
result[0],
Token::COMMENT("; Put return addr in R7".to_string())
);
}

#[test]
fn tokenize_instr() {
let test_str: &str = "ADD";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::INSTR(Op::ADD));
}

#[test]
fn tokenize_num_dec() {
let test_str: &str = "#32";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::NUM(32));
}

#[test]
fn tokenize_num_hex() {
let test_str: &str = "x20";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::NUM(32));
}

#[test]
fn tokenize_meta_orig() {
let test_str: &str = ".ORIG";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::META(PseudoOp::ORIG));
}

#[test]
#[should_panic]
fn tokenize_meta_missing_dot() {
let test_str: &str = "END";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::META(PseudoOp::END))
}

#[test]
fn tokenize_string_section() {
let test_str: &str = "Strings!";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::STRING("Strings!".to_string()));
}

#[test]
fn tokenize_string_start() {
let test_str: &str = "\"String?";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::QUOTES);
assert_eq!(result[1], Token::STRING("String?".to_string()));
}

#[test]
fn tokenize_string_end() {
let test_str: &str = "String.\"";
let result: Vec<Token> = tokenize(test_str).unwrap();
assert_eq!(result[0], Token::STRING("String.".to_string()));
assert_eq!(result[1], Token::QUOTES);
}
}
38 changes: 38 additions & 0 deletions src/defs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,41 @@ impl From<RegAddr> for usize {
u8::from(value) as usize
}
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Op {
ADD,
AND,
BR(bool, bool, bool), // NZP
JMP,
JSR,
JSRR,
LD,
LDI,
LDR,
LEA,
NOT,
RET,
RTI,
ST,
STI,
STR,
TRAP,
GETC,
OUT,
PUTS,
IN,
PUTSP,
HALT,
ILLEGAL,
}

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum PseudoOp {
ORIG,
FILL,
BLKW,
STRINGZ,
END,
ILLEGAL,
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod assembler;
pub mod defs;
pub mod executors;
pub mod instruction;
Expand Down
Loading