Skip to content

Commit 921f82b

Browse files
Merge branch 'main' into trap
2 parents 3e656ff + 89bec56 commit 921f82b

File tree

5 files changed

+327
-0
lines changed

5 files changed

+327
-0
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ name = "testcli"
1919
path = "src/testcli.rs"
2020

2121
[dependencies]
22+
anyhow = "1.0.95"
23+
once_cell = "1.20.2"
24+
regex = "1.11.1"
2225
# To reduce error boilerplate
2326
thiserror = "2"
2427

src/assembler/mod.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use crate::defs::LC3Word;
2+
3+
pub mod tokenizer;
4+
5+
pub struct MaybeUnresolvedInstr {
6+
value: LC3Word,
7+
///Label, Start offset, End offset
8+
bindings: Option<(String, u8, u8)>,
9+
}
10+
11+
pub fn translate_line(line: &str) -> MaybeUnresolvedInstr {
12+
todo!()
13+
}
14+
15+
pub fn resolve_instr(instr: MaybeUnresolvedInstr) -> String {
16+
todo!()
17+
}

src/assembler/tokenizer.rs

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
use anyhow::{bail, Result};
2+
use once_cell::sync::Lazy;
3+
use regex::{bytes::RegexSet, Regex};
4+
5+
use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};
6+
7+
#[derive(Debug, Clone, Eq, PartialEq)]
8+
pub enum Token {
9+
INSTR(Op),
10+
REGISTER(RegAddr),
11+
META(PseudoOp),
12+
STRING(String),
13+
NUM(LC3Word),
14+
COMMENT(String),
15+
QUOTES,
16+
SEMICOLON,
17+
COMMA,
18+
}
19+
20+
// Exact-match patterns for every mnemonic. Order MUST mirror
// defs.rs > pub enum Op: match_op maps RegexSet indices positionally.
const INSTR_PATTERN: [&str; 23] = [
    r"^ADD$",
    r"^AND$",
    // BR carries optional condition flags in either case, e.g. BRnzp, BRn.
    r"^BR[nN]?[zZ]?[pP]?$",
    r"^JMP$",
    r"^JSR$",
    r"^JSRR$",
    r"^LD$",
    r"^LDI$",
    r"^LDR$",
    r"^LEA$",
    r"^NOT$",
    r"^RET$",
    r"^RTI$",
    r"^ST$",
    r"^STI$",
    r"^STR$",
    r"^TRAP$",
    // Trap service-routine aliases:
    r"^GETC$",
    r"^OUT$",
    r"^PUTS$",
    r"^IN$",
    r"^PUTSP$",
    r"^HALT$",
];
46+
47+
// Pseudo-op patterns, in the same order as defs.rs > pub enum PseudoOp.
// BUG FIX: the leading dot must be escaped — a bare `.` matches ANY
// character (so e.g. "XORIG" previously matched), and ".BLKW"'s dot was
// missing entirely, so the `.BLKW` directive could never match.
const META_PATTERN: [&str; 5] = [
    r"^\.ORIG$",
    r"^\.FILL$",
    r"^\.BLKW$",
    r"^\.STRINGZ$",
    r"^\.END$",
];
// Number literals: 'x' (hex), '#' (decimal), or 'b' (binary) prefix.
// BUG FIX: `[x|#|b]` is a character class, so the original also accepted a
// literal '|' prefix; `+` (instead of `*`) requires at least one digit so a
// bare prefix like "x" no longer matches (it previously panicked downstream).
const NUM_PATTERN: &str = r"^[x#b]-?[0-9A-F]+$";
// A register reference R0-R7, optionally followed by a trailing comma.
const REG_PATTERN: &str = r"^R[0-7],?$";
// Comments run from ';' to end of line.
const COMMENT_PATTERN: &str = r"^;.*$";
// Bare words (labels / string contents): alphanumerics plus punctuation.
const STRING_PATTERN: &str = r"^[0-9a-zA-Z[:punct:]]+$";
52+
53+
// Regexes get lazy compiled then stored for reuse
54+
static RE_REGISTER: Lazy<Regex> = Lazy::new(|| Regex::new(REG_PATTERN).unwrap());
55+
static RE_COMMENT: Lazy<Regex> = Lazy::new(|| Regex::new(COMMENT_PATTERN).unwrap());
56+
static RE_INSTR: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(INSTR_PATTERN).unwrap());
57+
static RE_META: Lazy<RegexSet> = Lazy::new(|| RegexSet::new(META_PATTERN).unwrap());
58+
static RE_NUM: Lazy<Regex> = Lazy::new(|| Regex::new(NUM_PATTERN).unwrap());
59+
static RE_STRING: Lazy<Regex> = Lazy::new(|| Regex::new(STRING_PATTERN).unwrap());
60+
61+
fn match_op(line: &str, target: Vec<usize>) -> Result<Op> {
62+
let mut instr_type: Op = Op::ILLEGAL;
63+
for item in target {
64+
// this should be fine because there should only ever be 1 item in the vec
65+
// if there isn't then we just match the last
66+
instr_type = match item {
67+
0 => Op::ADD,
68+
1 => Op::AND,
69+
2 => {
70+
// this was written before this returned a vector of tokens
71+
// it might be better to turn these into separate tokens
72+
let n: bool = line.contains(['n', 'N']);
73+
let z: bool = line.contains(['z', 'Z']);
74+
let p: bool = line.contains(['p', 'P']);
75+
Op::BR(n, z, p)
76+
}
77+
3 => Op::JMP,
78+
4 => Op::JSR,
79+
5 => Op::JSRR,
80+
6 => Op::LD,
81+
7 => Op::LDI,
82+
8 => Op::LDR,
83+
9 => Op::LEA,
84+
10 => Op::RET,
85+
11 => Op::RTI,
86+
12 => Op::ST,
87+
13 => Op::STI,
88+
14 => Op::STR,
89+
15 => Op::TRAP,
90+
16 => Op::GETC,
91+
17 => Op::OUT,
92+
18 => Op::PUTS,
93+
19 => Op::IN,
94+
20 => Op::PUTSP,
95+
21 => Op::HALT,
96+
_ => bail!("Could not match with an operation. Likely an illegal op!"),
97+
};
98+
}
99+
Ok(instr_type)
100+
}
101+
102+
fn match_pseudo_op(target: Vec<usize>) -> Result<PseudoOp> {
103+
let mut pseudo_instr_type: PseudoOp = PseudoOp::ILLEGAL;
104+
105+
for item in target {
106+
// this should be fine as there should only ever be 1 item in the vec
107+
// if there isn't then we just match the last
108+
pseudo_instr_type = match item {
109+
0 => PseudoOp::ORIG,
110+
1 => PseudoOp::FILL,
111+
2 => PseudoOp::BLKW,
112+
3 => PseudoOp::STRINGZ,
113+
4 => PseudoOp::END,
114+
_ => bail!("Could not match with any pseudo operation. Likely an illegal psuedo-op!"),
115+
};
116+
}
117+
Ok(pseudo_instr_type)
118+
}
119+
120+
/// Take in a `&str`, returning a `Vec<Token>` that contains all syntax morphemes in the str.
121+
pub fn tokenize(line: &str) -> Result<Vec<Token>> {
122+
let mut token: Vec<Token> = Vec::new(); // this value is ultimately returned
123+
124+
if RE_REGISTER.is_match(line) {
125+
let reg_num_char: char = line.chars().nth(1).unwrap();
126+
let reg_num_int: u8 = reg_num_char.to_digit(10).unwrap() as u8;
127+
token.push(Token::REGISTER(RegAddr::try_from(reg_num_int)?));
128+
if line.ends_with(',') {
129+
token.push(Token::COMMA)
130+
}
131+
Ok(token)
132+
} else if RE_COMMENT.is_match(line) {
133+
token.push(Token::COMMENT(line.to_string()));
134+
Ok(token)
135+
} else if RE_INSTR.is_match(line.as_bytes()) {
136+
let matches: Vec<usize> = RE_INSTR.matches(line.as_bytes()).into_iter().collect();
137+
token.push(Token::INSTR(match_op(line, matches)?));
138+
Ok(token)
139+
} else if RE_META.is_match(line.as_bytes()) {
140+
let matches: Vec<usize> = RE_META.matches(line.as_bytes()).into_iter().collect();
141+
token.push(Token::META(match_pseudo_op(matches)?));
142+
Ok(token)
143+
} else if RE_NUM.is_match(line) {
144+
let num = if line.starts_with('x') {
145+
u16::from_str_radix(line.strip_prefix('x').unwrap(), 16).unwrap()
146+
} else if line.starts_with('#') {
147+
line.strip_prefix('#').unwrap().parse().unwrap()
148+
} else {
149+
bail!("Found invalid number declaration!")
150+
};
151+
token.push(Token::NUM(num));
152+
Ok(token)
153+
} else if RE_STRING.is_match(line.trim_matches('"')) {
154+
// Strings and labels are functionally the same but one has quotes.
155+
// Therefore they aren't differentiated by token here, and should be dealt with
156+
// during lexing
157+
let string = line.trim_matches('"').to_string();
158+
if line.starts_with('"') {
159+
token.push(Token::QUOTES)
160+
}
161+
token.push(Token::STRING(string));
162+
if line.ends_with('"') {
163+
token.push(Token::QUOTES)
164+
}
165+
Ok(token)
166+
} else {
167+
bail!("Could not match with a token");
168+
}
169+
}
170+
171+
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn tokenize_register() {
        // Few enough registers that spot-checking by hand is fine.
        let result = tokenize("R0").unwrap();
        assert_eq!(result[0], Token::REGISTER(RegAddr::Zero));
    }

    #[test]
    fn tokenize_register_comma() {
        let result = tokenize("R3,").unwrap();
        assert_eq!(result[0], Token::REGISTER(RegAddr::Three));
        assert_eq!(result[1], Token::COMMA);
    }

    #[test]
    #[should_panic]
    fn tokenize_unclean_register() {
        // A register glued to further text must not tokenize cleanly.
        let result = tokenize("R0, A_LABEL").unwrap();
        assert_ne!(result[0], Token::REGISTER(RegAddr::Zero));
    }

    #[test]
    fn tokenize_comment() {
        let line = "; Put return addr in R7";
        let result = tokenize(line).unwrap();
        assert_eq!(result[0], Token::COMMENT(line.to_string()));
    }

    #[test]
    fn tokenize_instr() {
        let result = tokenize("ADD").unwrap();
        assert_eq!(result[0], Token::INSTR(Op::ADD));
    }

    #[test]
    fn tokenize_num_dec() {
        let result = tokenize("#32").unwrap();
        assert_eq!(result[0], Token::NUM(32));
    }

    #[test]
    fn tokenize_num_hex() {
        let result = tokenize("x20").unwrap();
        assert_eq!(result[0], Token::NUM(32));
    }

    #[test]
    fn tokenize_meta_orig() {
        let result = tokenize(".ORIG").unwrap();
        assert_eq!(result[0], Token::META(PseudoOp::ORIG));
    }

    #[test]
    #[should_panic]
    fn tokenize_meta_missing_dot() {
        // Without the leading dot, "END" is just a label/string.
        let result = tokenize("END").unwrap();
        assert_eq!(result[0], Token::META(PseudoOp::END))
    }

    #[test]
    fn tokenize_string_section() {
        let result = tokenize("Strings!").unwrap();
        assert_eq!(result[0], Token::STRING("Strings!".to_string()));
    }

    #[test]
    fn tokenize_string_start() {
        let result = tokenize("\"String?").unwrap();
        assert_eq!(result[0], Token::QUOTES);
        assert_eq!(result[1], Token::STRING("String?".to_string()));
    }

    #[test]
    fn tokenize_string_end() {
        let result = tokenize("String.\"").unwrap();
        assert_eq!(result[0], Token::STRING("String.".to_string()));
        assert_eq!(result[1], Token::QUOTES);
    }
}

src/defs.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,41 @@ impl From<RegAddr> for u16 {
125125
u8::from(value) as u16
126126
}
127127
}
128+
129+
/// Every LC-3 operation mnemonic, plus a fallback for undecodable input.
///
/// NOTE(review): the tokenizer's `INSTR_PATTERN` table mirrors this
/// declaration order, and derived `Ord` depends on it — do not reorder.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Op {
    ADD,
    AND,
    /// Conditional branch; the flags are (n, z, p).
    BR(bool, bool, bool),
    JMP,
    JSR,
    JSRR,
    LD,
    LDI,
    LDR,
    LEA,
    NOT,
    RET,
    RTI,
    ST,
    STI,
    STR,
    TRAP,
    // Trap service-routine aliases:
    GETC,
    OUT,
    PUTS,
    IN,
    PUTSP,
    HALT,
    /// Fallback for mnemonics that could not be decoded.
    ILLEGAL,
}
156+
157+
/// Assembler pseudo operations (directives) such as `.ORIG` / `.END`.
///
/// NOTE(review): the tokenizer's `META_PATTERN` table mirrors this
/// declaration order, and derived `Ord` depends on it — do not reorder.
// Now derives Copy for consistency with the sibling `Op` enum: both are
// small fieldless-or-Copy enums, and the addition is backward-compatible.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum PseudoOp {
    ORIG,
    FILL,
    BLKW,
    STRINGZ,
    END,
    /// Fallback for directives that could not be decoded.
    ILLEGAL,
}

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
pub mod assembler;
12
pub mod defs;
23
pub mod executors;
34
pub mod harnesses;

0 commit comments

Comments
 (0)