Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
fde1fe4
add foundational structures for code->machine translation
wpkelso Dec 20, 2024
68bc5a3
Merge branch 'main' into assembler
wpkelso Dec 20, 2024
c09d59f
lay tokenizer groundwork
wpkelso Dec 23, 2024
8fba3da
Switch to using regex
wpkelso Dec 24, 2024
f175c9e
add all regex strings
wpkelso Dec 24, 2024
a625f0e
add some matching and a failing test
wpkelso Dec 24, 2024
85d5f51
cleaned up register matching logic
wpkelso Dec 25, 2024
f26d5ff
add tests for comments and unclean register strings
wpkelso Dec 25, 2024
9312b4b
instruction tokenization and test for add
wpkelso Dec 25, 2024
166b73f
tokenize pseudoops
wpkelso Dec 25, 2024
921f5dd
return vector of tokens
wpkelso Dec 25, 2024
217288b
split into multiple files
wpkelso Dec 25, 2024
0dae0e3
num tokens and tests
wpkelso Dec 26, 2024
1d9ce2c
add prefix label pass for lexer
wpkelso Dec 28, 2024
bfae0de
Add test for stripping prefix labels
wpkelso Dec 28, 2024
469ab05
Add wrapper function for lexer interface
wpkelso Dec 28, 2024
06e7699
token validation fn shell & fn inlining
wpkelso Dec 29, 2024
c0007c8
Implementation up to instruction formation, then stuck
wpkelso Jan 2, 2025
0ba9d4e
use strum macros instead of shell tokens
wpkelso Jan 2, 2025
e5cd945
Initial lexer skeleton
Bennett-Petzold Jan 2, 2025
c28b353
Merge branch 'main' into assembler
Bennett-Petzold Jan 2, 2025
c00c7e8
add load lexing
wpkelso Jan 13, 2025
e418601
add all multi-field instructions to lexer
wpkelso Jan 14, 2025
b4c9da9
Add tests for currently implemented instructions
wpkelso Jan 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ path = "src/testcli.rs"
anyhow = "1.0.95"
once_cell = "1.20.2"
regex = "1.11.1"
strum = { version = "0.26.3", features = ["derive"] }
strum_macros = "0.26.4"
# To reduce error boilerplate
thiserror = "2"

Expand Down
327 changes: 327 additions & 0 deletions src/assembler/lexer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
use crate::{
assembler::{MaybeUnresolvedInstr, Op, PseudoOp, Token},

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE
defs::{LC3Word, RegAddr},

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE
instruction::{ADD_OPCODE, AND_OPCODE, ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, ALL_LOAD_OPCODES, ALL_STORE_OPCODES, TRAP_OPCODE, NOT_OPCODE},

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE

Check warning

Code scanning / clippy

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE Warning

unused imports: ALL_JUMP_OPCODES, BRANCH_OPCODE, JSR_OPCODE, PseudoOp, RegAddr, and TRAP_OPCODE
};
use anyhow::{bail, Result};

// All of these functions are inlined because they work on the same exact data but are split up for
// legibility

/// First stage of the lexer operation, where any prefix labels are stripped out.
///
/// Returns the label (when the chain starts with a `STRING` token) together with
/// the remaining tokens. An empty chain is returned unchanged with no label,
/// rather than panicking as an index access would.
#[inline]
pub fn prefix_label_pass(token_chain: &[Token]) -> (Option<&str>, &[Token]) {
    // A leading STRING token is interpreted as this line's label.
    if let Some(Token::STRING(label)) = token_chain.first() {
        (Some(label.as_str()), &token_chain[1..])
    } else {
        (None, token_chain)
    }
}

/// Second stage of the lexer operation, where a chain of unresolved instructions is created
/// from the asm op. If the line consists only of a comment (or is empty after label
/// stripping), an empty Vec is returned.
#[inline]
pub fn construct_instruction_pass(token_chain: &[Token]) -> Result<Vec<MaybeUnresolvedInstr>> {
    let mut result: Vec<MaybeUnresolvedInstr> = Vec::new();

    // A line holding only a label is empty by this pass; that is not an error.
    let Some(operation) = token_chain.first() else {
        return Ok(result);
    };

    // All of these helpers are split out purely for legibility; they operate on the
    // same instruction word being assembled.

    if let Token::INSTR(op) = operation {
        // Consumes a register token, placing its address at bit offset SHIFT.
        fn check_reg<const SHIFT: usize>(
            token: &Token,
            instr: &mut MaybeUnresolvedInstr,
        ) -> Result<(), anyhow::Error> {
            if let Token::REGISTER(reg) = token {
                instr.value |= LC3Word::from(*reg) << SHIFT;
                Ok(())
            } else {
                bail!("NOT REG")
            }
        }

        // Consumes a MAX_LEN-bit numeric offset placed at bit offset SHIFT, or a label
        // reference recorded as a binding for later resolution.
        fn check_offset<const SHIFT: u8, const MAX_LEN: u8>(
            token: &Token,
            instr: &mut MaybeUnresolvedInstr,
        ) -> Result<(), anyhow::Error> {
            if let Token::NUM(num) = token {
                // A MAX_LEN-bit field holds values strictly below 1 << MAX_LEN; the
                // previous `1 << (MAX_LEN + 1)` bound admitted values one bit too wide,
                // which would spill into the neighboring field (e.g. the DR bits of LD).
                let max_mask = const { 1 << MAX_LEN };
                if *num < max_mask {
                    instr.value |= *num << SHIFT;
                    Ok(())
                } else {
                    bail!("TOO BIG")
                }
            } else if let Token::STRING(label) = token {
                // NOTE(review): bindings store (label, SHIFT + MAX_LEN, SHIFT); the start
                // offset is one past the field's top bit — confirm the resolver expects
                // an exclusive upper bound.
                instr
                    .bindings
                    .push((label.clone(), const { SHIFT + MAX_LEN }, SHIFT));
                Ok(())
            } else {
                bail!("NOT OFFSET")
            }
        }

        // Consumes either a register (register mode) or a MAX_OFFSET_LEN-bit immediate /
        // label (immediate mode, which additionally sets the mode bit just above the field).
        fn check_reg_or_offset<const SHIFT: u8, const MAX_OFFSET_LEN: u8>(
            token: &Token,
            instr: &mut MaybeUnresolvedInstr,
        ) -> Result<(), anyhow::Error> {
            if let Token::REGISTER(reg) = token {
                instr.value |= LC3Word::from(*reg) << SHIFT;
                Ok(())
            } else if let Token::NUM(num) = token {
                // Same strict bound as check_offset: the immediate must not overlap the
                // mode bit at MAX_OFFSET_LEN.
                let max_mask = const { 1 << MAX_OFFSET_LEN };
                if *num < max_mask {
                    instr.value |= *num << SHIFT;
                    // Flag immediate mode.
                    instr.value |= 1 << MAX_OFFSET_LEN;
                    Ok(())
                } else {
                    bail!("TOO BIG")
                }
            } else if let Token::STRING(label) = token {
                instr
                    .bindings
                    .push((label.clone(), const { SHIFT + MAX_OFFSET_LEN }, SHIFT));
                Ok(())
            } else {
                bail!("NOT REG OR OFFSET")
            }
        }

        // Each instruction maps to its opcode plus the per-operand checkers that fill in
        // the remaining bit fields, in source-operand order.
        let (opcode, sequence) = match op {
            Op::ADD => (
                ADD_OPCODE,
                [check_reg::<9>, check_reg::<6>, check_reg_or_offset::<0, 5>].as_slice(),
            ),
            Op::AND => (
                AND_OPCODE,
                [check_reg::<9>, check_reg::<6>, check_reg_or_offset::<0, 5>].as_slice(),
            ),
            Op::LD => (
                ALL_LOAD_OPCODES[0],
                [check_reg::<9>, check_offset::<0, 9>].as_slice(),
            ),
            Op::LDI => (
                ALL_LOAD_OPCODES[1],
                [check_reg::<9>, check_offset::<0, 9>].as_slice(),
            ),
            Op::LDR => (
                ALL_LOAD_OPCODES[2],
                [check_reg::<9>, check_reg::<6>, check_offset::<0, 6>].as_slice(),
            ),
            Op::LEA => (
                ALL_LOAD_OPCODES[3],
                [check_reg::<9>, check_offset::<0, 9>].as_slice(),
            ),
            Op::ST => (
                ALL_STORE_OPCODES[0],
                [check_reg::<9>, check_offset::<0, 9>].as_slice(),
            ),
            Op::STI => (
                ALL_STORE_OPCODES[1],
                [check_reg::<9>, check_offset::<0, 9>].as_slice(),
            ),
            Op::STR => (
                ALL_STORE_OPCODES[2],
                [check_reg::<9>, check_reg::<6>, check_offset::<0, 6>].as_slice(),
            ),
            Op::NOT => (NOT_OPCODE, [check_reg::<9>, check_reg::<6>].as_slice()),
            _ => todo!(),
        };

        let mut instr = MaybeUnresolvedInstr {
            // Shift the opcode into the top four bits.
            value: (opcode as LC3Word) << 12,
            bindings: Vec::new(),
        };

        // NOT's low six bits are always ones (encoding: 1001 DR SR 111111); no operand
        // checker touches them, so seed them here. This was the previously-failing case
        // noted in the lex_not_instr test.
        if matches!(op, Op::NOT) {
            instr.value |= 0b111111;
        }

        // Fail loudly when operands are missing instead of silently emitting a
        // half-filled instruction (zip alone would just stop early).
        let operands = &token_chain[1..];
        if operands.len() < sequence.len() {
            bail!("Line is invalid, missing operands for the instruction!")
        }

        for (process, token) in sequence.iter().zip(operands) {
            process(token, &mut instr)?;
        }

        result.push(instr);
    } else if operation.is_meta() {
        // Pseudo-op (.ORIG, .FILL, ...) handling is not implemented yet.
        todo!()
    } else if !operation.is_comment() {
        bail!("Line is invalid, does not start with an instruction!")
    }

    Ok(result)
}

/// Wrapper function to provide a cleaner API for the lexing passes.
pub fn lexer(token_chain: &[Token]) -> (Option<&str>, Result<Vec<MaybeUnresolvedInstr>>) {
    let (label, remaining) = prefix_label_pass(token_chain);

    // The Result is passed through untouched so the assembler can attach more
    // context to any error messages it generates (i.e. the expected address of
    // the error).
    (label, construct_instruction_pass(remaining))
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn lex_label_instr() {
        let tokens = [
            Token::STRING("LABEL1".to_string()),
            Token::INSTR(Op::ILLEGAL),
        ];

        let (label, rest) = prefix_label_pass(&tokens);

        assert_eq!(label, Some("LABEL1"));
        assert_eq!(rest[0], Token::INSTR(Op::ILLEGAL));
    }

    #[test]
    fn lex_and_instr() {
        // Register-mode AND, with a leading label.
        let tokens = [
            Token::STRING("LABEL1".to_string()),
            Token::INSTR(Op::AND),
            Token::REGISTER(RegAddr::Zero),
            Token::REGISTER(RegAddr::One),
            Token::REGISTER(RegAddr::Zero),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, Some("LABEL1"));
        assert_eq!(instrs.unwrap()[0].value, 0b0101000001000000);

        // Immediate-mode AND, no label.
        let tokens = [
            Token::INSTR(Op::AND),
            Token::REGISTER(RegAddr::Three),
            Token::REGISTER(RegAddr::One),
            Token::NUM(0b10011),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0101011001110011);
    }

    #[test]
    fn lex_add_instr() {
        // Register-mode ADD, with a leading label.
        let tokens = [
            Token::STRING("LABEL1".to_string()),
            Token::INSTR(Op::ADD),
            Token::REGISTER(RegAddr::Zero),
            Token::REGISTER(RegAddr::One),
            Token::REGISTER(RegAddr::Zero),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, Some("LABEL1"));
        assert_eq!(instrs.unwrap()[0].value, 0b0001000001000000);

        // Immediate-mode ADD, no label.
        let tokens = [
            Token::INSTR(Op::ADD),
            Token::REGISTER(RegAddr::Three),
            Token::REGISTER(RegAddr::One),
            Token::NUM(0b10011),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0001011001110011);
    }

    #[test]
    fn lex_load_instrs() {
        // LD: DR + 9-bit offset.
        let tokens = [
            Token::INSTR(Op::LD),
            Token::REGISTER(RegAddr::Five),
            Token::NUM(0b000111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0010101000111000);

        // LDI: DR + 9-bit offset.
        let tokens = [
            Token::INSTR(Op::LDI),
            Token::REGISTER(RegAddr::Five),
            Token::NUM(0b000111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b1010101000111000);

        // LDR: DR + base register + 6-bit offset.
        let tokens = [
            Token::INSTR(Op::LDR),
            Token::REGISTER(RegAddr::Five),
            Token::REGISTER(RegAddr::Two),
            Token::NUM(0b111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0110101010111000);

        // LEA: DR + 9-bit offset.
        let tokens = [
            Token::INSTR(Op::LEA),
            Token::REGISTER(RegAddr::Five),
            Token::NUM(0b000111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b1110101000111000);
    }

    #[test]
    fn lex_store_instrs() {
        // ST: SR + 9-bit offset.
        let tokens = [
            Token::INSTR(Op::ST),
            Token::REGISTER(RegAddr::Five),
            Token::NUM(0b000111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0011101000111000);

        // STI: SR + 9-bit offset.
        let tokens = [
            Token::INSTR(Op::STI),
            Token::REGISTER(RegAddr::Five),
            Token::NUM(0b000111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b1011101000111000);

        // STR: SR + base register + 6-bit offset.
        let tokens = [
            Token::INSTR(Op::STR),
            Token::REGISTER(RegAddr::Five),
            Token::REGISTER(RegAddr::Two),
            Token::NUM(0b111000),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        assert_eq!(instrs.unwrap()[0].value, 0b0111101010111000);
    }

    #[test]
    fn lex_not_instr() {
        let tokens = [
            Token::INSTR(Op::NOT),
            Token::REGISTER(RegAddr::Five),
            Token::REGISTER(RegAddr::Zero),
        ];
        let (label, instrs) = lexer(&tokens);

        assert_eq!(label, None);
        // NOT must carry all ones in its low six bits: 1001 DR SR 111111.
        assert_eq!(instrs.unwrap()[0].value, 0b1001101000111111);
    }
}
21 changes: 19 additions & 2 deletions src/assembler/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,28 @@
use crate::defs::LC3Word;
use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};
use strum::EnumIs;
use strum_macros::EnumDiscriminants;

pub mod lexer;
pub mod tokenizer;

#[derive(Debug, Clone, Eq, PartialEq)]
pub struct MaybeUnresolvedInstr {
value: LC3Word,
///Label, Start offset, End offset
bindings: Option<(String, u8, u8)>,
bindings: Vec<(String, u8, u8)>,
}

/// A single lexical token produced by the tokenizer.
#[derive(Debug, Clone, Eq, PartialEq, EnumIs, EnumDiscriminants)]
pub enum Token {
// An executable instruction mnemonic.
INSTR(Op),
// A register reference.
REGISTER(RegAddr),
// An assembler pseudo-op / directive.
META(PseudoOp),
// A bare string: a label definition or a label reference.
STRING(String),
// A numeric literal.
NUM(LC3Word),
// The text of a comment.
COMMENT(String),
QUOTES,
SEMICOLON,
COMMA,
}

pub fn translate_line(line: &str) -> MaybeUnresolvedInstr {
Expand Down
16 changes: 2 additions & 14 deletions src/assembler/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,8 @@ use anyhow::{bail, Result};
use once_cell::sync::Lazy;
use regex::{bytes::RegexSet, Regex};

use crate::defs::{LC3Word, Op, PseudoOp, RegAddr};

#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token {
INSTR(Op),
REGISTER(RegAddr),
META(PseudoOp),
STRING(String),
NUM(LC3Word),
COMMENT(String),
QUOTES,
SEMICOLON,
COMMA,
}
use crate::assembler::Token;
use crate::defs::{Op, PseudoOp, RegAddr};

// This follows the same ordering as defs.rs > pub enum Op
const INSTR_PATTERN: [&str; 23] = [
Expand Down
Loading
Loading