-
Notifications
You must be signed in to change notification settings - Fork 0
feature/implement lexer - part 2 #3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8b17db1
1706ca2
86bb4d2
9ae4ac6
04c4e12
22b0804
e32587b
d477f5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,19 @@ pub struct Lexer { | |
| } | ||
|
|
||
| impl Lexer { | ||
| /// Saves position, runs closure, restores position only if closure returns None | ||
| fn backtrack_if_needed<F, T>(&mut self, f: F) -> Option<T> | ||
| where | ||
| F: FnOnce(&mut Self) -> Option<T>, | ||
| { | ||
| let saved = self.position; | ||
| let result = f(self); | ||
| if result.is_none() { | ||
| self.position = saved; | ||
| } | ||
| result | ||
| } | ||
|
|
||
| pub fn new(input: &str) -> Self { | ||
| Self { | ||
| input: input.chars().collect(), | ||
|
|
@@ -134,6 +147,150 @@ impl Lexer { | |
| self.input[start..self.position].iter().collect() | ||
| } | ||
|
|
||
| // Read a column reference: $?[A-Z]+ | ||
| fn read_column(&mut self) -> String { | ||
| let start = self.position; | ||
|
|
||
| // Optional $ prefix | ||
| if self.current() == Some('$') { | ||
| self.advance(); | ||
| } | ||
|
|
||
| // Read column letters [A-Z]+ | ||
| while let Some(c) = self.current() { | ||
| if c.is_ascii_uppercase() { | ||
| self.advance(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| self.input[start..self.position].iter().collect() | ||
| } | ||
|
|
||
| // Read a row reference: $?[1-9][0-9]* | ||
| fn read_row(&mut self) -> String { | ||
| let start = self.position; | ||
|
|
||
| // Optional $ prefix | ||
| if self.current() == Some('$') { | ||
| self.advance(); | ||
| } | ||
|
|
||
| // Row must start with [1-9] | ||
| if matches!(self.current(), Some('1'..='9')) { | ||
| self.advance(); | ||
| // Followed by [0-9]* | ||
| while let Some(c) = self.current() { | ||
| if c.is_ascii_digit() { | ||
| self.advance(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| self.input[start..self.position].iter().collect() | ||
| } | ||
|
|
||
| // Try to read a cell reference or vertical range starting with column letters | ||
| fn try_read_cell_or_vertical_range(&mut self) -> Option<Token> { | ||
| self.backtrack_if_needed(|lexer| { | ||
| let col1 = lexer.read_column(); | ||
|
|
||
| // Must have at least one letter (not just $) | ||
| let has_letters = col1.chars().any(|c| c.is_ascii_uppercase()); | ||
| if !has_letters { | ||
| return None; | ||
| } | ||
|
|
||
| // Check if followed by ':' | ||
| if lexer.current() == Some(':') { | ||
| lexer.advance(); | ||
| let col2 = lexer.read_column(); | ||
|
|
||
| // Verify col2 has letters | ||
| let has_letters2 = col2.chars().any(|c| c.is_ascii_uppercase()); | ||
| if has_letters2 { | ||
| return Some(Token::VerticalRange(format!("{}:{}", col1, col2))); | ||
| } else { | ||
| return None; | ||
| } | ||
| } | ||
|
|
||
| // Check if followed by a row number (making it a CELL) | ||
| if let Some('$' | '1'..='9') = lexer.current() { | ||
| let row = lexer.read_row(); | ||
| if !row.is_empty() && row.chars().any(|ch| ch.is_ascii_digit()) { | ||
| return Some(Token::Cell(format!("{}{}", col1, row))); | ||
| } | ||
| } | ||
|
|
||
| None | ||
| }) | ||
| } | ||
|
|
||
| // Try to read a horizontal range: $?[0-9]+:$?[0-9]+ | ||
| fn try_read_horizontal_range(&mut self) -> Option<Token> { | ||
| self.backtrack_if_needed(|lexer| { | ||
| // Read first row number | ||
| let row1_start = lexer.position; | ||
| if lexer.current() == Some('$') { | ||
| lexer.advance(); | ||
| } | ||
|
|
||
| // Must start with a digit | ||
| if let Some(c) = lexer.current() { | ||
| if !c.is_ascii_digit() { | ||
| return None; | ||
| } | ||
| } else { | ||
| return None; | ||
| } | ||
|
|
||
| while let Some(c) = lexer.current() { | ||
| if c.is_ascii_digit() { | ||
| lexer.advance(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| let row1: String = lexer.input[row1_start..lexer.position].iter().collect(); | ||
|
|
||
| // Check for ':' | ||
| if lexer.current() != Some(':') { | ||
| return None; | ||
| } | ||
| lexer.advance(); | ||
|
|
||
| // Read second row number | ||
| let row2_start = lexer.position; | ||
| if lexer.current() == Some('$') { | ||
| lexer.advance(); | ||
| } | ||
|
|
||
| if let Some(c) = lexer.current() { | ||
| if !c.is_ascii_digit() { | ||
| return None; | ||
| } | ||
| } else { | ||
| return None; | ||
| } | ||
|
|
||
| while let Some(c) = lexer.current() { | ||
| if c.is_ascii_digit() { | ||
| lexer.advance(); | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| let row2: String = lexer.input[row2_start..lexer.position].iter().collect(); | ||
| Some(Token::HorizontalRange(format!("{}:{}", row1, row2))) | ||
| }) | ||
| } | ||
|
|
||
| pub fn next_token(&mut self) -> Result<Token, LexerError> { | ||
| self.skip_whitespace(); | ||
|
|
||
|
|
@@ -207,6 +364,10 @@ impl Lexer { | |
| self.advance(); | ||
| Ok(Token::Semicolon) | ||
| } | ||
| Some(':') => { | ||
| self.advance(); | ||
| Ok(Token::Colon) | ||
| } | ||
| Some('"') => { | ||
| let s = self.read_string()?; | ||
| Ok(Token::String(s)) | ||
|
|
@@ -222,10 +383,44 @@ impl Lexer { | |
| } | ||
| } | ||
| Some(c) if c.is_ascii_digit() => { | ||
| let num = self.read_number()?; | ||
| Ok(Token::Number(num)) | ||
| // Try horizontal range first (e.g., 1:10) | ||
| if let Some(token) = self.try_read_horizontal_range() { | ||
| Ok(token) | ||
| } else { | ||
| // Otherwise it's a number | ||
| let num = self.read_number()?; | ||
| Ok(Token::Number(num)) | ||
| } | ||
| } | ||
| Some('$') => { | ||
| // Could be a cell reference like $A$1 or range like $A:$B or $1:$10 | ||
| // Try cell/vertical range first | ||
| if let Some(token) = self.try_read_cell_or_vertical_range() { | ||
| Ok(token) | ||
| } else if let Some(token) = self.try_read_horizontal_range() { | ||
| Ok(token) | ||
| } else { | ||
| // Invalid $ usage | ||
| let c = self.current().unwrap_or('$'); | ||
| Err(LexerError::UnexpectedChar(c)) | ||
| } | ||
| } | ||
| Some(c) if c.is_ascii_uppercase() => { | ||
| // Try cell/vertical range first (e.g., A1, A:Z) | ||
| if let Some(token) = self.try_read_cell_or_vertical_range() { | ||
| Ok(token) | ||
| } else { | ||
| // Try identifier for TRUE/FALSE | ||
| let ident = self.read_identifier(); | ||
| match ident.to_uppercase().as_str() { | ||
| "TRUE" => Ok(Token::Bool(true)), | ||
| "FALSE" => Ok(Token::Bool(false)), | ||
| _ => Err(LexerError::UnexpectedChar(c)), | ||
| } | ||
| } | ||
|
Comment on lines
+396
to
+420
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually yeah I feel like this is a parser job
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or maybe not, but I think the backtracking can be removed by doing some sort of precedence, see: https://github.com/spreadsheetlab/XLParser/blob/master/src/XLParser/ExcelFormulaGrammar.cs ^above library references the paper too/the paper also references that library
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that code is just defining a grammar and letting some other library do the lexing and parsing
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think my naming may be bad? By the "backtrack" function name, I was just implementing something like context management in Python, where the position gets reset if there was no match. But yes, the time complexity is higher; in exchange we get code like this which is a little more maintainable and maybe easier to reason about for priorities. If we are aiming for pure performance, I can refactor it though
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what is the best way, leaning towards cleaner code. The last time I did was just writing regex and passing to a library for lexing |
||
| } | ||
| Some(c) if c.is_alphabetic() => { | ||
| // Lowercase letters - try identifier for true/false | ||
| let ident = self.read_identifier(); | ||
| match ident.to_uppercase().as_str() { | ||
| "TRUE" => Ok(Token::Bool(true)), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| use expy::bindings::lexer::Lexer; | ||
| use expy::bindings::token::Token; | ||
|
|
||
| // ============================================================================ | ||
| // SPEC: CELL - $?[A-Z]+$?[1-9][0-9]* | ||
| // Priority: 2 | ||
| // ============================================================================ | ||
|
|
||
yarkhinephyo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#[test]
fn test_cell_simple() {
    // Minimal relative reference: single letter, single digit.
    let tokens = Lexer::new("A1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A1"));
}
|
|
||
#[test]
fn test_cell_double_letter() {
    // Multi-letter columns (AA, AB, ...) must lex as one cell token.
    let tokens = Lexer::new("AA10").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "AA10"));
}
|
|
||
#[test]
fn test_cell_triple_letter() {
    // Excel's last cell: column XFD, row 1048576.
    let tokens = Lexer::new("XFD1048576").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "XFD1048576"));
}
|
|
||
#[test]
fn test_cell_absolute_column() {
    // '$' may pin the column only.
    let tokens = Lexer::new("$A1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "$A1"));
}
|
|
||
#[test]
fn test_cell_absolute_row() {
    // '$' may pin the row only.
    let tokens = Lexer::new("A$1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A$1"));
}
|
|
||
#[test]
fn test_cell_absolute_both() {
    // Fully absolute reference: both column and row pinned.
    let tokens = Lexer::new("$A$1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "$A$1"));
}
|
|
||
#[test]
fn test_cell_large_row() {
    // Multi-digit row numbers stay part of the same cell token.
    let tokens = Lexer::new("B999").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "B999"));
}
|
|
||
#[test]
fn test_cell_multiple() {
    // Whitespace-separated cells lex into consecutive Cell tokens.
    let tokens = Lexer::new("A1 B2 $C$3").tokenize().unwrap();
    for (i, expected) in ["A1", "B2", "$C$3"].iter().enumerate() {
        assert!(matches!(&tokens[i], Token::Cell(s) if s == expected));
    }
}
|
|
||
#[test]
fn test_cell_in_expression() {
    // Cells interleave with operator tokens inside an expression.
    let tokens = Lexer::new("A1 + B2").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A1"));
    assert!(matches!(tokens[1], Token::Plus));
    assert!(matches!(&tokens[2], Token::Cell(s) if s == "B2"));
}
|
|
||
#[test]
fn test_cell_row_zero_variations() {
    // Row 0 does not exist in a spreadsheet, so every variant must fail to lex.
    for input in ["A0", "B0", "$A0", "A$0", "$A$0", "AA0", "XFD0"] {
        let result = Lexer::new(input).tokenize();
        assert!(
            result.is_err(),
            "Input '{}' should error (row 0 is invalid) but got: {:?}",
            input,
            result
        );
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like this is a parser job, not a lexer job
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I don't think so. The paper implies these are still lexical tokens (the grammar has less ambiguity that way)