Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 197 additions & 2 deletions src/bindings/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ pub struct Lexer {
}

impl Lexer {
/// Runs `f` with the current position checkpointed; when `f` yields no
/// match, the position is rolled back so the caller may try another rule.
/// This is the poor-man's backtracking primitive for ambiguous tokens.
fn backtrack_if_needed<F, T>(&mut self, f: F) -> Option<T>
where
    F: FnOnce(&mut Self) -> Option<T>,
{
    let checkpoint = self.position;
    match f(self) {
        Some(value) => Some(value),
        None => {
            // No match: undo everything the closure consumed.
            self.position = checkpoint;
            None
        }
    }
}
Comment on lines +10 to +20
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this is a parser job, not a lexer job

Copy link
Member Author

@yarkhinephyo yarkhinephyo Jan 31, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I don't think so. The paper implies these are still lexical tokens (the grammar has less ambiguity that way)


pub fn new(input: &str) -> Self {
Self {
input: input.chars().collect(),
Expand Down Expand Up @@ -134,6 +147,150 @@ impl Lexer {
self.input[start..self.position].iter().collect()
}

/// Consumes an optional `$` followed by zero or more uppercase column
/// letters (`$?[A-Z]*`) and returns exactly the text consumed.
/// May return `""` or `"$"`; the caller is responsible for validation.
fn read_column(&mut self) -> String {
    let start = self.position;

    // Absolute-reference marker is optional.
    if let Some('$') = self.current() {
        self.advance();
    }

    // Greedily take the run of column letters.
    while matches!(self.current(), Some('A'..='Z')) {
        self.advance();
    }

    self.input[start..self.position].iter().collect()
}

/// Consumes an optional `$` followed by a row number (`[1-9][0-9]*`) and
/// returns exactly the text consumed. A row may not start with `0`, so
/// the result can be `""` or just `"$"`; the caller validates.
fn read_row(&mut self) -> String {
    let start = self.position;

    // Absolute-reference marker is optional.
    if let Some('$') = self.current() {
        self.advance();
    }

    // Leading digit must be non-zero; the rest may be any digit.
    if matches!(self.current(), Some('1'..='9')) {
        self.advance();
        while matches!(self.current(), Some('0'..='9')) {
            self.advance();
        }
    }

    self.input[start..self.position].iter().collect()
}

/// Attempts to lex either a vertical range (`A:Z`, `$A:$C`) or a cell
/// reference (`A1`, `$B$2`) starting at the current position.
/// On failure the position is restored and `None` is returned.
fn try_read_cell_or_vertical_range(&mut self) -> Option<Token> {
    self.backtrack_if_needed(|lexer| {
        let first_col = lexer.read_column();

        // A bare `$` (or nothing at all) is not a column reference.
        if !first_col.chars().any(|c| c.is_ascii_uppercase()) {
            return None;
        }

        // `COL:COL` forms a vertical range.
        if lexer.current() == Some(':') {
            lexer.advance();
            let second_col = lexer.read_column();
            return if second_col.chars().any(|c| c.is_ascii_uppercase()) {
                Some(Token::VerticalRange(format!("{}:{}", first_col, second_col)))
            } else {
                // Colon not followed by a valid column: whole match fails.
                None
            };
        }

        // `COL` directly followed by a row number forms a cell reference.
        if matches!(lexer.current(), Some('$' | '1'..='9')) {
            let row = lexer.read_row();
            // Containing a digit implies non-empty, so this single check
            // suffices to reject `""` and a lone `"$"`.
            if row.chars().any(|ch| ch.is_ascii_digit()) {
                return Some(Token::Cell(format!("{}{}", first_col, row)));
            }
        }

        None
    })
}

// Try to read a horizontal range: $?[0-9]+:$?[0-9]+
fn try_read_horizontal_range(&mut self) -> Option<Token> {
self.backtrack_if_needed(|lexer| {
// Read first row number
let row1_start = lexer.position;
if lexer.current() == Some('$') {
lexer.advance();
}

// Must start with a digit
if let Some(c) = lexer.current() {
if !c.is_ascii_digit() {
return None;
}
} else {
return None;
}

while let Some(c) = lexer.current() {
if c.is_ascii_digit() {
lexer.advance();
} else {
break;
}
}

let row1: String = lexer.input[row1_start..lexer.position].iter().collect();

// Check for ':'
if lexer.current() != Some(':') {
return None;
}
lexer.advance();

// Read second row number
let row2_start = lexer.position;
if lexer.current() == Some('$') {
lexer.advance();
}

if let Some(c) = lexer.current() {
if !c.is_ascii_digit() {
return None;
}
} else {
return None;
}

while let Some(c) = lexer.current() {
if c.is_ascii_digit() {
lexer.advance();
} else {
break;
}
}

let row2: String = lexer.input[row2_start..lexer.position].iter().collect();
Some(Token::HorizontalRange(format!("{}:{}", row1, row2)))
})
}

pub fn next_token(&mut self) -> Result<Token, LexerError> {
self.skip_whitespace();

Expand Down Expand Up @@ -207,6 +364,10 @@ impl Lexer {
self.advance();
Ok(Token::Semicolon)
}
Some(':') => {
self.advance();
Ok(Token::Colon)
}
Some('"') => {
let s = self.read_string()?;
Ok(Token::String(s))
Expand All @@ -222,10 +383,44 @@ impl Lexer {
}
}
Some(c) if c.is_ascii_digit() => {
let num = self.read_number()?;
Ok(Token::Number(num))
// Try horizontal range first (e.g., 1:10)
if let Some(token) = self.try_read_horizontal_range() {
Ok(token)
} else {
// Otherwise it's a number
let num = self.read_number()?;
Ok(Token::Number(num))
}
}
Some('$') => {
// Could be a cell reference like $A$1 or range like $A:$B or $1:$10
// Try cell/vertical range first
if let Some(token) = self.try_read_cell_or_vertical_range() {
Ok(token)
} else if let Some(token) = self.try_read_horizontal_range() {
Ok(token)
} else {
// Invalid $ usage
let c = self.current().unwrap_or('$');
Err(LexerError::UnexpectedChar(c))
}
}
Some(c) if c.is_ascii_uppercase() => {
// Try cell/vertical range first (e.g., A1, A:Z)
if let Some(token) = self.try_read_cell_or_vertical_range() {
Ok(token)
} else {
// Try identifier for TRUE/FALSE
let ident = self.read_identifier();
match ident.to_uppercase().as_str() {
"TRUE" => Ok(Token::Bool(true)),
"FALSE" => Ok(Token::Bool(false)),
_ => Err(LexerError::UnexpectedChar(c)),
}
}
Comment on lines +396 to +420
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually yeah I feel like this is a parser job

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or maybe not, but I think the backtracking can be removed by doing some sort of precedence, see: https://github.com/spreadsheetlab/XLParser/blob/master/src/XLParser/ExcelFormulaGrammar.cs

^above library references the paper too/the paper also references that library

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that code is just defining some grammar and letting some other library do the lexing and parsing

var assembly = typeof(ExcelFormulaGrammar).GetTypeInfo().Assembly;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think my naming may be bad? By the "backtrack" function name, I was just implementing something like context management in Python, where the position gets reset if there was no match.

But yes time complexity is higher but we could get code like this which would be a little more maintainable and maybe easier to reason about for priorities;

 fn parse_reference(&mut self) -> Token {        
                                                                                                                                             
      if let Some(cell) = self.try_read_cell() {                                                                        
          return cell;                                                          
      }                                                                                                                 
                                                      
      if let Some(name) = self.try_read_named_range() {                                                                 
          return name;                                                                                                  
      }

     <more tokens depending on priorities>                                 
      ...                                                                                 
  }      

If we are trying for pure performance, i can refactor it though

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the best way is; leaning towards cleaner code. The last time I did this, I just wrote regexes and passed them to a library for lexing

}
Some(c) if c.is_alphabetic() => {
// Lowercase letters - try identifier for true/false
let ident = self.read_identifier();
match ident.to_uppercase().as_str() {
"TRUE" => Ok(Token::Bool(true)),
Expand Down
6 changes: 6 additions & 0 deletions src/bindings/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,17 @@ pub enum Token {
// Operators - String
Concatenate,

// References
Cell(String), // $?[A-Z]+$?[1-9][0-9]* e.g., A1, $B$2, AA100
VerticalRange(String), // $?[A-Z]+:$?[A-Z]+ e.g., A:Z, $A:$C
HorizontalRange(String), // $?[0-9]+:$?[0-9]+ e.g., 1:10, $5:$8

// Delimiters
LeftBrace,
RightBrace,
Comma,
Semicolon,
Colon,

// End of input
Eof,
Expand Down
21 changes: 6 additions & 15 deletions tests/lexer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@ This document tracks the implementation and test coverage of the Excel formula l

- **ERROR-REF** - Reference error literal `#REF!`

### To be implemented

- **CELL** - Cell reference `$? [A-Z]+ $? [1-9][0-9]*`
- [ ] Simple cell reference (e.g., A1)
- [ ] Absolute row (e.g., A$1)
- [ ] Absolute column (e.g., $A1)
- [ ] Fully absolute (e.g., $A$1)
- [ ] Multi-letter columns (e.g., AA1, ZZ100)

- **HORIZONTAL-RANGE** - Range of rows `$? [0-9]+ : $? [0-9]+`

- **VERTICAL-RANGE** - Range of columns `$? [A-Z]+ : $? [A-Z]+`

### To be implemented

- **DDECALL** - Dynamic Data Exchange link `' ([^ '] | ")+ '`
- [ ] Basic DDE calls
Expand All @@ -34,14 +33,6 @@ This document tracks the implementation and test coverage of the Excel formula l
- **FILE** - External file reference `\[ [0-9]+ \]`
- [ ] File references

- **HORIZONTAL-RANGE** - Range of rows `$? [0-9]+ : $? [0-9]+`
- [ ] Simple row range (e.g., 1:5)
- [ ] Absolute row ranges

- **VERTICAL-RANGE** - Range of columns `$? [A-Z]+ : $? [A-Z]+`
- [ ] Simple column range (e.g., A:C)
- [ ] Absolute column ranges

- **NR** - Named range `[A-Za-z_\\][A-Za-z0-9_.\\]*` (pattern reconstructed from the XLParser NameToken definition — confirm against the paper)
- [ ] Simple named ranges
- [ ] Named ranges with underscores
Expand Down
91 changes: 91 additions & 0 deletions tests/lexer/test_cell.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use expy::bindings::lexer::Lexer;
use expy::bindings::token::Token;

// ============================================================================
// SPEC: CELL - $?[A-Z]+$?[1-9][0-9]*
// Priority: 2
// ============================================================================

#[test]
fn test_cell_simple() {
    // A plain relative reference lexes to a single Cell token.
    let tokens = Lexer::new("A1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A1"));
}

#[test]
fn test_cell_double_letter() {
    // Two-letter columns (AA..) are accepted.
    let tokens = Lexer::new("AA10").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "AA10"));
}

#[test]
fn test_cell_triple_letter() {
    // The last cell on an Excel sheet exercises three-letter columns
    // and a seven-digit row.
    let tokens = Lexer::new("XFD1048576").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "XFD1048576"));
}

#[test]
fn test_cell_absolute_column() {
    // `$` may pin just the column.
    let tokens = Lexer::new("$A1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "$A1"));
}

#[test]
fn test_cell_absolute_row() {
    // `$` may pin just the row.
    let tokens = Lexer::new("A$1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A$1"));
}

#[test]
fn test_cell_absolute_both() {
    // Both column and row may be pinned.
    let tokens = Lexer::new("$A$1").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "$A$1"));
}

#[test]
fn test_cell_large_row() {
    // Multi-digit rows are consumed in full.
    let tokens = Lexer::new("B999").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "B999"));
}

#[test]
fn test_cell_multiple() {
    // Whitespace-separated references each become their own Cell token.
    let tokens = Lexer::new("A1 B2 $C$3").tokenize().unwrap();
    let expected = ["A1", "B2", "$C$3"];
    for (token, want) in tokens.iter().zip(expected) {
        assert!(matches!(token, Token::Cell(s) if s == want));
    }
}

#[test]
fn test_cell_in_expression() {
    // Cell references lex correctly when mixed with operators.
    let tokens = Lexer::new("A1 + B2").tokenize().unwrap();
    assert!(matches!(&tokens[0], Token::Cell(s) if s == "A1"));
    assert!(matches!(tokens[1], Token::Plus));
    assert!(matches!(&tokens[2], Token::Cell(s) if s == "B2"));
}

#[test]
fn test_cell_row_zero_variations() {
    // Rows are 1-based, so row 0 must be rejected in every
    // absolute/relative combination and at every column width.
    for input in ["A0", "B0", "$A0", "A$0", "$A$0", "AA0", "XFD0"] {
        let result = Lexer::new(input).tokenize();
        assert!(
            result.is_err(),
            "Input '{}' should error (row 0 is invalid) but got: {:?}",
            input,
            result
        );
    }
}
Loading
Loading