Merged
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,11 @@

## Unreleased

## [1.5.12] - 2026-01-03
- Fix lexer not handling UTF-8 characters correctly in comments.
- Multi-byte UTF-8 characters (e.g., `→`) in comments caused incorrect span positions for subsequent tokens.
- The lexer now properly tracks byte positions instead of character indices.

## [1.5.11] - 2025-12-24
- Fix broken npm release workflow.

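The changelog entry captures the core issue: a char index and a byte offset agree only while the input is pure ASCII, and spans are byte-based. A minimal standalone sketch of the divergence (the string literal and variable names below are illustrative, not from this PR):

```rust
fn main() {
    let src = "// a → b\n#define";

    // '→' is one char but three bytes (0xE2 0x86 0x92).
    assert_eq!('→'.len_utf8(), 3);

    // The '\n' is the 9th char (index 8) but sits at byte offset 10.
    let char_index = src.chars().position(|c| c == '\n').unwrap();
    let byte_offset = src.find('\n').unwrap();
    assert_eq!((char_index, byte_offset), (8, 10));

    // Using the char index as a byte position points at the wrong
    // token; the real `#define` starts at byte 11.
    assert_eq!(&src[byte_offset + 1..], "#define");
}
```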
16 changes: 8 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -11,7 +11,7 @@ license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/cakevm/huff-neo"
rust-version = "1.89"
version = "1.5.11"
version = "1.5.12"

[workspace.dependencies]
huff-neo-codegen = { path = "crates/codegen" }
81 changes: 32 additions & 49 deletions crates/lexer/src/lib.rs
@@ -7,7 +7,6 @@ use huff_neo_utils::prelude::*;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::iter::Enumerate;
use std::{iter::Peekable, str::Chars};
use tracing::{debug, error};

@@ -44,9 +43,11 @@ const MAX_HEX_LITERAL_LENGTH: usize = 66;
pub struct Lexer<'a> {
/// The source code as peekable chars.
/// WARN: SHOULD NEVER BE MODIFIED!
pub chars: Peekable<Enumerate<Chars<'a>>>,
/// The current position in the source code.
pub chars: Peekable<Chars<'a>>,
/// The byte offset of the last consumed character.
position: usize,
/// The byte offset after the last consumed character (position of next char).
byte_offset: usize,
/// The previous lexed Token.
/// NOTE: Cannot be a whitespace.
pub lookback: Option<Token>,
@@ -63,25 +64,29 @@ pub type TokenResult = Result<Token, LexicalError>;
impl<'a> Lexer<'a> {
pub fn new(source: FullFileSource<'a>) -> Self {
Lexer {
chars: source.source.chars().enumerate().peekable(),
chars: source.source.chars().peekable(),
position: 0,
byte_offset: 0,
lookback: None,
eof: false,
context_stack: ContextStack::new(),
source,
}
}

/// Consumes the next character
/// Consumes the next character and updates byte position tracking.
/// After calling, `position` holds the byte offset of the consumed char,
/// and `byte_offset` holds the byte offset after it.
pub fn consume(&mut self) -> Option<char> {
let (index, c) = self.chars.next()?;
self.position = index;
let c = self.chars.next()?;
self.position = self.byte_offset;
self.byte_offset += c.len_utf8();
Some(c)
}

/// Try to peek at the next character from the source
pub fn peek(&mut self) -> Option<char> {
self.chars.peek().map(|(_, c)| *c)
self.chars.peek().copied()
}

fn next_token(&mut self) -> TokenResult {
@@ -133,7 +138,7 @@ impl<'a> Lexer<'a> {
}

Ok(TokenKind::Comment(comment_string)
.into_token_with_span(self.source.relative_span_by_pos(start, self.position + 1)))
.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
}
_ => self.single_char_token(TokenKind::Div),
}
@@ -160,13 +165,14 @@ impl<'a> Lexer<'a> {
if let Some(kind) = found_kind {
Ok(kind.into_token_with_span(self.source.relative_span_by_pos(start, end)))
} else if self.context_stack.top() == &Context::Global && self.peek().unwrap() == '[' {
Ok(TokenKind::Pound.into_token_with_span(self.source.relative_span_by_pos(self.position, self.position + 1)))
// Return Pound token for just the '#' character
Ok(TokenKind::Pound.into_token_with_span(self.source.relative_span_by_pos(start, start + 1)))
} else {
// Otherwise we don't support # prefixed identifiers
error!(target: "lexer", "INVALID '#' CHARACTER USAGE in context {:?}", self.context_stack.top());
return Err(LexicalError::new(
LexicalErrorKind::InvalidCharacter('#'),
self.source.relative_span_by_pos(self.position, self.position + 1),
self.source.relative_span_by_pos(start, end),
));
}
}
@@ -217,17 +223,14 @@ impl<'a> Lexer<'a> {
if new_context.is_some() && self.context_stack.top() != &Context::Global {
debug!(target: "lexer", "POP CONTEXT {:?}", self.context_stack.top());
self.context_stack.pop(1).map_err(|_| {
LexicalError::new(
LexicalErrorKind::StackUnderflow,
self.source.relative_span_by_pos(self.position, self.position + 1),
)
LexicalError::new(LexicalErrorKind::StackUnderflow, self.source.relative_span_by_pos(start, end))
})?;
}
// Verify that the context is correct
if new_context.is_some() && self.context_stack.top() != &Context::Global {
return Err(LexicalError::new(
LexicalErrorKind::UnexpectedContext(self.context_stack.top().clone()),
self.source.relative_span_by_pos(self.position, self.position + 1),
self.source.relative_span_by_pos(start, end),
));
}
// Push the new context
@@ -342,10 +345,8 @@ impl<'a> Lexer<'a> {
// Check if next char is also '=' for EqualEqual (==)
if self.peek() == Some('=') {
let start = self.position;
self.consume(); // consume first '='
self.consume(); // consume second '='
let end = self.position;
Ok(TokenKind::EqualEqual.into_token_with_span(self.source.relative_span_by_pos(start, end)))
Ok(TokenKind::EqualEqual.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
} else {
self.single_char_token(TokenKind::Assign)
}
@@ -354,10 +355,8 @@ impl<'a> Lexer<'a> {
// Check if next char is '=' for NotEqual (!=)
if self.peek() == Some('=') {
let start = self.position;
self.consume(); // consume '!'
self.consume(); // consume '='
let end = self.position;
Ok(TokenKind::NotEqual.into_token_with_span(self.source.relative_span_by_pos(start, end)))
Ok(TokenKind::NotEqual.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
} else {
// '!' by itself is logical NOT
self.single_char_token(TokenKind::Not)
@@ -394,7 +393,7 @@ impl<'a> Lexer<'a> {
self.context_stack.pop(1).map_err(|_| {
LexicalError::new(
LexicalErrorKind::StackUnderflow,
self.source.relative_span_by_pos(self.position, self.position + 1),
self.source.relative_span_by_pos(self.position, self.byte_offset),
)
})?
}
@@ -422,7 +421,7 @@ impl<'a> Lexer<'a> {
self.context_stack.pop(1).map_err(|_| {
LexicalError::new(
LexicalErrorKind::StackUnderflow,
self.source.relative_span_by_pos(self.position, self.position + 1),
self.source.relative_span_by_pos(self.position, self.byte_offset),
)
})?;
}
@@ -436,10 +435,8 @@ impl<'a> Lexer<'a> {
// Check if next char is '=' for LessEqual (<=)
if self.peek() == Some('=') {
let start = self.position;
self.consume(); // consume '<'
self.consume(); // consume '='
let end = self.position;
Ok(TokenKind::LessEqual.into_token_with_span(self.source.relative_span_by_pos(start, end)))
Ok(TokenKind::LessEqual.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
} else {
self.single_char_token(TokenKind::LeftAngle)
}
@@ -448,10 +445,8 @@ impl<'a> Lexer<'a> {
// Check if next char is '=' for GreaterEqual (>=)
if self.peek() == Some('=') {
let start = self.position;
self.consume(); // consume '>'
self.consume(); // consume '='
let end = self.position;
Ok(TokenKind::GreaterEqual.into_token_with_span(self.source.relative_span_by_pos(start, end)))
Ok(TokenKind::GreaterEqual.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
} else {
self.single_char_token(TokenKind::RightAngle)
}
@@ -461,15 +456,15 @@ impl<'a> Lexer<'a> {
'.' => {
// Check if next char is also '.' for DoubleDot (..)
if self.peek() == Some('.') {
let start = self.position;
self.consume(); // consume the second '.'
Ok(TokenKind::DoubleDot
.into_token_with_span(self.source.relative_span_by_pos(self.position - 1, self.position + 1)))
Ok(TokenKind::DoubleDot.into_token_with_span(self.source.relative_span_by_pos(start, self.byte_offset)))
} else {
// Single dot is not supported
error!(target: "lexer", "UNSUPPORTED TOKEN '.'");
return Err(LexicalError::new(
LexicalErrorKind::InvalidCharacter('.'),
self.source.relative_span_by_pos(self.position, self.position),
self.source.relative_span_by_pos(self.position, self.byte_offset),
));
}
}
@@ -487,7 +482,7 @@ impl<'a> Lexer<'a> {
error!(target: "lexer", "UNSUPPORTED TOKEN '{}'", ch);
return Err(LexicalError::new(
LexicalErrorKind::InvalidCharacter(ch),
self.source.relative_span_by_pos(self.position, self.position),
self.source.relative_span_by_pos(self.position, self.byte_offset),
));
}
}?;
@@ -578,7 +573,7 @@ impl<'a> Lexer<'a> {
}

fn single_char_token(&self, token_kind: TokenKind) -> TokenResult {
Ok(token_kind.into_token_with_span(self.source.relative_span_by_pos(self.position, self.position + 1)))
Ok(token_kind.into_token_with_span(self.source.relative_span_by_pos(self.position, self.byte_offset)))
}

/// Keeps consuming tokens as long as the predicate is satisfied
@@ -599,10 +594,7 @@ impl<'a> Lexer<'a> {
// cursor. If not, return word. The next character will be analyzed on the
// next iteration of next_token, which will increment the cursor.
if !predicate(peek_char) {
// Position currently points to the last consumed character
// We need to add its byte length to get the end position
let end_position = if word.is_empty() { self.position } else { self.position + word.chars().last().unwrap().len_utf8() };
return (word, start, end_position);
return (word, start, self.byte_offset);
}
word.push(peek_char);

@@ -611,16 +603,7 @@ impl<'a> Lexer<'a> {
self.consume();
}

// After consuming all characters, position points to the last consumed character
// We need to add the length of that last character to get the end position
let end_position = if word.is_empty() {
self.position
} else {
// Get the byte length of the last character and add it to position
self.position + word.chars().last().unwrap().len_utf8()
};

(word, start, end_position)
(word, start, self.byte_offset)
}

fn eat_digit(&mut self, initial_char: char) -> TokenResult {
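Taken together, the lexer changes drop the `Enumerate`-based char index and instead advance two byte counters with `len_utf8()` on every consumed character. A condensed, self-contained sketch of that pattern (the `Cursor` type and the driver are illustrative; only the field names and the consume/peek logic mirror the diff):

```rust
use std::{iter::Peekable, str::Chars};

/// Minimal model of the lexer's byte-position tracking.
struct Cursor<'a> {
    chars: Peekable<Chars<'a>>,
    /// Byte offset of the last consumed character.
    position: usize,
    /// Byte offset just past the last consumed character.
    byte_offset: usize,
}

impl<'a> Cursor<'a> {
    fn new(source: &'a str) -> Self {
        Cursor { chars: source.chars().peekable(), position: 0, byte_offset: 0 }
    }

    fn consume(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        self.position = self.byte_offset;
        self.byte_offset += c.len_utf8(); // 1..=4 bytes per char
        Some(c)
    }

    fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }
}

fn main() {
    let mut cur = Cursor::new("a→b");
    cur.consume(); // 'a': position 0, byte_offset 1
    cur.consume(); // '→': position 1, byte_offset 4
    cur.consume(); // 'b': position 4, byte_offset 5
    assert_eq!((cur.position, cur.byte_offset), (4, 5));
    // A span for the last token is `position..byte_offset`,
    // which always lands on valid UTF-8 boundaries.
}
```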
48 changes: 48 additions & 0 deletions crates/lexer/tests/comments.rs
@@ -141,3 +141,51 @@ fn multi_line_comments() {
assert!(lexer.eof);
assert_eq!(source.len() - 1, 47);
}

#[test]
fn single_line_comment_with_utf8_arrow() {
// Test that UTF-8 arrow character in comments doesn't break subsequent token spans
// Arrow (→) is 3 bytes (UTF-8: e2 86 92)
let source = "// a → b\n#define macro X() = takes(0) returns(0) {}";
let flattened_source = FullFileSource { source, file: None, spans: vec![] };
let mut lexer = Lexer::new(flattened_source);

// Comment: "// a → b" = 2 + 1 + 1 + 1 + 3 + 1 + 1 = 10 bytes (positions 0-10)
let tok = lexer.next();
let unwrapped = tok.unwrap().unwrap();
let comment_span = Span::new(0..10, None);
assert_eq!(unwrapped, Token::new(TokenKind::Comment("// a → b".to_string()), comment_span));

// The next token should be the newline character parsed as whitespace
let tok = lexer.next();
let unwrapped = tok.unwrap().unwrap();
let ws_span = Span::new(10..11, None);
assert_eq!(unwrapped, Token::new(TokenKind::Whitespace, ws_span));

// #define should start at byte 11, not byte 9 (verifies UTF-8 byte positions are correct)
let tok = lexer.next();
let unwrapped = tok.unwrap().unwrap();
let define_span = Span::new(11..18, None);
assert_eq!(unwrapped, Token::new(TokenKind::Define, define_span));
}

#[test]
fn multi_line_comment_with_utf8_arrow() {
// Test that UTF-8 arrow character in block comments doesn't break subsequent token spans
// Arrow (→) is 3 bytes (UTF-8: e2 86 92)
let source = "/* a → b */#define macro X() = takes(0) returns(0) {}";
let flattened_source = FullFileSource { source, file: None, spans: vec![] };
let mut lexer = Lexer::new(flattened_source);

// Comment: "/* a → b */" = 2 + 1 + 1 + 1 + 3 + 1 + 1 + 1 + 2 = 13 bytes (positions 0-13)
let tok = lexer.next();
let unwrapped = tok.unwrap().unwrap();
let comment_span = Span::new(0..13, None);
assert_eq!(unwrapped, Token::new(TokenKind::Comment("/* a → b */".to_string()), comment_span));

// #define should start at byte 13, not byte 11 (verifies UTF-8 byte positions are correct)
let tok = lexer.next();
let unwrapped = tok.unwrap().unwrap();
let define_span = Span::new(13..20, None);
assert_eq!(unwrapped, Token::new(TokenKind::Define, define_span));
}
2 changes: 2 additions & 0 deletions deny.toml
@@ -36,6 +36,8 @@ ignore = [
"RUSTSEC-2024-0436",
# number_prefix is unmaintained - https://rustsec.org/advisories/RUSTSEC-2025-0119
"RUSTSEC-2025-0119",
# unsoundness of safe `reciprocal_mg10` - https://rustsec.org/advisories/RUSTSEC-2025-0137
"RUSTSEC-2025-0137",
]
yanked = "warn"
