From cba8730812800ada0c2b6bf260d8334610a5d3b9 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Wed, 30 Apr 2025 23:29:24 +0100 Subject: [PATCH 1/4] Fix: parsing ident starting with underscore in certain dialects The dialects that support underscore as a separator in numeric literals used to parse ._123 as a number, meaning that an identifier like ._abc would be parsed as Number `._` and word `abc`, which is obv wrong. This PR splits the tokenizer branch for numbers and periods into two branches to make things easier, fixes the issue mentioned above and adds tests. --- src/tokenizer.rs | 141 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 35 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 4fad54624..8bf7ceddd 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } - // numbers and period - '0'..='9' | '.' => { + // Numbers + '0'..='9' => { // Some dialects support underscore as number separator // There can only be one at a time and it must be followed by another digit let is_number_separator = |ch: char, next_char: Option| { @@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> { && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) }; + // Start with number or potential separator let mut s = peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // match binary literal that starts with 0x + // Match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); let s2 = peeking_next_take_while(chars, |ch, next_ch| { @@ -1212,51 +1213,34 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::HexStringLiteral(s2))); } - // match one period + // Match fractional part after a dot if let Some('.') = chars.peek() { s.push('.'); chars.next(); } - // If the dialect supports identifiers that start with a numeric prefix - // and we have now consumed a dot, check if the previous token was a Word. - // If so, what follows is definitely not part of a decimal number and - // we should yield the dot as a dedicated token so compound identifiers - // starting with digits can be parsed correctly. - if s == "." && self.dialect.supports_numeric_prefix() { - if let Some(Token::Word(_)) = prev_token { - return Ok(Some(Token::Period)); - } - } - - // Consume fractional digits. + // Consume fractional digits s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No fraction -> Token::Period - if s == "." { - return Ok(Some(Token::Period)); - } - - // Parse exponent as number + // Parse exponent part (e.g., e+10 or E-5) let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); - exponent_part.push(char_clone.next().unwrap()); + exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E' // Optional sign - match char_clone.peek() { - Some(&c) if matches!(c, '+' | '-') => { + if let Some(&c) = char_clone.peek() { + if c == '+' || c == '-' { exponent_part.push(c); char_clone.next(); } - _ => (), } - match char_clone.peek() { - // Definitely an exponent, get original iterator up to speed and use it - Some(&c) if c.is_ascii_digit() => { + // Parse digits after the exponent + if let Some(&c) = char_clone.peek() { + if c.is_ascii_digit() { for _ in 0..exponent_part.len() { chars.next(); } @@ -1264,8 +1248,6 @@ impl<'a> Tokenizer<'a> { &peeking_take_while(chars, |ch| ch.is_ascii_digit()); s += exponent_part.as_str(); } - // Not an exponent, discard the work done - _ => (), } } @@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> { // be tokenized as a word. if self.dialect.supports_numeric_prefix() { if exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. + // Handle as potential word if no exponent part let word = peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); @@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::make_word(s.as_str(), None))); } } else if prev_token == Some(&Token::Period) { - // If the previous token was a period, thus not belonging to a number, - // the value we have is part of an identifier. + // Handle as word if it follows a period return Ok(Some(Token::make_word(s.as_str(), None))); } } + // Handle "L" suffix for long numbers let long = if chars.peek() == Some(&'L') { chars.next(); true } else { false }; + + // Return the final token for the number Ok(Some(Token::Number(s, long))) } + + // Period (`.`) handling + '.' => { + chars.next(); // consume the dot + + match chars.peek() { + Some('_') => { + // Handle "._" case as a period (special token) followed by identifier + Ok(Some(Token::Period)) + } + Some(ch) + // Hive and mysql dialects allow numeric prefixes for identifers + if ch.is_ascii_digit() + && self.dialect.supports_numeric_prefix() + && matches!(prev_token, Some(Token::Word(_))) => + { + Ok(Some(Token::Period)) + } + Some(ch) if ch.is_ascii_digit() => { + // Handle numbers starting with a dot (e.g., ".123") + let mut s = String::from("."); + let is_number_separator = |ch: char, next_char: Option| { + self.dialect.supports_numeric_literal_underscores() + && ch == '_' + && next_char.is_some_and(|c| c.is_ascii_digit()) + }; + + s += &peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); + + // Handle exponent part + if matches!(chars.peek(), Some('e' | 'E')) { + let mut exp = String::new(); + exp.push(chars.next().unwrap()); + + if matches!(chars.peek(), Some('+' | '-')) { + exp.push(chars.next().unwrap()); + } + + if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) { + exp += &peeking_take_while(chars, |c| c.is_ascii_digit()); + s += &exp; + } + } + + // Handle "L" suffix for long numbers + let long = if chars.peek() == Some(&'L') { + chars.next(); + true + } else { + false + }; + + Ok(Some(Token::Number(s, long))) + } + _ => { + // Just a plain period + Ok(Some(Token::Period)) + } + } + } // punctuation '(' => self.consume_and_return(chars, Token::LParen), ')' => self.consume_and_return(chars, Token::RParen), @@ -2435,6 +2480,32 @@ mod tests { compare(expected, tokens); } + #[test] + fn tokenize_period_underscore() { + let sql = String::from("SELECT table._col"); + // a dialect that supports underscores in numeric literals + let dialect = PostgreSqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "table".to_string(), + quote_style: None, + keyword: Keyword::TABLE, + }), + Token::Period, + Token::Word(Word { + value: "_col".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + ]; + + compare(expected, tokens); + } + #[test] fn tokenize_select_float() { let sql = String::from("SELECT .1"); From 5c5b5e19e7986493d694776c78b5aeb56579fa60 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Thu, 1 May 2025 21:42:57 +0100 Subject: [PATCH 2/4] Error on numeric literals and unqualified identifers starting with ._ --- src/tokenizer.rs | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 8bf7ceddd..7c4fbf9cc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1287,10 +1287,17 @@ impl<'a> Tokenizer<'a> { chars.next(); // consume the dot match chars.peek() { - Some('_') => { - // Handle "._" case as a period (special token) followed by identifier + // Handle "._" case as a period followed by identifier + // if the last token was a word + Some('_') if matches!(prev_token, Some(Token::Word(_))) => { Ok(Some(Token::Period)) } + Some('_') => { + self.tokenizer_error( + chars.location(), + "Unexpected an underscore here".to_string(), + ) + } Some(ch) // Hive and mysql dialects allow numeric prefixes for identifers if ch.is_ascii_digit() @@ -2504,6 +2511,16 @@ mod tests { ]; compare(expected, tokens); + + let sql = String::from("SELECT ._123"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + + let sql = String::from("SELECT ._abc"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } } #[test] From c1a32205e962e3c4c76220706695257f139b4e33 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Thu, 1 May 2025 21:46:33 +0100 Subject: [PATCH 3/4] Update error message --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 7c4fbf9cc..10e810fbc 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1295,7 +1295,7 @@ impl<'a> Tokenizer<'a> { Some('_') => { self.tokenizer_error( chars.location(), - "Unexpected an underscore here".to_string(), + "Unexpected underscore here".to_string(), ) } Some(ch) From f9a10c4f22792ae80a9652f7e77b31a72005d663 Mon Sep 17 00:00:00 2001 From: MohamedAbdeen21 Date: Fri, 9 May 2025 21:01:12 +0100 Subject: [PATCH 4/4] Re-merge dot and numeric tokenizer branches --- src/tokenizer.rs | 210 ++++++++++++++++++++--------------------------- 1 file changed, 87 insertions(+), 123 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 10e810fbc..afe1e35c7 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1189,8 +1189,24 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } - // Numbers - '0'..='9' => { + // numbers and period + '0'..='9' | '.' => { + // special case where if ._ is encountered after a word then that word + // is a table and the _ is the start of the col name. + // if the prev token is not a word, then this is not a valid sql + // word or number. + if ch == '.' && chars.peekable.clone().nth(1) == Some('_') { + if let Some(Token::Word(_)) = prev_token { + chars.next(); + return Ok(Some(Token::Period)); + } + + return self.tokenizer_error( + chars.location(), + "Unexpected character '_'".to_string(), + ); + } + // Some dialects support underscore as number separator // There can only be one at a time and it must be followed by another digit let is_number_separator = |ch: char, next_char: Option| { @@ -1199,12 +1215,11 @@ impl<'a> Tokenizer<'a> { && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) }; - // Start with number or potential separator let mut s = peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // Match binary literal that starts with 0x + // match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); let s2 = peeking_next_take_while(chars, |ch, next_ch| { @@ -1213,34 +1228,51 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::HexStringLiteral(s2))); } - // Match fractional part after a dot + // match one period if let Some('.') = chars.peek() { s.push('.'); chars.next(); } - // Consume fractional digits + // If the dialect supports identifiers that start with a numeric prefix + // and we have now consumed a dot, check if the previous token was a Word. + // If so, what follows is definitely not part of a decimal number and + // we should yield the dot as a dedicated token so compound identifiers + // starting with digits can be parsed correctly. + if s == "." && self.dialect.supports_numeric_prefix() { + if let Some(Token::Word(_)) = prev_token { + return Ok(Some(Token::Period)); + } + } + + // Consume fractional digits. s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // Parse exponent part (e.g., e+10 or E-5) + // No fraction -> Token::Period + if s == "." { + return Ok(Some(Token::Period)); + } + + // Parse exponent as number let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); - exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E' + exponent_part.push(char_clone.next().unwrap()); // Optional sign - if let Some(&c) = char_clone.peek() { - if c == '+' || c == '-' { + match char_clone.peek() { + Some(&c) if matches!(c, '+' | '-') => { exponent_part.push(c); char_clone.next(); } + _ => (), } - // Parse digits after the exponent - if let Some(&c) = char_clone.peek() { - if c.is_ascii_digit() { + match char_clone.peek() { + // Definitely an exponent, get original iterator up to speed and use it + Some(&c) if c.is_ascii_digit() => { for _ in 0..exponent_part.len() { chars.next(); } @@ -1248,6 +1280,8 @@ impl<'a> Tokenizer<'a> { &peeking_take_while(chars, |ch| ch.is_ascii_digit()); s += exponent_part.as_str(); } + // Not an exponent, discard the work done + _ => (), } } @@ -1256,7 +1290,8 @@ impl<'a> Tokenizer<'a> { // be tokenized as a word. if self.dialect.supports_numeric_prefix() { if exponent_part.is_empty() { - // Handle as potential word if no exponent part + // If it is not a number with an exponent, it may be + // an identifier starting with digits. let word = peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); @@ -1265,91 +1300,20 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::make_word(s.as_str(), None))); } } else if prev_token == Some(&Token::Period) { - // Handle as word if it follows a period + // If the previous token was a period, thus not belonging to a number, + // the value we have is part of an identifier. return Ok(Some(Token::make_word(s.as_str(), None))); } } - // Handle "L" suffix for long numbers let long = if chars.peek() == Some(&'L') { chars.next(); true } else { false }; - - // Return the final token for the number Ok(Some(Token::Number(s, long))) } - - // Period (`.`) handling - '.' => { - chars.next(); // consume the dot - - match chars.peek() { - // Handle "._" case as a period followed by identifier - // if the last token was a word - Some('_') if matches!(prev_token, Some(Token::Word(_))) => { - Ok(Some(Token::Period)) - } - Some('_') => { - self.tokenizer_error( - chars.location(), - "Unexpected underscore here".to_string(), - ) - } - Some(ch) - // Hive and mysql dialects allow numeric prefixes for identifers - if ch.is_ascii_digit() - && self.dialect.supports_numeric_prefix() - && matches!(prev_token, Some(Token::Word(_))) => - { - Ok(Some(Token::Period)) - } - Some(ch) if ch.is_ascii_digit() => { - // Handle numbers starting with a dot (e.g., ".123") - let mut s = String::from("."); - let is_number_separator = |ch: char, next_char: Option| { - self.dialect.supports_numeric_literal_underscores() - && ch == '_' - && next_char.is_some_and(|c| c.is_ascii_digit()) - }; - - s += &peeking_next_take_while(chars, |ch, next_ch| { - ch.is_ascii_digit() || is_number_separator(ch, next_ch) - }); - - // Handle exponent part - if matches!(chars.peek(), Some('e' | 'E')) { - let mut exp = String::new(); - exp.push(chars.next().unwrap()); - - if matches!(chars.peek(), Some('+' | '-')) { - exp.push(chars.next().unwrap()); - } - - if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) { - exp += &peeking_take_while(chars, |c| c.is_ascii_digit()); - s += &exp; - } - } - - // Handle "L" suffix for long numbers - let long = if chars.peek() == Some(&'L') { - chars.next(); - true - } else { - false - }; - - Ok(Some(Token::Number(s, long))) - } - _ => { - // Just a plain period - Ok(Some(Token::Period)) - } - } - } // punctuation '(' => self.consume_and_return(chars, Token::LParen), ')' => self.consume_and_return(chars, Token::RParen), @@ -2487,42 +2451,6 @@ mod tests { compare(expected, tokens); } - #[test] - fn tokenize_period_underscore() { - let sql = String::from("SELECT table._col"); - // a dialect that supports underscores in numeric literals - let dialect = PostgreSqlDialect {}; - let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); - - let expected = vec![ - Token::make_keyword("SELECT"), - Token::Whitespace(Whitespace::Space), - Token::Word(Word { - value: "table".to_string(), - quote_style: None, - keyword: Keyword::TABLE, - }), - Token::Period, - Token::Word(Word { - value: "_col".to_string(), - quote_style: None, - keyword: Keyword::NoKeyword, - }), - ]; - - compare(expected, tokens); - - let sql = String::from("SELECT ._123"); - if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { - panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); - } - - let sql = String::from("SELECT ._abc"); - if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { - panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); - } - } - #[test] fn tokenize_select_float() { let sql = String::from("SELECT .1"); @@ -4106,4 +4034,40 @@ mod tests { ], ); } + + #[test] + fn tokenize_period_underscore() { + let sql = String::from("SELECT table._col"); + // a dialect that supports underscores in numeric literals + let dialect = PostgreSqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "table".to_string(), + quote_style: None, + keyword: Keyword::TABLE, + }), + Token::Period, + Token::Word(Word { + value: "_col".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + ]; + + compare(expected, tokens); + + let sql = String::from("SELECT ._123"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + + let sql = String::from("SELECT ._abc"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + } }