Skip to content

Commit

Permalink
ascii fast path
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj committed Oct 27, 2023
1 parent d22e2c7 commit 58f99a5
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1284,15 +1284,23 @@ function lex_backtick(l::Lexer)
end

# Longest keyword, in characters — presumably used to bound keyword lookup
# during identifier lexing (TODO confirm against the keyword-matching code).
const MAX_KW_LENGTH = 10

# Byte-indexed lookup table for the ASCII fast path: entry `b + 1` records
# whether `Char(b)` is a valid identifier character, for each byte in 0x00:0x7f.
# Lets the hot loop test an ASCII byte with one array index instead of calling
# `is_identifier_char(Char(...))`.
const ascii_is_identifier_char::Vector{Bool} = Bool[is_identifier_char(Char(b)) for b in 0x00:0x7f]
function lex_identifier(l::Lexer, c)
h = simple_hash(c, UInt64(0))
n = 1
graphemestate = Ref(zero(Int32))
ascii = isascii(c)
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
graphemestate_peek = Ref(zero(Int32))
while true
pc, ppc = dpeekchar(l)
if Unicode.isgraphemebreak!(graphemestate, c, pc)
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
ascii = ascii && isascii(pc)
if ascii # fast path
pc_byte = pc % UInt8
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
break
end
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
break
end
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
Expand Down

0 comments on commit 58f99a5

Please sign in to comment.