Skip to content

Commit 58f99a5

Browse files
committed
ascii fast path
1 parent d22e2c7 commit 58f99a5

File tree

1 file changed

+11
-3
lines changed

1 file changed

+11
-3
lines changed

src/tokenize.jl

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1284,15 +1284,23 @@ function lex_backtick(l::Lexer)
12841284
end
12851285

12861286
# Presumably the length of the longest keyword the lexer has to recognise — TODO confirm against its uses.
const MAX_KW_LENGTH = 10
1287+
# Lookup table for the ASCII range: entry `b + 1` (1-based) holds
# `is_identifier_char(Char(b))` for every byte `b` in 0x00:0x7f, 128 entries in total.
const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b in 0x00:0x7f]
12871288
function lex_identifier(l::Lexer, c)
12881289
h = simple_hash(c, UInt64(0))
12891290
n = 1
1290-
graphemestate = Ref(zero(Int32))
1291+
ascii = isascii(c)
1292+
graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
12911293
graphemestate_peek = Ref(zero(Int32))
12921294
while true
12931295
pc, ppc = dpeekchar(l)
1294-
if Unicode.isgraphemebreak!(graphemestate, c, pc)
1295-
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
1296+
ascii = ascii && isascii(pc)
1297+
if ascii # fast path
1298+
pc_byte = pc % UInt8
1299+
@inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
1300+
break
1301+
end
1302+
elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
1303+
if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
12961304
break
12971305
end
12981306
elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters

0 commit comments

Comments
 (0)