diff --git a/src/tokenize.jl b/src/tokenize.jl
index 739a24c6..9c19c040 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1284,13 +1284,31 @@ function lex_backtick(l::Lexer)
 end
 
 const MAX_KW_LENGTH = 10
+const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b=0x00:0x7f]
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
+    ascii = isascii(c)
+    graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
+    graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
-            break
+        ascii = ascii && isascii(pc)
+        if ascii # fast path
+            pc_byte = pc % UInt8
+            @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
+                break
+            end
+        elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
+            if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
+                break
+            end
+        elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
+            # ZWJ/ZWNJ only within grapheme sequences, not at end
+            graphemestate_peek[] = graphemestate[]
+            if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
+                break
+            end
         end
         c = readchar(l)
         h = simple_hash(c, h)
diff --git a/test/diagnostics.jl b/test/diagnostics.jl
index ea2feb37..d7fd0b30 100644
--- a/test/diagnostics.jl
+++ b/test/diagnostics.jl
@@ -7,7 +7,7 @@ function diagnostic(str; only_first=false, allow_multiple=false, rule=:all, vers
         if !only_first
             @test length(stream.diagnostics) == 1
         end
-        return stream.diagnostics[1]
+        return isempty(stream.diagnostics) ? nothing : stream.diagnostics[1]
     end
 end
 
diff --git a/test/tokenize.jl b/test/tokenize.jl
index 07972c98..26ab044a 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,12 +44,14 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    str = "𝘋 =2β"
+    # FIXME: rm VERSION check once we implement our own is_identifier_char
+    emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
+    str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)
         kinds = [K"Identifier", K"Whitespace", K"=", K"Integer",
                  K"Identifier", K"EndMarker"]
-        token_strs = ["𝘋", " ", "=", "2", "β", ""]
+        token_strs = ["𝘋", " ", "=", "2", emoji, ""]
         for (i, n) in enumerate(l)
             @test kind(n) == kinds[i]
             @test untokenize(n, str) == token_strs[i]
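
Note on the lex_identifier change above (an illustrative sketch, not part of
the patch): the non-ASCII branch keeps consuming characters as long as
Unicode.isgraphemebreak! reports no grapheme break, which is what lets an
emoji ZWJ sequence such as 🏳️‍🌈 lex as a single Identifier token. A minimal
standalone check of that premise, assuming Julia >= 1.5 (older utf8proc does
not recognize the sequence, hence the VERSION guard in the test):

    # Sketch only. isgraphemebreak! is unexported; qualify it via Base.Unicode.
    flag = collect("\U1F3F3\UFE0F\U200D\U1F308")  # 🏳️‍🌈 as Chars: flag, VS16, ZWJ, rainbow
    state = Ref(Int32(0))                         # non-ASCII start state, as in lex_identifier
    breaks = [Base.Unicode.isgraphemebreak!(state, a, b) for (a, b) in zip(flag, flag[2:end])]
    @assert !any(breaks)  # no break at any step, so the lexer keeps consuming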