Skip to content

Commit

Permalink
Disallow unbalanced bidirectional unicode (#288)
Browse files Browse the repository at this point in the history
Disallow unbalanced Unicode bidirectional formatting directives within
strings and comments, to mitigate the "trojan source" vulnerability
https://www.trojansource.codes

See also JuliaLang/julia#42918
  • Loading branch information
c42f authored May 18, 2023
1 parent 6e3782f commit 4d2e561
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 49 deletions.
3 changes: 3 additions & 0 deletions src/kinds.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const _kind_names =
"ErrorInvalidUTF8"
"ErrorInvisibleChar"
"ErrorUnknownCharacter"
"ErrorBidiFormatting"
# Generic error
"error"
"END_ERRORS"
Expand Down Expand Up @@ -1049,6 +1050,7 @@ const _nonunique_kind_names = Set([
K"ErrorInvalidUTF8"
K"ErrorInvisibleChar"
K"ErrorUnknownCharacter"
K"ErrorBidiFormatting"
K"ErrorInvalidOperator"

K"Integer"
Expand Down Expand Up @@ -1098,6 +1100,7 @@ const _token_error_descriptions = Dict{Kind, String}(
K"ErrorInvalidUTF8"=>"invalid UTF-8 character",
K"ErrorInvisibleChar"=>"invisible character",
K"ErrorUnknownCharacter"=>"unknown unicode character",
K"ErrorBidiFormatting"=>"unbalanced bidirectional unicode formatting",
K"ErrorInvalidOperator" => "invalid operator",
K"Error**" => "use `x^y` instead of `x**y` for exponentiation, and `x...` instead of `**x` for splatting",
K"error" => "unknown error token",
Expand Down
2 changes: 2 additions & 0 deletions src/parse_stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,8 @@ function validate_tokens(stream::ParseStream)
# Emit messages for non-generic token errors
msg = if k in KSet"ErrorInvalidUTF8 ErrorInvisibleChar ErrorUnknownCharacter"
"$(_token_error_descriptions[k]) $(repr(text[fbyte]))"
elseif k == K"ErrorBidiFormatting"
"$(_token_error_descriptions[k]) $(repr(text[fbyte:prevind(text, nbyte)]))"
else
_token_error_descriptions[k]
end
Expand Down
5 changes: 5 additions & 0 deletions src/parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3282,6 +3282,9 @@ function parse_string(ps::ParseState, raw::Bool)
first_chunk = false
n_valid_chunks += 1
end
elseif k == K"ErrorInvalidInterpolationTerminator" || k == K"ErrorBidiFormatting"
# Treat these errors as string chunks
bump(ps)
else
break
end
Expand Down Expand Up @@ -3381,6 +3384,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
else
if k == K"Char"
bump(ps)
elseif is_error(k)
bump(ps)
else
# FIXME: This case is actually a tokenization error.
# Make a best-effort attempt to workaround this for now by
Expand Down
114 changes: 72 additions & 42 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module Tokenize

export tokenize, untokenize, Tokens

using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator
Expand Down Expand Up @@ -382,9 +382,6 @@ end

Returns the next character and increments the current position.
"""
function readchar end


function readchar(l::Lexer)
c = readchar(l.io)
l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
Expand Down Expand Up @@ -446,17 +443,6 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
return tok
end

"""
emit_error(l::Lexer, err::Kind)

Returns an `K"error"` token with error `err` and starts a new `RawToken`.
"""
function emit_error(l::Lexer, err::Kind)
@assert is_error(err)
return emit(l, err)
end


"""
next_token(l::Lexer)

Expand Down Expand Up @@ -551,20 +537,43 @@ function _next_token(l::Lexer, c)
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
return emit(l, k)
else
emit_error(l,
emit(l,
!isvalid(c) ? K"ErrorInvalidUTF8" :
is_invisible_char(c) ? K"ErrorInvisibleChar" :
K"ErrorUnknownCharacter")
end
end

# UAX #9: Unicode Bidirectional Algorithm
# https://unicode.org/reports/tr9/
# Deliberately partial implementation: we only track the nesting depth of the
# bidi embedding and isolate directives so that strings and multiline comments
# can be checked for balanced usage (trojan-source mitigation).
function update_bidi_state((embedding_nesting, isolate_nesting), c)
    if c == '\n'
        # A newline implicitly terminates any open directives: reset both depths.
        return (0, 0)
    elseif c in ('\U202A', '\U202B', '\U202D', '\U202E')
        # LRE / RLE / LRO / RLO push one embedding level.
        return (embedding_nesting + 1, isolate_nesting)
    elseif c == '\U202C'
        # PDF pops one embedding level.
        return (embedding_nesting - 1, isolate_nesting)
    elseif c in ('\U2066', '\U2067', '\U2068')
        # LRI / RLI / FSI push one isolate level.
        return (embedding_nesting, isolate_nesting + 1)
    elseif c == '\U2069'
        # PDI pops one isolate level.
        return (embedding_nesting, isolate_nesting - 1)
    else
        # Any other character leaves the bidi nesting state untouched.
        return (embedding_nesting, isolate_nesting)
    end
end

# We're inside a string; possibly reading the string characters, or maybe in
# Julia code within an interpolation.
function lex_string_chunk(l)
state = last(l.string_states)
if state.paren_depth > 0
# Read normal Julia code inside an interpolation but track nesting of
# parentheses.
# TODO: This stateful tracking should probably, somehow, be done by the
# parser instead? Especially for recovery of unbalanced parens inside
# interpolations?
c = readchar(l)
if c == '('
l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
Expand Down Expand Up @@ -598,7 +607,7 @@ function lex_string_chunk(l)
# Only allow certain characters after interpolated vars
# https://github.com/JuliaLang/julia/pull/25234
readchar(l)
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
return emit(l, K"ErrorInvalidInterpolationTerminator")
end
if pc == EOF_CHAR
return emit(l, K"EndMarker")
Expand Down Expand Up @@ -637,6 +646,8 @@ function lex_string_chunk(l)
end
end
# Read a chunk of string characters
init_bidi_state = (0,0)
bidi_state = init_bidi_state
if state.raw
# Raw strings treat all characters as literals with the exception that
# the closing quotes can be escaped with an odd number of \ characters.
Expand All @@ -647,7 +658,10 @@ function lex_string_chunk(l)
elseif state.triplestr && (pc == '\n' || pc == '\r')
# triple quoted newline splitting
readchar(l)
if pc == '\r' && peekchar(l) == '\n'
if pc == '\n'
bidi_state = init_bidi_state
elseif pc == '\r' && peekchar(l) == '\n'
bidi_state = init_bidi_state
readchar(l)
end
break
Expand All @@ -663,6 +677,7 @@ function lex_string_chunk(l)
readchar(l)
end
end
bidi_state = update_bidi_state(bidi_state, c)
end
else
while true
Expand All @@ -672,29 +687,39 @@ function lex_string_chunk(l)
elseif state.triplestr && (pc == '\n' || pc == '\r')
# triple quoted newline splitting
readchar(l)
if pc == '\r' && peekchar(l) == '\n'
if pc == '\n'
bidi_state = init_bidi_state
elseif pc == '\r' && peekchar(l) == '\n'
readchar(l)
bidi_state = init_bidi_state
end
break
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
break
elseif pc == '\\'
# Escaped newline
pc2 = dpeekchar(l)[2]
_, pc2, pc3 = peekchar3(l)
if pc2 == '\r' || pc2 == '\n'
if pc2 == '\n' || pc3 == '\n'
bidi_state = init_bidi_state
end
break
end
end
c = readchar(l)
if c == '\\'
c = readchar(l)
c == EOF_CHAR && break
continue
end
bidi_state = update_bidi_state(bidi_state, c)
end
end
return emit(l, state.delim == '"' ? K"String" :
state.delim == '`' ? K"CmdString" : K"Char")
outk = state.delim == '\'' ? K"Char" :
bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
state.delim == '"' ? K"String" :
state.delim == '`' ? K"CmdString" :
(@assert(state.delim in KSet"' \" `"); K"error")
return emit(l, outk)
end

# Lex whitespace, a whitespace char `c` has been consumed
Expand Down Expand Up @@ -725,13 +750,16 @@ function lex_comment(l::Lexer)
end
else
c = readchar(l) # consume the '='
init_bidi_state = (0,0)
bidi_state = init_bidi_state
skip = true # true => c was part of the prev comment marker pair
nesting = 1
while true
if c == EOF_CHAR
return emit_error(l, K"ErrorEofMultiComment")
return emit(l, K"ErrorEofMultiComment")
end
nc = readchar(l)
bidi_state = update_bidi_state(bidi_state, nc)
if skip
skip = false
else
Expand All @@ -742,7 +770,9 @@ function lex_comment(l::Lexer)
nesting -= 1
skip = true
if nesting == 0
return emit(l, K"Comment")
outk = bidi_state == init_bidi_state ?
K"Comment" : K"ErrorBidiFormatting"
return emit(l, outk)
end
end
end
Expand Down Expand Up @@ -791,12 +821,12 @@ function lex_less(l::Lexer)
elseif dpeekchar(l) == ('-', '-')
readchar(l); readchar(l)
if accept(l, '-')
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
if accept(l, '>')
return emit(l, K"<-->")
elseif accept(l, '-')
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
return emit(l, K"<--")
end
Expand Down Expand Up @@ -879,7 +909,7 @@ function lex_minus(l::Lexer)
if accept(l, '>')
return emit(l, K"-->")
else
return emit_error(l, K"ErrorInvalidOperator") # "--" is an invalid operator
return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator
end
elseif !l.dotop && accept(l, '>')
return emit(l, K"->")
Expand All @@ -891,7 +921,7 @@ end

function lex_star(l::Lexer)
if accept(l, '*')
return emit_error(l, K"Error**") # "**" is an invalid operator use ^
return emit(l, K"Error**") # "**" is an invalid operator use ^
elseif accept(l, '=')
return emit(l, K"*=")
end
Expand Down Expand Up @@ -952,15 +982,15 @@ function lex_digit(l::Lexer, kind)
elseif kind === K"Float"
# If we enter the function with kind == K"Float" then a '.' has been parsed.
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant")
return emit(l, K"ErrorInvalidNumericConstant")
elseif is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
return emit(l, K"ErrorAmbiguousNumericConstant") # `1.+`
end
readchar(l)

kind = K"Float"
accept(l, '_') && return emit_error(l, K"ErrorInvalidNumericConstant") # `1._`
accept(l, '_') && return emit(l, K"ErrorInvalidNumericConstant") # `1._`
had_fraction_digs = accept_number(l, isdigit)
pc, ppc = dpeekchar(l)
if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
Expand All @@ -971,18 +1001,18 @@ function lex_digit(l::Lexer, kind)
pc,ppc = dpeekchar(l)
if pc === '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1.e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
return emit(l, K"ErrorInvalidNumericConstant") # `1.e`
end
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1.1.`
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
pc == '(' || pc == '[' || pc == '{' ||
pc == '@' || pc == '`' || pc == '"')
return emit_error(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
return emit(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
end
elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
kind = pc == 'f' ? K"Float32" : K"Float"
Expand All @@ -992,10 +1022,10 @@ function lex_digit(l::Lexer, kind)
pc,ppc = dpeekchar(l)
if pc === '.' && !is_dottable_operator_start_char(ppc)
accept(l, '.')
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e+`
return emit(l, K"ErrorInvalidNumericConstant") # `1e+`
end
elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
kind == K"Integer"
Expand All @@ -1015,10 +1045,10 @@ function lex_digit(l::Lexer, kind)
kind = K"Float"
accept(l, "+-−")
if !accept_number(l, isdigit) || !had_digits
return emit_error(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
return emit(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
end
elseif isfloat
return emit_error(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
return emit(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
end
is_bin_oct_hex_int = !isfloat
elseif pc == 'b'
Expand All @@ -1038,7 +1068,7 @@ function lex_digit(l::Lexer, kind)
accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
# `0x` `0xg` `0x_` `0x-`
# `0b123` `0o78p` `0xenomorph` `0xaα`
return emit_error(l, K"ErrorInvalidNumericConstant")
return emit(l, K"ErrorInvalidNumericConstant")
end
end
end
Expand Down Expand Up @@ -1132,7 +1162,7 @@ function lex_dot(l::Lexer)
else
if is_dottable_operator_start_char(peekchar(l))
readchar(l)
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
return emit(l, K"..")
end
Expand Down
7 changes: 7 additions & 0 deletions test/diagnostics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ end
Diagnostic(2, 1+sizeof(string(c)), :error, "invisible character $(repr(c))")
end
@test diagnostic(":⥻") == Diagnostic(2, 4, :error, "unknown unicode character '⥻'")

@test diagnostic("\"X \u202a X\"") == Diagnostic(2, 8, :error, "unbalanced bidirectional unicode formatting \"X \\u202a X\"")
@test diagnostic("#= \u202a =#") == Diagnostic(1, 9, :error, "unbalanced bidirectional unicode formatting \"#= \\u202a =#\"")
@test diagnostic("\"X \u202a \$xx\u202c\"", allow_multiple=true) == [
Diagnostic(2, 7, :error, "unbalanced bidirectional unicode formatting \"X \\u202a \"")
Diagnostic(11, 13, :error, "unbalanced bidirectional unicode formatting \"\\u202c\"")
]
end

@testset "parser errors" begin
Expand Down
Loading

0 comments on commit 4d2e561

Please sign in to comment.