Skip to content

Commit 7b1231b

Browse files
committed
Reduce allocations in the lexer
- We flatten the representation of (lexer) tokens, so they are now mostly constants (e.g. `RETURN` rather than `Token Return`). - Also shuffle around some of the lexing functions to avoid some intermediate values. Honestly, this code is still pretty bad — it's responsible for 40% of our allocations. Have a couple more ideas on how to reduce node overhead.
1 parent b2a18fc commit 7b1231b

File tree

6 files changed

+292
-234
lines changed

6 files changed

+292
-234
lines changed

src/parser/illuaminateParser.ml

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3,53 +3,38 @@ module I = Grammar.MenhirInterpreter
33
module PE = Lrgrep_runtime.Interpreter (Parse_errors.Table_error_message) (I)
44
module Error = Error
55

6-
type 'a located =
7-
{ span : Span.t;
8-
start : Lexing.position;
9-
finish : Lexing.position;
10-
token : 'a
11-
}
6+
type located_token = Token.lexer_token * Lexing.position * Lexing.position
127

13-
let lex_one lines (lexbuf : Lexing.lexbuf) =
8+
let lex_one lines (lexbuf : Lexing.lexbuf) : located_token =
149
let start = lexbuf.lex_curr_p in
1510
let token = Lexer.token lines lexbuf in
1611
let finish = lexbuf.lex_curr_p in
17-
{ token; span = Span.of_pos2 lines start lexbuf.lex_curr_p; start; finish }
12+
(token, start, finish)
1813

19-
let lex_leading lines lexbuf =
20-
let rec go xs =
21-
match lex_one lines lexbuf with
22-
| { token = Trivial value; span; _ } -> go ({ Span.value; span } :: xs)
23-
| { token = Token token; _ } as rest -> (List.rev xs, { rest with token })
24-
in
25-
go []
14+
let rec lex_leading_worker lines (lexbuf : Lexing.lexbuf) xs =
15+
let start = lexbuf.lex_curr_p in
16+
match Lexer.token lines lexbuf with
17+
| TRIVIA value ->
18+
lex_leading_worker lines lexbuf
19+
({ Span.value; span = Span.of_pos2 lines start lexbuf.lex_curr_p } :: xs)
20+
| token -> (List.rev xs, (token, start, lexbuf.lex_curr_p))
2621

27-
let lex_trailing file lexbuf prev_line =
22+
let lex_leading lines lexbuf (token : located_token) =
23+
match token with
24+
| TRIVIA value, start, finish ->
25+
lex_leading_worker lines lexbuf [ { Span.value; span = Span.of_pos2 lines start finish } ]
26+
| tok -> ([], tok)
27+
28+
let lex_trailing lines (lexbuf : Lexing.lexbuf) prev_line =
2829
let rec go xs =
29-
match lex_one file lexbuf with
30-
| { token = Trivial value; span; start; _ } when start.pos_lnum = prev_line ->
31-
go ({ Span.value; span } :: xs)
32-
| t -> (List.rev xs, t)
30+
let start = lexbuf.lex_curr_p in
31+
match Lexer.token lines lexbuf with
32+
| TRIVIA value when start.pos_lnum = prev_line ->
33+
go ({ Span.value; span = Span.of_pos2 lines start lexbuf.lex_curr_p } :: xs)
34+
| t -> (List.rev xs, (t, start, lexbuf.lex_curr_p))
3335
in
3436
go []
3537

36-
let lex_token file lexbuf (next : Token.lexer_token located) =
37-
let leading, { token; span = tok_span; start; finish } =
38-
match next with
39-
| { token = Trivial value; span; _ } ->
40-
let leading, t = lex_leading file lexbuf in
41-
({ Span.value; span } :: leading, t)
42-
| { token = Token token; _ } as rest -> ([], { rest with token })
43-
in
44-
match token with
45-
| EoF ->
46-
(* Just return the current "next" token (we won't inspect it after all, and an EOF token with
47-
no trailing data. *)
48-
(Token.make_token leading [] tok_span token, start, finish, next)
49-
| _ ->
50-
let trailing, next = lex_trailing file lexbuf start.pos_lnum in
51-
(Token.make_token leading trailing tok_span token, start, finish, next)
52-
5338
let get_error_message token ~pre_env ~post_env : Error.message =
5439
match
5540
PE.run pre_env
@@ -68,18 +53,29 @@ let get_error_message token ~pre_env ~post_env : Error.message =
6853
let parse start (file : Illuaminate.File_id.t) (lexbuf : Lexing.lexbuf) =
6954
Span.Lines.using file lexbuf @@ fun lines ->
7055
let position_map = Span.Lines.position_map lines in
71-
let rec go env token token_start token_end next = function
56+
let rec go env token next = function
7257
| I.InputNeeded env as checkpoint -> go_input env checkpoint next
73-
| (I.Shifting _ | I.AboutToReduce _) as checkpoint ->
74-
I.resume checkpoint |> go env token token_start token_end next
58+
| (I.Shifting _ | I.AboutToReduce _) as checkpoint -> I.resume checkpoint |> go env token next
7559
| I.HandlingError post_env ->
76-
let message = get_error_message (token, token_start, token_end) ~pre_env:env ~post_env in
60+
let message = get_error_message token ~pre_env:env ~post_env in
7761
Error { Error.file; position_map; message }
7862
| I.Accepted x -> Ok x
7963
| I.Rejected -> assert false
8064
and go_input env checkpoint token =
81-
let token, start, finish, next = lex_token lines lexbuf token in
82-
I.offer checkpoint (token, start, finish) |> go env token start finish next
65+
let leading_trivia, ((token, start, finish) as lex_token) = lex_leading lines lexbuf token in
66+
let span = Span.of_pos2 lines start lexbuf.lex_curr_p in
67+
let token, next =
68+
match token with
69+
| EOF ->
70+
(* Just return the current "next" token (we won't inspect it after all, and an EOF token
71+
with no trailing data. *)
72+
( (Token.make_token ~leading_trivia ~trailing_trivia:[] ~span token, start, finish),
73+
lex_token )
74+
| _ ->
75+
let trailing_trivia, next = lex_trailing lines lexbuf start.pos_lnum in
76+
((Token.make_token ~leading_trivia ~trailing_trivia ~span token, start, finish), next)
77+
in
78+
I.offer checkpoint token |> go env token next
8379
in
8480
try
8581
match start Lexing.dummy_pos with
@@ -91,18 +87,26 @@ let program = parse Grammar.Incremental.program
9187
let repl_exprs = parse Grammar.Incremental.repl_exprs
9288

9389
module Lexer = struct
94-
type token = Token.lexer_token =
95-
| Token of IlluaminateCore.Token.t
90+
type token =
91+
| Token of string
9692
| Trivial of IlluaminateCore.Node.trivial
9793

9894
let lex (file : Illuaminate.File_id.t) (lexbuf : Lexing.lexbuf) =
9995
Span.Lines.using file lexbuf @@ fun lines ->
10096
try
10197
let rec go xs =
102-
let { token; span; _ } = lex_one lines lexbuf in
103-
let xs = { Span.value = token; span } :: xs in
98+
let token, start, finish = lex_one lines lexbuf in
99+
let span = Span.of_pos2 lines start finish in
100+
let value =
101+
match token with
102+
| TRIVIA t -> Trivial t
103+
| t ->
104+
Token
105+
(Token.make_token ~leading_trivia:[] ~trailing_trivia:[] ~span t |> Token.to_string)
106+
in
107+
let xs = { Span.value; span } :: xs in
104108
match token with
105-
| Token EoF -> xs
109+
| EOF -> xs
106110
| _ -> go xs
107111
in
108112
go [] |> List.rev |> Array.of_list |> Result.ok

src/parser/illuaminateParser.mli

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ module Error = Error
44
module Lexer : sig
55
(** The type of tokens produced by the lexer. *)
66
type token =
7-
| Token of IlluaminateCore.Token.t
7+
| Token of string
88
| Trivial of IlluaminateCore.Node.trivial
99

1010
(** Lex a file, producing a simple token stream. *)

src/parser/lexer.mll

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929
Buffer.add_string b str;
3030
b
3131

32-
let mk_long_comment c = Trivial (BlockComment c)
33-
let mk_long_string c = Token (String c)
32+
let mk_long_comment c = TRIVIA (BlockComment c)
33+
let mk_long_string c = STRING c
3434
}
3535

3636
let white = [' ' '\t']
@@ -44,85 +44,85 @@ let ident_head = ['a'-'z' 'A'-'Z' '_']
4444
let ident_tail = ident_head | '_' | digit
4545

4646
rule token l = parse
47-
| white+ as x { Trivial (Whitespace x) }
48-
| '\n' { new_line l; Trivial (Whitespace "\n") }
49-
| '\r' '\n' { new_line l; Trivial (Whitespace "\r\n") }
47+
| white+ as x { TRIVIA (Whitespace x) }
48+
| '\n' { new_line l; TRIVIA (Whitespace "\n") }
49+
| '\r' '\n' { new_line l; TRIVIA (Whitespace "\r\n") }
5050
| ("--[" '='* '[') as x { long_string (buffer_with' 16 x) (String.length x - 4) mk_long_comment l lexbuf }
5151
(* We split line comments into two parts. Otherwise "--[^\n]*" would match "--[[foo]]". *)
5252
| "--" { line_comment lexbuf }
5353

54-
| "and" { Token And }
55-
| "break" { Token Break }
56-
| "do" { Token Do }
57-
| "else" { Token Else }
58-
| "elseif" { Token ElseIf }
59-
| "end" { Token End }
60-
| "false" { Token False }
61-
| "for" { Token For }
62-
| "function" { Token Function }
63-
| "if" { Token If }
64-
| "in" { Token In }
65-
| "local" { Token Local }
66-
| "nil" { Token Nil }
67-
| "not" { Token Not }
68-
| "or" { Token Or }
69-
| "repeat" { Token Repeat }
70-
| "return" { Token Return }
71-
| "then" { Token Then }
72-
| "true" { Token True }
73-
| "until" { Token Until }
74-
| "while" { Token While }
75-
76-
| ":" { Token Colon }
77-
| "::" { Token Double_colon }
78-
| "," { Token Comma }
79-
| "." { Token Dot }
80-
| "..." { Token Dots }
81-
| "=" { Token Equals }
82-
| ";" { Token Semicolon }
83-
84-
| '(' { Token OParen } | ')' { Token CParen }
85-
| '{' { Token OBrace } | '}' { Token CBrace }
86-
| '[' { Token OSquare } | ']' { Token CSquare }
87-
88-
| '+' { Token Add }
89-
| '-' { Token Sub }
90-
| '*' { Token Mul }
91-
| '/' { Token Div }
92-
| '^' { Token Pow }
93-
| '%' { Token Mod }
94-
| ".." { Token Concat }
95-
| "==" { Token Eq }
96-
| "~=" { Token Ne }
97-
| "<" { Token Lt }
98-
| "<=" { Token Le }
99-
| ">" { Token Gt }
100-
| ">=" { Token Ge }
101-
| '#' { Token Len }
54+
| "and" { AND }
55+
| "break" { BREAK }
56+
| "do" { DO }
57+
| "else" { ELSE }
58+
| "elseif" { ELSEIF }
59+
| "end" { END }
60+
| "false" { FALSE }
61+
| "for" { FOR }
62+
| "function" { FUNCTION }
63+
| "if" { IF }
64+
| "in" { IN }
65+
| "local" { LOCAL }
66+
| "nil" { NIL }
67+
| "not" { NOT }
68+
| "or" { OR }
69+
| "repeat" { REPEAT }
70+
| "return" { RETURN }
71+
| "then" { THEN }
72+
| "true" { TRUE }
73+
| "until" { UNTIL }
74+
| "while" { WHILE }
75+
76+
| ":" { COLON }
77+
| "::" { DOUBLE_COLON }
78+
| "," { COMMA }
79+
| "." { DOT }
80+
| "..." { DOTS }
81+
| "=" { EQUALS }
82+
| ";" { SEMICOLON }
83+
84+
| '(' { OPAREN } | ')' { CPAREN }
85+
| '{' { OBRACE } | '}' { CBRACE }
86+
| '[' { OSQUARE } | ']' { CSQUARE }
87+
88+
| '+' { ADD }
89+
| '-' { SUB }
90+
| '*' { MUL }
91+
| '/' { DIV }
92+
| '^' { POW }
93+
| '%' { MOD }
94+
| ".." { CONCAT }
95+
| "==" { EQ }
96+
| "~=" { NE }
97+
| "<" { LT }
98+
| "<=" { LE }
99+
| ">" { GT }
100+
| ">=" { GE }
101+
| '#' { LEN }
102102

103103
(* Numbers *)
104-
| "0x" hex+ as i { Token (Number i) }
105-
| digit+ as i { Token (Number i) }
106-
| digit number* as i { Token (Number i) }
107-
| '.' digit number* as i { Token (Number i) }
104+
| "0x" hex+ as i { NUMBER i }
105+
| digit+ as i { NUMBER i }
106+
| digit number* as i { NUMBER i }
107+
| '.' digit number* as i { NUMBER i }
108108

109109
(* Identifiers *)
110-
| ident_head ident_tail* as i { Token (Ident i) }
110+
| ident_head ident_tail* as i { IDENT i }
111111

112112
| '\"' { string (buffer_with 17 '\"') '\"' lexbuf }
113113
| '\'' { string (buffer_with 17 '\'') '\'' lexbuf }
114114
| ('[' '='* '[') as x { long_string (buffer_with' 16 x) (String.length x - 2) mk_long_string l lexbuf }
115115

116-
| eof { Token EoF }
116+
| eof { EOF }
117117

118118
| _ { unexpected_character lexbuf }
119119

120120
and string contents c = parse
121121
| '\"' { Buffer.add_char contents '\"';
122-
if c = '\"' then Token (String (Buffer.contents contents))
122+
if c = '\"' then STRING (Buffer.contents contents)
123123
else string contents c lexbuf }
124124
| '\'' { Buffer.add_char contents '\'';
125-
if c = '\'' then Token (String (Buffer.contents contents))
125+
if c = '\'' then STRING (Buffer.contents contents)
126126
else string contents c lexbuf }
127127

128128
| "\\a" { Buffer.add_string contents "\\a"; string contents c lexbuf }
@@ -168,4 +168,4 @@ and long_string buf eqs term l = parse
168168
| eof { unterminated_string ~eol:false lexbuf }
169169

170170
and line_comment = parse
171-
| [^'\r' '\n']* as x { Trivial (LineComment ("--" ^ x)) }
171+
| [^'\r' '\n']* as x { TRIVIA (LineComment ("--" ^ x)) }

0 commit comments

Comments
 (0)