forked from carbon-language/carbon-lang
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlex.cpp
1617 lines (1418 loc) · 65.3 KB
/
lex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Part of the Carbon Language project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "toolchain/lex/lex.h"
#include <array>
#include <limits>
#include "common/check.h"
#include "common/variant_helpers.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "toolchain/base/shared_value_stores.h"
#include "toolchain/lex/character_set.h"
#include "toolchain/lex/helpers.h"
#include "toolchain/lex/numeric_literal.h"
#include "toolchain/lex/string_literal.h"
#include "toolchain/lex/token_index.h"
#include "toolchain/lex/token_kind.h"
#include "toolchain/lex/tokenized_buffer.h"
#if __ARM_NEON
#include <arm_neon.h>
#define CARBON_USE_SIMD 1
#elif __x86_64__
#include <x86intrin.h>
#define CARBON_USE_SIMD 1
#else
#define CARBON_USE_SIMD 0
#endif
namespace Carbon::Lex {
// Implementation of the lexer logic itself.
//
// The design is that lexing can loop over the source buffer, consuming it into
// tokens by calling into this API. This class handles the state and breaks down
// the different lexing steps that may be used. It directly updates the provided
// tokenized buffer with the lexed tokens.
//
// We'd typically put this in an anonymous namespace, but it is `friend`-ed by
// the `TokenizedBuffer`. One of the important benefits of being in an anonymous
// namespace is having internal linkage. That allows the optimizer to much more
// aggressively inline away functions that are called in only one place. We keep
// that benefit for now by using the `internal_linkage` attribute.
//
// TODO: Investigate ways to refactor the code that allow moving this into an
// anonymous namespace without overly exposing implementation details of the
// `TokenizedBuffer` or undermining the performance constraints of the lexer.
class [[clang::internal_linkage]] Lexer {
 public:
  using TokenInfo = TokenizedBuffer::TokenInfo;
  using LineInfo = TokenizedBuffer::LineInfo;

  // Symbolic result of a lexing action. This indicates whether we successfully
  // lexed a token, or whether other lexing actions should be attempted.
  //
  // While it wraps a simple boolean state, its API both helps make the failures
  // more self documenting, and by consuming the actual token constructively
  // when one is produced, it helps ensure the correct result is returned.
  class LexResult {
   public:
    // Consumes (and discard) a valid token to construct a result
    // indicating a token has been produced. Relies on implicit conversions.
    // NOLINTNEXTLINE(google-explicit-constructor)
    LexResult(TokenIndex /*discarded_token*/) : LexResult(true) {}

    // Returns a result indicating no token was produced.
    static auto NoMatch() -> LexResult { return LexResult(false); }

    // Tests whether a token was produced by the lexing routine, and
    // the lexer can continue forming tokens.
    explicit operator bool() const { return formed_token_; }

   private:
    explicit LexResult(bool formed_token) : formed_token_(formed_token) {}

    bool formed_token_;
  };

  // Constructs a lexer over `source`, building tokens into a fresh
  // `TokenizedBuffer` and routing diagnostics through `consumer`.
  Lexer(SharedValueStores& value_stores, SourceBuffer& source,
        DiagnosticConsumer& consumer)
      : buffer_(value_stores, source),
        consumer_(consumer),
        emitter_(&consumer_, &buffer_),
        token_emitter_(&consumer_, &buffer_) {}

  // Find all line endings and create the line data structures.
  //
  // Explicitly kept out-of-line because this is a significant loop that is
  // useful to have in the profile and it doesn't simplify by inlining at all.
  // But because it can, the compiler will flatten this otherwise.
  [[gnu::noinline]] auto MakeLines(llvm::StringRef source_text) -> void;

  // The line currently being lexed.
  auto current_line() -> LineIndex { return LineIndex(line_index_); }

  // Mutable info for the line currently being lexed.
  auto current_line_info() -> LineInfo* {
    return &buffer_.line_infos_[line_index_];
  }

  // The line after the one currently being lexed.
  auto next_line() -> LineIndex { return LineIndex(line_index_ + 1); }

  // Mutable info for the line after the current one; checks that such a line
  // actually exists in the pre-computed line table.
  auto next_line_info() -> LineInfo* {
    CARBON_CHECK(line_index_ + 1 <
                 static_cast<ssize_t>(buffer_.line_infos_.size()));
    return &buffer_.line_infos_[line_index_ + 1];
  }

  // Note when the lexer has encountered whitespace, and the next lexed token
  // should reflect that it was preceded by some amount of whitespace.
  auto NoteWhitespace() -> void { has_leading_space_ = true; }

  // Add a lexed token to the tokenized buffer, and reset any token-specific
  // state tracked in the lexer for the next token.
  auto AddLexedToken(TokenInfo info) -> TokenIndex {
    has_leading_space_ = false;
    return buffer_.AddToken(info);
  }

  // Lexes a token with no payload: builds the correctly encoded token info,
  // adds it to the tokenized buffer and returns the token index.
  auto LexToken(TokenKind kind, int32_t byte_offset) -> TokenIndex {
    // Check that we don't accidentally call this for one of the token kinds
    // that *always* has a payload up front.
    CARBON_DCHECK(!kind.IsOneOf(
        {TokenKind::Identifier, TokenKind::StringLiteral, TokenKind::IntLiteral,
         TokenKind::IntTypeLiteral, TokenKind::UnsignedIntTypeLiteral,
         TokenKind::FloatTypeLiteral, TokenKind::RealLiteral,
         TokenKind::Error}));
    return AddLexedToken(TokenInfo(kind, has_leading_space_, byte_offset));
  }

  // Lexes a token with a payload: builds the correctly encoded token info,
  // adds it to the tokenized buffer and returns the token index.
  auto LexTokenWithPayload(TokenKind kind, int token_payload,
                           int32_t byte_offset) -> TokenIndex {
    return AddLexedToken(
        TokenInfo(kind, has_leading_space_, token_payload, byte_offset));
  }

  // Whitespace, comment, and `/` handling. Each advances `position` past the
  // consumed text; the non-token forms produce no token.
  auto SkipHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
      -> void;
  auto LexHorizontalWhitespace(llvm::StringRef source_text, ssize_t& position)
      -> void;
  auto LexVerticalWhitespace(llvm::StringRef source_text, ssize_t& position)
      -> void;
  auto LexCR(llvm::StringRef source_text, ssize_t& position) -> void;
  auto LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
      -> void;
  auto LexComment(llvm::StringRef source_text, ssize_t& position) -> void;

  // Determines whether a real literal can be formed at the current location.
  // This is the case unless the preceding token is `.` or `->` and there is no
  // intervening whitespace.
  auto CanFormRealLiteral() -> bool;

  // Token-producing lexing routines. Each advances `position` and returns
  // whether a token was formed.
  auto LexNumericLiteral(llvm::StringRef source_text, ssize_t& position)
      -> LexResult;
  auto LexStringLiteral(llvm::StringRef source_text, ssize_t& position)
      -> LexResult;
  auto LexOneCharSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> TokenIndex;
  auto LexOpeningSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> LexResult;
  auto LexClosingSymbolToken(llvm::StringRef source_text, TokenKind kind,
                             ssize_t& position) -> LexResult;
  auto LexSymbolToken(llvm::StringRef source_text, ssize_t& position)
      -> LexResult;

  // Given a word that has already been lexed, determine whether it is a type
  // literal and if so form the corresponding token.
  auto LexWordAsTypeLiteralToken(llvm::StringRef word, int32_t byte_offset)
      -> LexResult;

  auto LexKeywordOrIdentifier(llvm::StringRef source_text, ssize_t& position)
      -> LexResult;
  auto LexHash(llvm::StringRef source_text, ssize_t& position) -> LexResult;
  auto LexError(llvm::StringRef source_text, ssize_t& position) -> LexResult;
  auto LexFileStart(llvm::StringRef source_text, ssize_t& position) -> void;
  auto LexFileEnd(llvm::StringRef source_text, ssize_t position) -> void;

  // Perform final checking and cleanup that should be done once we have
  // finished lexing the whole file, and before we consider the tokenized buffer
  // to be complete.
  auto Finalize() -> void;

  auto DiagnoseAndFixMismatchedBrackets() -> void;

  // The main entry point for dispatching through the lexer's table. This method
  // should always fully consume the source text.
  auto Lex() && -> TokenizedBuffer;

 private:
  class ErrorRecoveryBuffer;

  // The buffer being populated with lexed tokens and line data.
  TokenizedBuffer buffer_;

  // Index of the line currently being lexed within `buffer_.line_infos_`.
  ssize_t line_index_;

  // Tracks whether the lexer has encountered whitespace that will be leading
  // whitespace for the next lexed token. Reset after each token lexed.
  bool has_leading_space_ = false;

  // Stack of currently unmatched opening grouping tokens; presumably consumed
  // when matching closing tokens and by `DiagnoseAndFixMismatchedBrackets` --
  // TODO(review): confirm against the out-of-line definitions.
  llvm::SmallVector<TokenIndex> open_groups_;
  bool has_mismatched_brackets_ = false;

  // Diagnostic plumbing: error tracking plus emitters keyed by source pointer
  // and by token.
  ErrorTrackingDiagnosticConsumer consumer_;
  TokenizedBuffer::SourcePointerDiagnosticEmitter emitter_;
  TokenizedBuffer::TokenDiagnosticEmitter token_emitter_;
};
#if CARBON_USE_SIMD
namespace {
#if __ARM_NEON
using SimdMaskT = uint8x16_t;
#elif __x86_64__
using SimdMaskT = __m128i;
#else
#error "Unsupported SIMD architecture!"
#endif
using SimdMaskArrayT = std::array<SimdMaskT, sizeof(SimdMaskT) + 1>;
}  // namespace

// A table of masks to include 0-16 bytes of an SSE register. Entry `i` has its
// low `i` bytes set to all-ones and the rest zero; entry 0 is left fully zero
// by the aggregate initialization.
static constexpr SimdMaskArrayT PrefixMasks = []() constexpr {
  SimdMaskArrayT masks = {};
  for (int i = 1; i < static_cast<int>(masks.size()); ++i) {
    masks[i] =
        // The SIMD types and constexpr require a C-style cast.
        // NOLINTNEXTLINE(google-readability-casting)
        (SimdMaskT)(std::numeric_limits<unsigned __int128>::max() >>
                    ((sizeof(SimdMaskT) - i) * 8));
  }
  return masks;
}();
#endif  // CARBON_USE_SIMD
// Byte-classification table marking characters that can begin an identifier:
// the ASCII letters plus underscore. This is used by raw identifier detection.
static constexpr std::array<bool, 256> IsIdStartByteTable = [] {
  std::array<bool, 256> table = {};
  table['_'] = true;
  for (unsigned char lower = 'a'; lower <= 'z'; ++lower) {
    // Mark each lowercase letter and its uppercase counterpart together.
    table[lower] = true;
    table[lower - 'a' + 'A'] = true;
  }
  return table;
}();
// Byte-classification table for identifier (or keyword) continuation bytes:
// the identifier-start set extended with the ASCII digits. This is used in the
// generic, non-vectorized fallback code to scan for the length of an
// identifier.
static constexpr std::array<bool, 256> IsIdByteTable = [] {
  auto table = IsIdStartByteTable;
  for (int digit = '0'; digit <= '9'; ++digit) {
    table[digit] = true;
  }
  return table;
}();
// Baseline scalar version, also available for scalar-fallback in SIMD code.
// Uses `ssize_t` for performance when indexing in the loop.
//
// TODO: This assumes all Unicode characters are non-identifiers.
static auto ScanForIdentifierPrefixScalar(llvm::StringRef text, ssize_t i)
    -> llvm::StringRef {
  const ssize_t size = text.size();
  // Advance `i` until we hit the end of the text or a byte that is not a
  // valid identifier character.
  for (; i < size; ++i) {
    if (!IsIdByteTable[static_cast<unsigned char>(text[i])]) {
      break;
    }
  }
  return text.substr(0, i);
}
#if CARBON_USE_SIMD && __x86_64__
// The SIMD code paths uses a scheme derived from the techniques in Geoff
// Langdale and Daniel Lemire's work on parsing JSON[1]. Specifically, that
// paper outlines a technique of using two 4-bit indexed in-register look-up
// tables (LUTs) to classify bytes in a branchless SIMD code sequence.
//
// [1]: https://arxiv.org/pdf/1902.08318.pdf
//
// The goal is to get a bit mask classifying different sets of bytes. For each
// input byte, we first test for a high bit indicating a UTF-8 encoded Unicode
// character. Otherwise, we want the mask bits to be set with the following
// logic derived by inspecting the high nibble and low nibble of the input:
// bit0 = 1 for `_`: high `0x5` and low `0xF`
// bit1 = 1 for `0-9`: high `0x3` and low `0x0` - `0x9`
// bit2 = 1 for `A-O` and `a-o`: high `0x4` or `0x6` and low `0x1` - `0xF`
// bit3 = 1 for `P-Z` and 'p-z': high `0x5` or `0x7` and low `0x0` - `0xA`
// bit4 = unused
// bit5 = unused
// bit6 = unused
// bit7 = unused
//
// No bits set means definitively non-ID ASCII character.
//
// Bits 4-7 remain unused if we need to classify more characters.
namespace {
// Struct used to implement the nibble LUT for SIMD implementations.
//
// Forced to 16-byte alignment to ensure we can load it easily in SIMD code.
struct alignas(16) NibbleLUT {
  // Loads all 16 table bytes into a single SSE register, suitable for use as
  // the table operand of a byte-shuffle lookup.
  auto Load() const -> __m128i {
    return _mm_load_si128(reinterpret_cast<const __m128i*>(this));
  }

  // One output byte for each 4-bit nibble value `0x0` through `0xF`, in order,
  // so the struct's memory layout is exactly the 16-entry lookup table.
  uint8_t nibble_0;
  uint8_t nibble_1;
  uint8_t nibble_2;
  uint8_t nibble_3;
  uint8_t nibble_4;
  uint8_t nibble_5;
  uint8_t nibble_6;
  uint8_t nibble_7;
  uint8_t nibble_8;
  uint8_t nibble_9;
  uint8_t nibble_a;
  uint8_t nibble_b;
  uint8_t nibble_c;
  uint8_t nibble_d;
  uint8_t nibble_e;
  uint8_t nibble_f;
};
}  // namespace
// LUT indexed by the *high* nibble of an input byte, producing the
// classification bits described above. High nibbles `0x8`-`0xF` map to the
// high bit, which corresponds to bytes of UTF-8 encoded sequences; those are
// detected separately before the LUT results are used.
static constexpr NibbleLUT HighLUT = {
    .nibble_0 = 0b0000'0000,
    .nibble_1 = 0b0000'0000,
    .nibble_2 = 0b0000'0000,
    .nibble_3 = 0b0000'0010,
    .nibble_4 = 0b0000'0100,
    .nibble_5 = 0b0000'1001,
    .nibble_6 = 0b0000'0100,
    .nibble_7 = 0b0000'1000,
    .nibble_8 = 0b1000'0000,
    .nibble_9 = 0b1000'0000,
    .nibble_a = 0b1000'0000,
    .nibble_b = 0b1000'0000,
    .nibble_c = 0b1000'0000,
    .nibble_d = 0b1000'0000,
    .nibble_e = 0b1000'0000,
    .nibble_f = 0b1000'0000,
};
// LUT indexed by the *low* nibble of an input byte. The result is combined
// (bitwise AND) with the high-nibble result to form the final classification
// bits described above.
static constexpr NibbleLUT LowLUT = {
    .nibble_0 = 0b1000'1010,
    .nibble_1 = 0b1000'1110,
    .nibble_2 = 0b1000'1110,
    .nibble_3 = 0b1000'1110,
    .nibble_4 = 0b1000'1110,
    .nibble_5 = 0b1000'1110,
    .nibble_6 = 0b1000'1110,
    .nibble_7 = 0b1000'1110,
    .nibble_8 = 0b1000'1110,
    .nibble_9 = 0b1000'1110,
    .nibble_a = 0b1000'1100,
    .nibble_b = 0b1000'0100,
    .nibble_c = 0b1000'0100,
    .nibble_d = 0b1000'0100,
    .nibble_e = 0b1000'0100,
    .nibble_f = 0b1000'0101,
};
// Vectorized identifier scan for x86: classifies 16 input bytes at a time
// using the nibble LUTs above, and falls back to the scalar loop both for any
// chunk containing non-ASCII (UTF-8) bytes and for the final partial chunk.
static auto ScanForIdentifierPrefixX86(llvm::StringRef text)
    -> llvm::StringRef {
  const auto high_lut = HighLUT.Load();
  const auto low_lut = LowLUT.Load();

  // Use `ssize_t` for performance here as we index memory in a tight loop.
  ssize_t i = 0;
  const ssize_t size = text.size();
  while ((i + 16) <= size) {
    __m128i input =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(text.data() + i));

    // The high bits of each byte indicate a non-ASCII character encoded using
    // UTF-8. Test those and fall back to the scalar code if present. These
    // bytes will also cause spurious zeros in the LUT results, but we can
    // ignore that because we track them independently here.
#if __SSE4_1__
    if (!_mm_test_all_zeros(_mm_set1_epi8(0x80), input)) {
      break;
    }
#else
    if (_mm_movemask_epi8(input) != 0) {
      break;
    }
#endif

    // Do two LUT lookups and mask the results together to get the results for
    // both low and high nibbles. Note that we don't need to mask out the high
    // bit of input here because we track that above for UTF-8 handling.
    __m128i low_mask = _mm_shuffle_epi8(low_lut, input);
    // Note that the input needs to be masked to only include the high nibble or
    // we could end up with bit7 set forcing the result to a zero byte.
    __m128i input_high =
        _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0f));
    __m128i high_mask = _mm_shuffle_epi8(high_lut, input_high);
    __m128i mask = _mm_and_si128(low_mask, high_mask);

    // Now compare to find the completely zero bytes. Set bytes in the result
    // mark *non*-identifier input bytes (their classification was zero).
    __m128i id_byte_mask_vec = _mm_cmpeq_epi8(mask, _mm_setzero_si128());
    int tail_ascii_mask = _mm_movemask_epi8(id_byte_mask_vec);

    // Check if there are bits in the tail mask, which means zero bytes and the
    // end of the identifier. We could do this without materializing the scalar
    // mask on more recent CPUs, but we generally expect the median length we
    // encounter to be <16 characters and so we avoid the extra instruction in
    // that case and predict this branch to succeed so it is laid out in a
    // reasonable way.
    if (LLVM_LIKELY(tail_ascii_mask != 0)) {
      // Move past the definitively classified bytes that are part of the
      // identifier, and return the complete identifier text.
      i += __builtin_ctz(tail_ascii_mask);
      return text.substr(0, i);
    }
    i += 16;
  }
  return ScanForIdentifierPrefixScalar(text, i);
}
#endif // CARBON_USE_SIMD && __x86_64__
// Scans the provided text and returns the prefix `StringRef` of contiguous
// identifier characters.
//
// This is a performance sensitive function and where profitable uses vectorized
// code sequences to optimize its scanning. When modifying, the identifier
// lexing benchmarks should be checked for regressions.
//
// Identifier characters here are currently the ASCII characters `[0-9A-Za-z_]`.
//
// TODO: Currently, this code does not implement Carbon's design for Unicode
// characters in identifiers. It does work on UTF-8 code unit sequences, but
// currently considers non-ASCII characters to be non-identifier characters.
// Some work has been done to ensure the hot loop, while optimized, retains
// enough information to add Unicode handling without completely destroying the
// relevant optimizations.
static auto ScanForIdentifierPrefix(llvm::StringRef text) -> llvm::StringRef {
  // Dispatch to an optimized architecture optimized routine.
#if CARBON_USE_SIMD && __x86_64__
  return ScanForIdentifierPrefixX86(text);
#elif CARBON_USE_SIMD && __ARM_NEON
  // Somewhat surprisingly, there is basically nothing worth doing in SIMD on
  // Arm to optimize this scan. The Neon SIMD operations end up requiring you to
  // move from the SIMD unit to the scalar unit in the critical path of finding
  // the offset of the end of an identifier. Current ARM cores make the code
  // sequences here (quite) unpleasant. For example, on Apple M1 and similar
  // cores, the latency is as much as 10 cycles just to extract from the vector.
  // SIMD might be more interesting on Neoverse cores, but it'd be nice to avoid
  // core-specific tunings at this point.
  //
  // If this proves problematic and critical to optimize, the current leading
  // theory is to have the newline searching code also create a bitmask for the
  // entire source file of identifier and non-identifier bytes, and then use the
  // bit-counting instructions here to do a fast scan of that bitmask. However,
  // crossing that bridge will add substantial complexity to the newline
  // scanner, and so currently we just use a boring scalar loop that pipelines
  // well.
  //
  // (This branch intentionally falls through to the scalar loop below.)
#endif
  return ScanForIdentifierPrefixScalar(text, 0);
}
// The signature shared by every lexer dispatch function, so that all of them
// can be stored in one homogeneous 256-entry dispatch table.
using DispatchFunctionT = auto(Lexer& lexer, llvm::StringRef source_text,
                               ssize_t position) -> void;
using DispatchTableT = std::array<DispatchFunctionT*, 256>;

// Maps the first byte of each one-character symbol token spelling (including
// opening and closing grouping symbols) to its token kind. Bytes not covered
// by any spelling are left value-initialized.
static constexpr std::array<TokenKind, 256> OneCharTokenKindTable = [] {
  std::array<TokenKind, 256> table = {};
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
  table[(Spelling)[0]] = TokenKind::TokenName;
#include "toolchain/lex/token_kind.def"
  return table;
}();
// We use a collection of static member functions for table-based dispatch to
// lexer methods. These are named static member functions so that they show up
// helpfully in profiles and backtraces, but they tend to not contain the
// interesting logic and simply delegate to the relevant methods. All of their
// signatures need to be exactly the same however in order to ensure we can
// build efficient dispatch tables out of them. All of them end by doing a
// must-tail return call to this routine. It handles continuing the dispatch
// chain.
static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
                         ssize_t position) -> void;

// Define a set of dispatch functions that simply forward to a method that
// lexes a token. This includes validating that an actual token was produced,
// and continuing the dispatch.
#define CARBON_DISPATCH_LEX_TOKEN(LexMethod)                                 \
  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
                                  ssize_t position) -> void {                \
    Lexer::LexResult result = lexer.LexMethod(source_text, position);        \
    CARBON_CHECK(result, "Failed to form a token!");                         \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
// Each instantiation defines a `Dispatch<Method>` wrapper for the
// correspondingly named `Lexer` method.
CARBON_DISPATCH_LEX_TOKEN(LexError)
CARBON_DISPATCH_LEX_TOKEN(LexSymbolToken)
CARBON_DISPATCH_LEX_TOKEN(LexKeywordOrIdentifier)
CARBON_DISPATCH_LEX_TOKEN(LexHash)
CARBON_DISPATCH_LEX_TOKEN(LexNumericLiteral)
CARBON_DISPATCH_LEX_TOKEN(LexStringLiteral)
// A set of custom dispatch functions that pre-select the symbol token to lex.
// The token kind is looked up in `OneCharTokenKindTable` from the byte at the
// current position before calling the lexer method.
#define CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexMethod)                          \
  static auto Dispatch##LexMethod##SymbolToken(                              \
      Lexer& lexer, llvm::StringRef source_text, ssize_t position) -> void { \
    Lexer::LexResult result = lexer.LexMethod##SymbolToken(                  \
        source_text,                                                         \
        OneCharTokenKindTable[static_cast<unsigned char>(                    \
            source_text[position])],                                         \
        position);                                                           \
    CARBON_CHECK(result, "Failed to form a token!");                         \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOneChar)
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexOpening)
CARBON_DISPATCH_LEX_SYMBOL_TOKEN(LexClosing)
// Define a set of non-token dispatch functions that handle things like
// whitespace and comments. These forward to `void`-returning lexer methods,
// so unlike the token dispatchers there is no result to validate.
#define CARBON_DISPATCH_LEX_NON_TOKEN(LexMethod)                             \
  static auto Dispatch##LexMethod(Lexer& lexer, llvm::StringRef source_text, \
                                  ssize_t position) -> void {                \
    lexer.LexMethod(source_text, position);                                  \
    [[clang::musttail]] return DispatchNext(lexer, source_text, position);   \
  }
CARBON_DISPATCH_LEX_NON_TOKEN(LexHorizontalWhitespace)
CARBON_DISPATCH_LEX_NON_TOKEN(LexVerticalWhitespace)
CARBON_DISPATCH_LEX_NON_TOKEN(LexCR)
CARBON_DISPATCH_LEX_NON_TOKEN(LexCommentOrSlash)
// Build a table of function pointers that we can use to dispatch to the
// correct lexer routine based on the first byte of source text.
//
// While it is tempting to simply use a `switch` on the first byte and
// dispatch with cases into this, in practice that doesn't produce great code.
// There seem to be two issues that are the root cause.
//
// First, there are lots of different values of bytes that dispatch to a
// fairly small set of routines, and then some byte values that dispatch
// differently for each byte. This pattern isn't one that the compiler-based
// lowering of switches works well with -- it tries to balance all the cases,
// and in doing so emits several compares and other control flow rather than a
// simple jump table.
//
// Second, with a `case`, it isn't as obvious how to create a single, uniform
// interface that is effective for *every* byte value, and thus makes for a
// single consistent table-based dispatch. By forcing these to be function
// pointers, we also coerce the code to use a strictly homogeneous structure
// that can form a single dispatch table.
//
// These two actually interact -- the second issue is part of what makes the
// non-table lowering in the first one desirable for many switches and cases.
//
// Ultimately, when table-based dispatch is such an important technique, we
// get better results by taking full control and manually creating the
// dispatch structures.
//
// The functions in this table also use tail-recursion to implement the loop
// of the lexer. This is based on the technique described more fully for any
// kind of byte-stream loop structure here:
// https://blog.reverberate.org/2021/04/21/musttail-efficient-interpreters.html
// Builds the 256-entry table mapping each possible first byte of source text
// to the dispatch function that handles it. See the rationale comment above.
static constexpr auto MakeDispatchTable() -> DispatchTableT {
  DispatchTableT table = {};
  // First set the table entries to dispatch to our error token handler as the
  // base case. Everything valid comes from an override below.
  for (int i = 0; i < 256; ++i) {
    table[i] = &DispatchLexError;
  }

  // Symbols have some special dispatching. First, set the first character of
  // each symbol token spelling to dispatch to the symbol lexer. We don't
  // provide a pre-computed token here, so the symbol lexer will compute the
  // exact symbol token kind. We'll override this with more specific dispatch
  // below.
#define CARBON_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = &DispatchLexSymbolToken;
#include "toolchain/lex/token_kind.def"

  // Now special cased single-character symbols that are guaranteed to not
  // join with another symbol. These are grouping symbols, terminators,
  // or separators in the grammar and have a good reason to be
  // orthogonal to any other punctuation. We do this separately because this
  // needs to override some of the generic handling above, and provide a
  // custom token.
#define CARBON_ONE_CHAR_SYMBOL_TOKEN(TokenName, Spelling) \
  table[(Spelling)[0]] = &DispatchLexOneCharSymbolToken;
#define CARBON_OPENING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, ClosingName) \
  table[(Spelling)[0]] = &DispatchLexOpeningSymbolToken;
#define CARBON_CLOSING_GROUP_SYMBOL_TOKEN(TokenName, Spelling, OpeningName) \
  table[(Spelling)[0]] = &DispatchLexClosingSymbolToken;
#include "toolchain/lex/token_kind.def"

  // Override the handling for `/` to consider comments as well as a `/`
  // symbol.
  table['/'] = &DispatchLexCommentOrSlash;

  table['_'] = &DispatchLexKeywordOrIdentifier;
  // Note that we don't use `llvm::seq` because this needs to be `constexpr`
  // evaluated.
  for (unsigned char c = 'a'; c <= 'z'; ++c) {
    table[c] = &DispatchLexKeywordOrIdentifier;
  }
  for (unsigned char c = 'A'; c <= 'Z'; ++c) {
    table[c] = &DispatchLexKeywordOrIdentifier;
  }
  // We dispatch all non-ASCII UTF-8 characters to the identifier lexing
  // as whitespace characters should already have been skipped and the
  // only remaining valid Unicode characters would be part of an
  // identifier. That code can either accept or reject.
  for (int i = 0x80; i < 0x100; ++i) {
    table[i] = &DispatchLexKeywordOrIdentifier;
  }

  for (unsigned char c = '0'; c <= '9'; ++c) {
    table[c] = &DispatchLexNumericLiteral;
  }

  table['\''] = &DispatchLexStringLiteral;
  table['"'] = &DispatchLexStringLiteral;
  table['#'] = &DispatchLexHash;

  table[' '] = &DispatchLexHorizontalWhitespace;
  table['\t'] = &DispatchLexHorizontalWhitespace;
  table['\n'] = &DispatchLexVerticalWhitespace;
  table['\r'] = &DispatchLexCR;

  return table;
}

// The single, constexpr-built dispatch table used by `DispatchNext`.
static constexpr DispatchTableT DispatchTable = MakeDispatchTable();
// Continues the dispatch chain: tail-calls the handler for the byte at
// `position`, or finishes the file when the source text is exhausted.
static auto DispatchNext(Lexer& lexer, llvm::StringRef source_text,
                         ssize_t position) -> void {
  if (LLVM_LIKELY(position < static_cast<ssize_t>(source_text.size()))) {
    // The common case is to tail recurse based on the next character. Note
    // that because this is a must-tail return, this cannot fail to tail-call
    // and will not grow the stack. This is in essence a loop with dynamic
    // tail dispatch to the next stage of the loop.
    // NOLINTNEXTLINE(readability-avoid-return-with-void-value): For musttail.
    [[clang::musttail]] return DispatchTable[static_cast<unsigned char>(
        source_text[position])](lexer, source_text, position);
  }

  // When we finish the source text, stop recursing. We also hint this so that
  // the tail-dispatch is optimized as that's essentially the loop back-edge
  // and this is the loop exit.
  lexer.LexFileEnd(source_text, position);
}
// Estimate an upper bound on the number of identifiers we will need to lex.
//
// When analyzing both Carbon and LLVM's C++ code, we have found a roughly
// normal distribution of unique identifiers in the file centered at 0.5 *
// lines, and in the vast majority of cases bounded below 1.0 * lines. For
// example, here is LLVM's distribution computed with `scripts/source_stats.py`
// and rendered in an ASCII-art histogram:
//
// ## Unique IDs per 10 lines ## (median: 5, p90: 8, p95: 9, p99: 14)
// 1 ids [ 29] ▍
// 2 ids [ 282] ███▊
// 3 ids [1492] ███████████████████▉
// 4 ids [2674] ███████████████████████████████████▌
// 5 ids [3011] ████████████████████████████████████████
// 6 ids [2267] ██████████████████████████████▏
// 7 ids [1549] ████████████████████▋
// 8 ids [ 817] ██████████▉
// 9 ids [ 301] ████
// 10 ids [ 98] █▎
//
// (Trimmed to only cover 1 - 10 unique IDs per 10 lines of code, 272 files
// with more unique IDs in the tail.)
//
// We have checked this distribution with several large codebases (currently
// those at Google, happy to cross check with others) that use a similar coding
// style, and it appears to be very consistent. However, we suspect it may be
// dependent on the column width style. Currently, Carbon's toolchain style
// specifies 80-columns, but if we expect the lexer to routinely see files in
// different styles we should re-compute this estimate.
static auto EstimateUpperBoundOnNumIdentifiers(int line_count) -> int {
  // Per the distribution analysis above, the number of unique identifiers is
  // almost always bounded by one per line of source, so use the line count
  // directly as the estimate.
  return line_count;
}
// Top-level lexing driver: scans lines, reserves storage, runs the dispatch
// loop over the whole source text, and returns the finished buffer.
auto Lexer::Lex() && -> TokenizedBuffer {
  llvm::StringRef source_text = buffer_.source_->text();

  // Enforced by the source buffer, but something we heavily rely on throughout
  // the lexer.
  CARBON_CHECK(source_text.size() < std::numeric_limits<int32_t>::max());

  // First build up our line data structures.
  MakeLines(source_text);

  // Use the line count (and any other info needed from this scan) to make rough
  // estimated reservations of memory in the hot data structures used by the
  // lexer. In practice, scanning for lines is one of the easiest parts of the
  // lexer to accelerate, and we can use its results to minimize the cost of
  // incrementally growing data structures during the hot path of the lexer.
  //
  // Note that for hashtables we want estimates near the upper bound to minimize
  // growth across the vast majority of inputs. They will also typically reserve
  // more memory than we request due to load factor and rounding to power-of-two
  // size. This overshoot is usually fine for hot parts of the lexer where
  // latency is expected to be more important than minimizing memory usage.
  buffer_.value_stores_->identifiers().Reserve(
      EstimateUpperBoundOnNumIdentifiers(buffer_.line_infos_.size()));

  ssize_t position = 0;
  LexFileStart(source_text, position);

  // Manually enter the dispatch loop. This call will tail-recurse through the
  // dispatch table until everything from source_text is consumed.
  DispatchNext(*this, source_text, position);

  Finalize();

  // Propagate any diagnosed errors into the buffer's error flag.
  if (consumer_.seen_error()) {
    buffer_.has_errors_ = true;
  }

  return std::move(buffer_);
}
auto Lexer::MakeLines(llvm::StringRef source_text) -> void {
  // Scan for newline bytes with `memchr`, which is typically vectorized and
  // substantially faster than byte-wise scanning. The local variables and the
  // `ssize_t` type are chosen deliberately for the performance and code size
  // of this hot loop.
  //
  // This scan handles both LF and CR+LF line endings equally well: either way
  // the LF marks the line boundary, and the CR lexing path elsewhere peeks at
  // the following LF to step to the next line correctly. Plain CR, LF+CR,
  // vertical tab, and other vertical whitespace are *not* supported as line
  // endings here.
  //
  // TODO: Eventually, we should extend this to have correct fallback support
  // for handling CR, LF+CR, vertical tab, and other esoteric vertical
  // whitespace as line endings, notably including *mixtures* of them. That is
  // likely to be tricky, as even detecting their absence without performance
  // overhead and without replacing `memchr` with a custom scanner is hard.
  const char* const data = source_text.data();
  const ssize_t text_size = source_text.size();

  ssize_t line_start = 0;
  for (;;) {
    const void* newline_ptr =
        memchr(&data[line_start], '\n', text_size - line_start);
    if (newline_ptr == nullptr) {
      break;
    }
    buffer_.AddLine(TokenizedBuffer::LineInfo(line_start));
    line_start = (static_cast<const char*>(newline_ptr) - data) + 1;
  }

  // The final line runs to the end of the file.
  buffer_.AddLine(TokenizedBuffer::LineInfo(line_start));

  // When the file ends with an unterminated line, append a synthetic empty
  // line at EOF. This guarantees the lexer can always advance to a "next"
  // line and never needs a special case for sitting on the last one.
  if (line_start != text_size) {
    buffer_.AddLine(TokenizedBuffer::LineInfo(text_size));
  }

  // All the line infos are allocated now; reset to the first line for lexing.
  line_index_ = 0;
}
auto Lexer::SkipHorizontalWhitespace(llvm::StringRef source_text,
                                     ssize_t& position) -> void {
  // Consume an adjacent run of spaces and tabs in one scalar loop. Runs like
  // this are common (indentation, alignment) but rarely huge, so a simple
  // loop beats repeatedly re-entering table dispatch and re-marking
  // whitespace for every byte.
  const ssize_t size = source_text.size();
  while (position < size) {
    char c = source_text[position];
    if (c != ' ' && c != '\t') {
      break;
    }
    ++position;
  }
}
auto Lexer::LexHorizontalWhitespace(llvm::StringRef source_text,
                                    ssize_t& position) -> void {
  // Dispatch only routes spaces and tabs here.
  const char current = source_text[position];
  CARBON_DCHECK(current == ' ' || current == '\t');
  NoteWhitespace();
  // Hand the whole run to the optimized skipping routine rather than
  // dispatching once per whitespace byte.
  SkipHorizontalWhitespace(source_text, position);
}
auto Lexer::LexVerticalWhitespace(llvm::StringRef source_text,
                                  ssize_t& position) -> void {
  NoteWhitespace();
  // `MakeLines` has already located every line start, so advancing to the
  // next line is just an index bump plus a jump of the position.
  ++line_index_;
  auto* info = current_line_info();
  const ssize_t new_line_start = info->start;
  position = new_line_start;
  // Consume the leading indentation and record its width on the line.
  SkipHorizontalWhitespace(source_text, position);
  info->indent = position - new_line_start;
}
auto Lexer::LexCR(llvm::StringRef source_text, ssize_t& position) -> void {
  // A CR immediately followed by LF is a supported CR+LF line ending: hand it
  // to the vertical whitespace path, which jumps to the next line's start and
  // thereby skips both bytes.
  const ssize_t size = source_text.size();
  if (LLVM_LIKELY(position + 1 < size) &&
      LLVM_LIKELY(source_text[position + 1] == '\n')) {
    LexVerticalWhitespace(source_text, position);
    return;
  }

  CARBON_DIAGNOSTIC(UnsupportedLfCrLineEnding, Error,
                    "the LF+CR line ending is not supported, only LF and CR+LF "
                    "are supported");
  CARBON_DIAGNOSTIC(UnsupportedCrLineEnding, Error,
                    "a raw CR line ending is not supported, only LF and CR+LF "
                    "are supported");
  // Distinguish a reversed LF+CR sequence from a bare CR for the diagnostic.
  const bool preceded_by_lf =
      position > 0 && source_text[position - 1] == '\n';
  // TODO: This diagnostic has an unfortunate snippet -- we should tweak the
  // snippet rendering to gracefully handle CRs.
  emitter_.Emit(
      source_text.begin() + position,
      preceded_by_lf ? UnsupportedLfCrLineEnding : UnsupportedCrLineEnding);

  // Recover by treating the stray CR as horizontal whitespace. That keeps the
  // whitespace rules largely working and lets parsing proceed cleanly without
  // disturbing the pre-built line tracking data structures.
  NoteWhitespace();
  ++position;
}
auto Lexer::LexCommentOrSlash(llvm::StringRef source_text, ssize_t& position)
    -> void {
  CARBON_DCHECK(source_text[position] == '/');
  // Disambiguate with max-munch: `//` starts a comment, a lone `/` is the
  // slash symbol. The comment case is checked first and marked likely, as it
  // is expected to matter far more for overall lexer performance.
  const bool starts_comment =
      position + 1 < static_cast<ssize_t>(source_text.size()) &&
      source_text[position + 1] == '/';
  if (LLVM_LIKELY(starts_comment)) {
    LexComment(source_text, position);
    return;
  }

  // Lexing a slash symbol must always succeed in producing a token.
  LexResult slash_result = LexSymbolToken(source_text, position);
  CARBON_CHECK(slash_result, "Failed to form a token!");
}
auto Lexer::LexComment(llvm::StringRef source_text, ssize_t& position) -> void {
CARBON_DCHECK(source_text.substr(position).starts_with("//"));
int32_t comment_start = position;
// Any comment must be the only non-whitespace on the line.
const auto* line_info = current_line_info();
if (LLVM_UNLIKELY(position != line_info->start + line_info->indent)) {
CARBON_DIAGNOSTIC(TrailingComment, Error,
"trailing comments are not permitted");
emitter_.Emit(source_text.begin() + position, TrailingComment);
// Note that we cannot fall-through here as the logic below doesn't handle
// trailing comments. Instead, we treat trailing comments as vertical
// whitespace, which already is designed to skip over any erroneous text at
// the end of the line.
LexVerticalWhitespace(source_text, position);
buffer_.AddComment(line_info->indent, comment_start, position);
return;
}
// The introducer '//' must be followed by whitespace or EOF.
bool is_valid_after_slashes = true;
if (position + 2 < static_cast<ssize_t>(source_text.size()) &&
LLVM_UNLIKELY(!IsSpace(source_text[position + 2]))) {
CARBON_DIAGNOSTIC(NoWhitespaceAfterCommentIntroducer, Error,
"whitespace is required after '//'");
emitter_.Emit(source_text.begin() + position + 2,
NoWhitespaceAfterCommentIntroducer);
// We use this to tweak the lexing of blocks below.
is_valid_after_slashes = false;
}
// Skip over this line.
ssize_t line_index = line_index_;
++line_index;
position = buffer_.line_infos_[line_index].start;
// A very common pattern is a long block of comment lines all with the same
// indent and comment start. We skip these comment blocks in bulk both for
// speed and to reduce redundant diagnostics if each line has the same
// erroneous comment start like `//!`.
//
// When we have SIMD support this is even more important for speed, as short
// indents can be scanned extremely quickly with SIMD and we expect these to
// be the dominant cases.
//
// TODO: We should extend this to 32-byte SIMD on platforms with support.
constexpr int MaxIndent = 13;
const int indent = line_info->indent;
const ssize_t first_line_start = line_info->start;
ssize_t prefix_size = indent + (is_valid_after_slashes ? 3 : 2);
auto skip_to_next_line = [this, indent, &line_index, &position] {
// We're guaranteed to have a line here even on a comment on the last line
// as we ensure there is an empty line structure at the end of every file.
++line_index;
auto* next_line_info = &buffer_.line_infos_[line_index];
next_line_info->indent = indent;
position = next_line_info->start;
};
if (CARBON_USE_SIMD &&
position + 16 < static_cast<ssize_t>(source_text.size()) &&
indent <= MaxIndent) {
// Load a mask based on the amount of text we want to compare.
auto mask = PrefixMasks[prefix_size];
#if __ARM_NEON
// Load and mask the prefix of the current line.
auto prefix = vld1q_u8(reinterpret_cast<const uint8_t*>(source_text.data() +
first_line_start));
prefix = vandq_u8(mask, prefix);
do {
// Load and mask the next line to consider's prefix.
auto next_prefix = vld1q_u8(
reinterpret_cast<const uint8_t*>(source_text.data() + position));
next_prefix = vandq_u8(mask, next_prefix);
// Compare the two prefixes and if any lanes differ, break.
auto compare = vceqq_u8(prefix, next_prefix);
if (vminvq_u8(compare) == 0) {
break;
}
skip_to_next_line();
} while (position + 16 < static_cast<ssize_t>(source_text.size()));
#elif __x86_64__
// Use the current line's prefix as the exemplar to compare against.
// We don't mask here as we will mask when doing the comparison.
auto prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
source_text.data() + first_line_start));
do {
// Load the next line to consider's prefix.
auto next_prefix = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(source_text.data() + position));
// Compute the difference between the next line and our exemplar. Again,
// we don't mask the difference because the comparison below will be
// masked.
auto prefix_diff = _mm_xor_si128(prefix, next_prefix);
// If we have any differences (non-zero bits) within the mask, we can't
// skip the next line too.
if (!_mm_test_all_zeros(mask, prefix_diff)) {
break;
}
skip_to_next_line();
} while (position + 16 < static_cast<ssize_t>(source_text.size()));
#else
#error "Unsupported SIMD architecture!"
#endif
// TODO: If we finish the loop due to the position approaching the end of
// the buffer we may fail to skip the last line in a comment block that
// has an invalid initial sequence and thus emit extra diagnostics. We
// should really fall through to the generic skipping logic, but the code
// organization will need to change significantly to allow that.
} else {
while (position + prefix_size < static_cast<ssize_t>(source_text.size()) &&
memcmp(source_text.data() + first_line_start,
source_text.data() + position, prefix_size) == 0) {
skip_to_next_line();
}
}
buffer_.AddComment(indent, comment_start, position);
// Now compute the indent of this next line before we finish.
ssize_t line_start = position;
SkipHorizontalWhitespace(source_text, position);
// Now that we're done scanning, update to the latest line index and indent.
line_index_ = line_index;
current_line_info()->indent = position - line_start;