-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcustom.go
More file actions
522 lines (432 loc) · 12.9 KB
/
custom.go
File metadata and controls
522 lines (432 loc) · 12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
// Copyright 2023 Google LLC
// Copyright 2025 Ian Lewis
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lexparse
import (
"bufio"
"context"
"errors"
"fmt"
"io"
"os"
"strings"
"github.com/ianlewis/runeio"
)
// EOF is a sentinel rune indicating that the lexer has finished processing
// input (either end of input was reached or an error was recorded).
const EOF rune = -1
// LexState is the state of the current lexing state machine. It defines the
// logic to process the current state and returns the next state.
type LexState interface {
	// Run returns the next state to transition to or an error. If the returned
	// error is io.EOF then the Lexer finishes processing normally. Returning a
	// nil next state halts the state machine.
	Run(ctx *CustomLexerContext) (LexState, error)
}
// lexFnState adapts a bare function to the [LexState] interface.
type lexFnState struct {
	// f is the function invoked by Run.
	f func(*CustomLexerContext) (LexState, error)
}

// Run implements [LexState.Run] by delegating to the wrapped function.
//
//nolint:ireturn // Returning interface required to satisfy [LexState.Run]
func (s *lexFnState) Run(ctx *CustomLexerContext) (LexState, error) {
	return s.f(ctx)
}
// LexStateFn adapts a plain run function into a [LexState].
//
//nolint:ireturn // Returning interface required to satisfy [LexState.Run]
func LexStateFn(f func(*CustomLexerContext) (LexState, error)) LexState {
	return &lexFnState{f: f}
}
// CustomLexerContext is a context that carries a reference to the current
// CustomLexer. It is passed to the [LexState.Run] method to allow the state
// implementation to interact with the lexer.
type CustomLexerContext struct {
	//nolint:containedctx // Embedding context required for interface compliance.
	context.Context

	// l is the lexer this context operates on.
	l *CustomLexer
}
// Advance consumes a single rune from the underlying reader and reports
// whether a rune was actually consumed. The token cursor does not move.
func (ctx *CustomLexerContext) Advance() bool {
	return ctx.l.advance(1, false) == 1
}

// AdvanceN consumes up to n runes from the underlying reader and returns the
// number actually consumed. The token cursor does not move.
func (ctx *CustomLexerContext) AdvanceN(n int) int {
	return ctx.l.advance(n, false)
}

// Cursor reports the position marking the start of the token currently being
// built.
func (ctx *CustomLexerContext) Cursor() Position {
	return ctx.l.cursor
}

// Discard drops the next rune, moving the token cursor forward along with
// the reader, and reports whether a rune was actually dropped.
func (ctx *CustomLexerContext) Discard() bool {
	return ctx.l.advance(1, true) == 1
}

// DiscardN drops up to n runes, moving the token cursor forward along with
// the reader, and returns the number actually dropped.
func (ctx *CustomLexerContext) DiscardN(n int) int {
	return ctx.l.advance(n, true)
}

// DiscardTo scans forward for the first occurrence of any of the given
// search strings, discarding everything before the match; the token cursor
// follows the reader. The matched string is returned, or "" when no match is
// found.
func (ctx *CustomLexerContext) DiscardTo(query []string) string {
	return ctx.l.discardTo(query)
}

// Emit produces a token spanning the text between the token cursor and the
// current reader position, resets the cursor, and returns the token. A nil
// token is returned if the lexer has already recorded an error.
func (ctx *CustomLexerContext) Emit(typ TokenType) *Token {
	return ctx.l.emit(typ)
}

// Find scans forward for the first occurrence of any of the given search
// strings, advancing the reader while leaving the token cursor in place. The
// matched string is returned, or "" when no match is found.
func (ctx *CustomLexerContext) Find(query []string) string {
	return ctx.l.find(query)
}

// Ignore drops the input gathered so far and restarts the token at the
// current reader position.
func (ctx *CustomLexerContext) Ignore() {
	ctx.l.ignore()
}

// NextRune reads and returns the next rune, advancing the reader while
// leaving the token cursor in place.
func (ctx *CustomLexerContext) NextRune() rune {
	return ctx.l.nextRune()
}

// Peek returns the next rune without consuming any input. EOF is returned
// when no rune is available.
func (ctx *CustomLexerContext) Peek() rune {
	runes := ctx.PeekN(1)
	if len(runes) == 0 {
		return EOF
	}

	return runes[0]
}

// PeekN returns up to n upcoming runes without consuming any input. Fewer
// runes than requested may be returned at end of input or on error.
func (ctx *CustomLexerContext) PeekN(n int) []rune {
	return ctx.l.peekN(n)
}

// Pos reports the current position of the underlying reader.
func (ctx *CustomLexerContext) Pos() Position {
	return ctx.l.pos
}

// Token returns the text accumulated for the token currently being built.
func (ctx *CustomLexerContext) Token() string {
	return ctx.l.b.String()
}

// Width reports the length (in runes) of the token currently being built.
// It is equivalent to Pos().Offset - Cursor().Offset.
func (ctx *CustomLexerContext) Width() int {
	return ctx.l.pos.Offset - ctx.l.cursor.Offset
}
// CustomLexer lexically processes a byte stream. It is implemented as a
// finite-state machine in which each [LexState] implements its own processing.
//
// A CustomLexer maintains an internal cursor which marks the start of the next
// token being currently processed. The Lexer can then advance the reader to
// find the end of the token before emitting it.
type CustomLexer struct {
	// buf is a buffer of tokens that have been emitted but not yet returned
	// by NextToken.
	buf []*Token

	// state is the current state of the Lexer. A nil state means the state
	// machine has halted.
	state LexState

	// r is the underlying reader to read from.
	r *runeio.RuneReader

	// b is a strings builder that stores the current token value.
	b strings.Builder

	// pos is the current position of the underlying reader.
	pos Position

	// cursor is the start position of the current token.
	cursor Position

	// err is the first error the lexer encountered. It is sticky and io.EOF
	// is never stored (see setErr).
	err error
}
// NewCustomLexer creates a new CustomLexer initialized with the given
// starting [LexState]. Tokens are pulled from the lexer via
// [CustomLexer.NextToken].
//
// If the reader is an *os.File, the file's name is used to initialize the
// lexer's positional information; it can be overridden with
// [CustomLexer.SetFilename].
func NewCustomLexer(reader io.Reader, startingState LexState) *CustomLexer {
	var fileName string
	if file, isFile := reader.(*os.File); isFile {
		fileName = file.Name()
	}

	// Both the reader position and the token cursor begin at the start of
	// the input: offset 0, line 1, column 1.
	start := Position{
		Filename: fileName,
		Offset:   0,
		Line:     1,
		Column:   1,
	}

	customLexer := &CustomLexer{
		state:  startingState,
		pos:    start,
		cursor: start,
	}

	// Avoid double-buffering when the caller already supplies a
	// *bufio.Reader.
	br, isBufReader := reader.(*bufio.Reader)
	if !isBufReader {
		br = bufio.NewReader(reader)
	}
	customLexer.r = runeio.NewReader(br)

	return customLexer
}
// NextToken implements [Lexer.NextToken]. It drives the state machine until
// at least one token has been emitted (or lexing ends) and returns the next
// buffered token. Once input is exhausted, canceled, or an error occurs,
// every subsequent call returns a token of type [TokenTypeEOF].
func (l *CustomLexer) NextToken(ctx context.Context) *Token {
	if l.err != nil {
		return l.newToken(TokenTypeEOF)
	}

	stateCtx := &CustomLexerContext{
		Context: ctx,
		l:       l,
	}

	// Run states until one emits a token or the machine halts (nil state).
	for len(l.buf) == 0 && l.state != nil {
		// Check for cancellation here rather than relying on each
		// [LexState.Run] implementation to check the context.
		select {
		case <-ctx.Done():
			l.setErr(ctx.Err())
			return l.newToken(TokenTypeEOF)
		default:
		}

		next, err := l.state.Run(stateCtx)
		l.state = next
		l.setErr(err)

		if l.err != nil {
			return l.newToken(TokenTypeEOF)
		}
	}

	if len(l.buf) == 0 {
		// The machine halted without emitting anything more; we are at the
		// end of the input.
		return l.newToken(TokenTypeEOF)
	}

	// Return the next buffered token. An EOF token is intentionally left in
	// the buffer so repeated calls keep returning it.
	token := l.buf[0]
	if token.Type != TokenTypeEOF {
		l.buf = l.buf[1:]
	}

	return token
}
// nextRune reads a single rune from the underlying reader, updates the
// reader position, and appends the rune to the current token text. EOF is
// returned at end of input or once an error has been recorded.
func (l *CustomLexer) nextRune() rune {
	if l.err != nil {
		return EOF
	}

	rn, _, err := l.r.ReadRune()
	if err != nil {
		l.setErr(err)
		return EOF
	}

	// Track the reader position; a newline starts a new line at column 1.
	l.pos.Offset++
	if rn == '\n' {
		l.pos.Line++
		l.pos.Column = 1
	} else {
		l.pos.Column++
	}

	// strings.Builder.WriteRune never returns a non-nil error.
	_, _ = l.b.WriteRune(rn)

	return rn
}
// advance attempts to advance the reader numRunes runes and returns the
// number of runes actually advanced. If discard is true the token
// cursor position is updated as well (via the deferred ignore) and the
// skipped text is dropped rather than appended to the current token.
func (l *CustomLexer) advance(numRunes int, discard bool) int {
	// The error is sticky; once set the lexer consumes no more input.
	if l.err != nil {
		return 0
	}

	var advanced int

	if discard {
		// ignore runs after all position updates so the cursor lands on the
		// final reader position and the token text is reset.
		defer l.ignore()
	}

	// We will attempt to do a zero-copy read by peeking at no more than what is
	// currently buffered in the reader operating on a slice that points
	// directly to the buffer's memory.

	// Minimum size the buffer of underlying reader could be expected to be.
	minSize := 16
	for numRunes > 0 {
		// Determine the number of runes to read: whatever is already
		// buffered, capped at the number still wanted.
		toRead := l.r.Buffered()
		if numRunes < toRead {
			toRead = numRunes
		}

		if toRead == 0 {
			// Nothing buffered; fall back to a small fixed-size read.
			if minSize < numRunes {
				toRead = minSize
			} else {
				toRead = numRunes
			}
		}

		// Peek at input so we can increment position, line, column counters.
		peekedRunes, peekErr := l.r.Peek(toRead)
		if peekErr != nil && !errors.Is(peekErr, io.EOF) {
			l.setErr(fmt.Errorf("peeking input: %w", peekErr))
			return advanced
		}

		// Advance by peeked amount.
		numDiscarded, dErr := l.r.Discard(len(peekedRunes))
		advanced += numDiscarded
		l.pos.Offset += numDiscarded

		// NOTE: We must be careful since toRead could be different from # of
		// runes peeked and/or discarded. We will only actually advance by the
		// number of runes discarded in the underlying reader to maintain
		// consistency.
		for i := range numDiscarded {
			if peekedRunes[i] == '\n' {
				l.pos.Line++
				l.pos.Column = 1
			} else {
				l.pos.Column++
			}
		}

		if !discard {
			// Append the consumed text to the current token value.
			// NOTE(review): this appends all peeked runes even if
			// numDiscarded < len(peekedRunes); on a partial Discard the token
			// text could include unconsumed runes. The error below is sticky
			// so lexing stops anyway — confirm this is acceptable.
			l.b.WriteString(string(peekedRunes))
		}

		if dErr != nil {
			l.setErr(fmt.Errorf("discarding input: %w", dErr))
			return advanced
		}

		if peekErr != nil {
			// EOF from Peek
			return advanced
		}

		numRunes -= numDiscarded
	}

	return advanced
}
// discardTo searches the input for the first occurrence of one of the given
// search strings, discarding everything before the match. The matched string
// is returned; "" is returned when no match is found or an error occurs.
func (l *CustomLexer) discardTo(query []string) string {
	// maxLen is the length (in bytes) of the longest search string. A window
	// of maxLen runes is always at least maxLen bytes, so every query fits
	// inside a window even for multibyte input.
	var maxLen int
	for i := range query {
		if len(query[i]) > maxLen {
			maxLen = len(query[i])
		}
	}

	// Only empty (or no) search strings were given; nothing can match.
	if maxLen == 0 {
		return ""
	}

	for {
		// Peek at least maxLen runes (more if already buffered) so each
		// window of maxLen runes can be checked for a match.
		bufS := l.r.Buffered()
		if bufS < maxLen {
			bufS = maxLen
		}
		// TODO(#94): use backtracking
		rns := l.peekN(bufS)

		for i := range len(rns) - maxLen + 1 {
			for j := range query {
				if strings.HasPrefix(string(rns[i:i+maxLen]), query[j]) {
					// We have found a match. Discard prior runes and return.
					if n := l.advance(i, true); n < i {
						// We should have been able to advance by this amount.
						// An error has likely occurred.
						return ""
					}
					return query[j]
				}
			}
		}

		// Advance the reader by the runes peeked checked.
		// NOTE: Only advance the reader the number of runes that could never
		// match the substring. Not the full number peeked.
		toDiscard := len(rns) - maxLen + 1
		if toDiscard <= 0 {
			// Near end of input fewer than maxLen runes remain; discard one
			// at a time. When advance can no longer make progress (EOF or
			// error) the check below terminates the loop.
			toDiscard = 1
		}
		if n := l.advance(toDiscard, true); n < toDiscard {
			// We should have been able to advance by this amount.
			// An error has likely occurred.
			return ""
		}
	}
}
// emit creates a token spanning the text between the token cursor and the
// current reader position, appends it to the output buffer, and resets the
// cursor. It returns nil once an error has been recorded.
func (l *CustomLexer) emit(typ TokenType) *Token {
	if l.err != nil {
		return nil
	}

	tok := l.newToken(typ)
	l.buf = append(l.buf, tok)

	// Start the next token at the current reader position.
	l.ignore()

	return tok
}
// find searches the input for the first occurrence of one of the given
// search strings, advancing the reader up to (but not past) the match. The
// matched string is returned; "" is returned when the input is exhausted
// without a match. Runes consumed while scanning are appended to the current
// token text via nextRune; the token cursor itself is not moved.
func (l *CustomLexer) find(query []string) string {
	// maxLen is the length (in bytes) of the longest search string.
	var maxLen int
	for i := range query {
		if len(query[i]) > maxLen {
			maxLen = len(query[i])
		}
	}

	// Only empty (or no) search strings were given; nothing can match.
	if maxLen == 0 {
		return ""
	}

	// TODO(#94): use backtracking
	for {
		// Continue until PeekN can't get any new runes or we find a string
		// we're looking for.
		rns := l.peekN(maxLen)
		if len(rns) == 0 {
			return ""
		}
		for j := range query {
			if strings.HasPrefix(string(rns), query[j]) {
				return query[j]
			}
		}
		// No match at this position: consume a single rune and retry.
		_ = l.nextRune()
	}
}
// ignore resets the token start: the cursor jumps to the current reader
// position and any accumulated token text is dropped.
func (l *CustomLexer) ignore() {
	l.cursor = l.pos
	l.b.Reset()
}
// newToken creates a new token starting from the current cursor position to
// the current reader position. The token's value is the text accumulated in
// the lexer's builder since the cursor was last reset.
func (l *CustomLexer) newToken(typ TokenType) *Token {
	return &Token{
		Type:  typ,
		Value: l.b.String(),
		Start: l.cursor,
		End:   l.pos,
	}
}
// peekN returns up to n upcoming runes without consuming input. It returns
// nil once the lexer has recorded an error.
func (l *CustomLexer) peekN(n int) []rune {
	if l.err != nil {
		return nil
	}

	runes, err := l.r.Peek(n)
	// A short peek at end of input yields io.EOF, which setErr ignores.
	l.setErr(err)

	return runes
}
// Err returns the first (non-io.EOF) error the lexer encountered, or nil.
func (l *CustomLexer) Err() error {
	return l.err
}
// setErr records err as the lexer's sticky error. Only the first error is
// kept, and io.EOF is never recorded since it signals normal termination.
func (l *CustomLexer) setErr(err error) {
	if l.err != nil || errors.Is(err, io.EOF) {
		return
	}

	l.err = err
}
// SetFilename sets the filename reported in the lexer's positional
// information, for both the reader position and the token cursor.
func (l *CustomLexer) SetFilename(name string) {
	l.cursor.Filename = name
	l.pos.Filename = name
}