-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathstandardizer.go
More file actions
301 lines (287 loc) · 10.2 KB
/
standardizer.go
File metadata and controls
301 lines (287 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
// Copyright (c) 2023 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package hujson
import (
"io"
"slices"
"unicode/utf8"
)
// NewStandardizer returns an [io.Reader] that converts a HuJSON stream rd
// into standard JSON by overwriting comments and trailing commas with
// space characters, so line numbers and byte offsets in the input are
// preserved in the output.
//
// In contrast to [Standardize], it performs only the minimal rewriting
// needed to turn valid HuJSON into valid standard JSON and does not check
// the complete HuJSON grammar itself; syntax errors are left for a
// downstream standard JSON parser to report.
// The output is valid JSON exactly when the input is valid HuJSON.
func NewStandardizer(rd io.Reader) *Standardizer {
	s := new(Standardizer)
	s.rd = rd
	return s
}
// Standardizer is an [io.Reader] that reads standard JSON from a HuJSON stream.
type Standardizer struct {
	rd    io.Reader // underlying stream of HuJSON input
	rdErr error     // non-persistent read error; reported once by Read, then cleared
	standardizerBuffer // buffered data and the comment/comma state machine
}
// Reset discards the Standardizer's state and
// makes it equivalent to calling [NewStandardizer] with rd instead,
// except that the internal buffer's storage is retained for reuse.
func (r *Standardizer) Reset(rd io.Reader) {
	*r = Standardizer{rd: rd, standardizerBuffer: standardizerBuffer{buffer: r.buffer[:0]}}
}
// Read implements [io.Reader], reading standardized JSON
// from the underlying stream of HuJSON input.
func (r *Standardizer) Read(b []byte) (n int, err error) {
	defer func() {
		// Only report errors if there is no more standardized JSON data to copy.
		if r.jsonOffset >= r.commaOffset {
			err = r.rdErr
			r.rdErr = nil // let underlying io.Reader handle persistence
			// An EOF while mid-comment or with unconsumed buffered data
			// means the HuJSON input was truncated.
			if err == io.EOF && r.expectingMore() {
				err = io.ErrUnexpectedEOF
			}
		}
	}()
	switch {
	// Check whether there is already standardized data to copy out.
	case r.jsonOffset < r.commaOffset:
		n = copy(b, r.buffer[r.jsonOffset:r.commaOffset])
		r.jsonOffset += n
	// Check whether we encountered a previous read error.
	// Nothing to copy; the deferred func above reports (and clears) it.
	case r.rdErr != nil:
		break
	// Check whether we already have data in the internal buffer.
	// If so, read into it and convert it within there,
	// copying out any standardized JSON data.
	case r.commaOffset < len(r.buffer):
		// Read at most as many bytes as are already pending, growing the
		// buffer just enough to hold them before standardizing in place.
		n = len(r.buffer) - r.commaOffset
		r.buffer = slices.Grow(r.buffer, n)
		n, r.rdErr = r.rd.Read(r.buffer[len(r.buffer):][:n])
		r.buffer = r.buffer[:len(r.buffer)+n]
		r.standardize()
		n = copy(b, r.buffer[r.jsonOffset:r.commaOffset])
		r.jsonOffset += n
	// Otherwise, the internal buffer is empty. As an optimization,
	// read directly into the external buffer and standardize it in place
	// using the previous state. Some data may not be standardized,
	// so the unconverted data and state must be preserved.
	default:
		n, r.rdErr = r.rd.Read(b)
		sb := standardizerBuffer{buffer: b[:n], state: r.state}
		sb.standardize()
		// Carry the not-yet-standardized tail (b[sb.commaOffset:n]) over
		// into the internal buffer so a future Read can finish it.
		r.standardizerBuffer = standardizerBuffer{
			buffer:       append(r.buffer[:0], b[sb.commaOffset:n]...),
			hujsonOffset: sb.hujsonOffset - sb.commaOffset,
			state:        sb.state,
		}
		n = sb.commaOffset
	}
	return n, err
}
// standardizerBuffer is a buffer split into several segments:
//
//   - buffer[:jsonOffset] contains data that has already been copied out
//     to a Read call. This is considered unused buffer space.
//
//   - buffer[jsonOffset:commaOffset] contains already standardized data
//     that is safe to copy out to a future Read call.
//
//   - buffer[commaOffset:hujsonOffset] contains data that is standardized except
//     that it may start with a trailing comma, which cannot be standardized
//     until we find the next JSON token. If the next token is a closing
//     object or array delimiter, then we must elide the comma.
//     Since there is no limit to the whitespace between a trailing comma and
//     the closing delimiter, this may buffer an unbounded amount of memory.
//     This is always empty unless commaState == afterPossibleTrailingComma.
//
//   - buffer[hujsonOffset:] contains HuJSON data that is not yet standardized.
//     After calling standardize, this is either empty or contains a
//     short fragment because the meaning cannot yet be determined.
//     Fragments can occur within a comment when validating for UTF-8 and
//     a UTF-8 encoded sequence is truncated.
//     It can also occur within a block comment where the buffer
//     ends with a '*' and we do not know if the next character is a '/' or not,
//     which may terminate or continue the block comment sequence.
//     A fragment is always shorter than [utf8.UTFMax].
//
// Invariant: 0 <= jsonOffset <= commaOffset <= hujsonOffset <= len(buffer)
//
// It maintains a finite state machine for eliding comments and trailing commas.
// It does not validate for JSON as that requires a push-down automaton,
// which requires O(n) of stack memory.
type standardizerBuffer struct {
	buffer       []byte
	jsonOffset   int
	commaOffset  int
	hujsonOffset int
	state        struct {
		comment commentState
		comma   commaState
	}
}

// commentState is a finite state machine for eliding HuJSON comments.
type commentState uint8

const (
	withinWhitespace       commentState = iota // zero or more whitespace characters
	withinLineComment                          // begins with "//" and ends with "\n"
	withinBlockComment                         // begins with "/*" and ends with "*/"
	withinStringLiteral                        // begins with '"' and ends with unescaped '"'
	withinNonStringLiteral                     // one or more non-whitespace or non-structural characters
)

// commaState is a finite state machine for eliding HuJSON trailing commas.
// A trailing comma only occurs after the completion of a JSON value and
// before a closing object or array delimiter.
type commaState uint8

const (
	beforeValueEnd commaState = iota
	afterValueEnd
	afterPossibleTrailingComma
)

// standardize standardizes HuJSON as standard JSON in place,
// advancing hujsonOffset (and usually commaOffset) past all converted data.
// This is an idempotent operation.
func (s *standardizerBuffer) standardize() {
	b := s.buffer
	i := s.hujsonOffset
stateMachine: // whenever state changes, continue here
	for uint(i) < uint(len(b)) {
		switch s.state.comment {
		case withinWhitespace: // JSON whitespace
			for uint(i) < uint(len(b)) {
				switch b[i] {
				case ' ', '\n', '\r', '\t': // skip over whitespace
					i += len(" ")
				case '/': // possible comment
					if uint(i+1) >= uint(len(b)) {
						break stateMachine // truncated input
					}
					switch b[i+1] {
					case '/': // HuJSON line comment
						// Blank out both bytes of the "//" opener;
						// i advances past both, so both must be spaces.
						copy(b[i:], "  ")
						s.state.comment = withinLineComment
						i += len("//")
						continue stateMachine
					case '*': // HuJSON block comment
						copy(b[i:], "  ") // blank out both bytes of "/*"
						s.state.comment = withinBlockComment
						i += len("/*")
						continue stateMachine
					default: // invalid token; see withinNonStringLiteral case below
						s.state.comment = withinNonStringLiteral
						s.state.comma = beforeValueEnd
						i += len("/")
						continue stateMachine
					}
				case '{', '[', ':':
					s.state.comma = beforeValueEnd
					i += len("{")
					continue stateMachine
				case ',':
					if s.state.comma == afterValueEnd {
						// Possibly a trailing comma; remember where it is
						// so it can be elided if a closing delimiter follows.
						s.state.comma = afterPossibleTrailingComma
						s.commaOffset = i
					} else {
						s.state.comma = beforeValueEnd
					}
					i += len(",")
					continue stateMachine
				case '}', ']':
					if s.state.comma == afterPossibleTrailingComma {
						b[s.commaOffset] = ' ' // elide the trailing comma
					}
					s.state.comma = afterValueEnd
					i += len("}")
					continue stateMachine
				case '"':
					s.state.comment = withinStringLiteral
					s.state.comma = beforeValueEnd
					i += len(`"`)
					continue stateMachine
				default:
					s.state.comment = withinNonStringLiteral
					s.state.comma = beforeValueEnd
					i += len(`?`)
					continue stateMachine
				}
			}
		case withinLineComment, withinBlockComment: // HuJSON comments
			for uint(i) < uint(len(b)) {
				switch {
				case b[i] == '\n' && s.state.comment == withinLineComment:
					// The terminating newline is itself JSON whitespace
					// and is preserved as-is.
					i += len("\n")
					s.state.comment = withinWhitespace
					continue stateMachine
				case b[i] == '*' && s.state.comment == withinBlockComment:
					if uint(i+1) >= uint(len(b)) {
						break stateMachine // truncated input
					}
					if b[i+1] == '/' {
						copy(b[i:], "  ") // blank out both bytes of "*/"
						i += len("*/")
						s.state.comment = withinWhitespace
						continue stateMachine
					}
					fallthrough
				case b[i] < utf8.RuneSelf: // single-byte ASCII
					switch b[i] {
					case ' ', '\n', '\r', '\t':
					default:
						b[i] = ' ' // convert non-whitespace to space
					}
					i += len(" ")
				default: // multi-byte Unicode
					// Invalid UTF-8 bytes are not replaced with spaces so that
					// a standard JSON parser can detect them as invalid syntax.
					r, rn := utf8.DecodeRune(b[i:])
					switch {
					case r != utf8.RuneError || rn != 1:
						// Replace all rn bytes (rn <= utf8.UTFMax == 4)
						// of the valid UTF-8 sequence with spaces.
						copy(b[i:][:rn], "    ")
					case !utf8.FullRune(b[i:]):
						break stateMachine // truncated UTF-8 sequence
					}
					i += rn
				}
			}
		case withinStringLiteral: // JSON strings
			for uint(i) < uint(len(b)) {
				switch b[i] {
				case '"': // terminating double quote
					s.state.comment = withinWhitespace
					s.state.comma = afterValueEnd
					i += len(`"`)
					continue stateMachine
				case '\\': // escaped byte (possibly a double quote)
					if uint(i+1) >= uint(len(b)) {
						break stateMachine // truncated input
					}
					i += len(`\?`)
				default: // non-escaped byte
					i += len("?")
				}
			}
		case withinNonStringLiteral: // JSON null, booleans, numbers
			// This treats all non-whitespace and non-structural characters as
			// part of a JSON non-string literal. This may include invalid JSON,
			// which is admissible since it will be passed on verbatim for
			// a standard JSON parser to eventually reject as a syntax error.
			for uint(i) < uint(len(b)) {
				switch b[i] {
				case ' ', '\n', '\r', '\t', '/', '{', '[', ':', ',', '}', ']', '"':
					s.state.comment = withinWhitespace
					s.state.comma = afterValueEnd
					continue stateMachine
				default:
					i += len("?")
				}
			}
		}
	}
	// Everything up to i has been standardized; unless a possible trailing
	// comma is still pending, it is all safe to copy out.
	if s.state.comma != afterPossibleTrailingComma {
		s.commaOffset = i
	}
	s.hujsonOffset = i
	s.buffer = b
}
// expectingMore reports whether there might be more standard JSON data to read.
//
// That is the case while unconsumed data remains in the buffer, or while the
// state machine is still inside a comment whose terminator has not been seen.
func (s *standardizerBuffer) expectingMore() bool {
	if s.commaOffset < len(s.buffer) {
		return true
	}
	switch s.state.comment {
	case withinLineComment, withinBlockComment:
		return true
	}
	return false
}