Skip to content

Commit faa8f0a

Browse files
committed
Improve string decoding
1 parent 1fdec37 commit faa8f0a

File tree

2 files changed

+75
-71
lines changed

2 files changed

+75
-71
lines changed

jawk

Lines changed: 36 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -172,29 +172,37 @@ function __hextodec(h) {
172172
return 256 * __HEX[substr(h, 1, 2)] + __HEX[substr(h, 3)]
173173
}
174174
175-
function __unescape(s, i, s2, c, u, h) {
176-
i = match(s, /\\([bfnrt"\\\/]|u[0-9a-fA-F]{4})/)
177-
if (!i) return s
178-
s2 = ""
179-
while (i) {
180-
c = substr(s, RSTART, RLENGTH)
175+
function __error(msg) {
176+
printf "%s: %s\n", __ARGV0, msg >"/dev/stderr"
177+
exit 1
178+
}
179+
180+
function __unescape(s, i, out, c, u, h, l) {
181+
out = ""
182+
while ((i = match(s, /\\/))) {
183+
c = substr(s, i, 2)
181184
if (c in __UNESCAPE) u = __UNESCAPE[c]
182-
else {
185+
else if (match(substr(s, i), /^\\u[0-9a-fA-F]{4}/)) {
186+
c = substr(s, i, RLENGTH)
183187
h = __hextodec(substr(c, 3))
184188
if (h >= 55296 && h <= 56319) {
185-
c = substr(s, RSTART + RLENGTH, 6)
186-
RLENGTH += 6
187-
h = 65536 + ((h - 55296) * 1024) + \
188-
(__hextodec(substr(c, 3)) - 56320)
189+
if (!match( \
190+
substr(s, i+length(c)),
191+
/^\\u[0-9a-fA-F]{4}/ \
192+
))
193+
__error("unpaired high surrogate " c)
194+
c = c substr(s, i+length(c), RLENGTH)
195+
l = __hextodec(substr(c, 9))
196+
if (l < 56320 || l > 57343)
197+
__error("invalid surrogate pair " c)
198+
h = 65536 + ((h - 55296) * 1024) + (l - 56320)
189199
}
190200
u = __utf8enc(h)
191-
}
192-
s2 = s2 substr(s, 1, RSTART - 1) u
193-
s = substr(s, RSTART + RLENGTH)
194-
i = match(s, /\\([bfnrt"\\\/]|u[0-9a-fA-F]{4})/)
201+
} else __error("invalid escape sequence " c)
202+
out = out substr(s, 1, i-1) u
203+
s = substr(s, i+length(c))
195204
}
196-
s2 = s2 s
197-
return s2
205+
return out s
198206
}
199207
200208
function keys(a, o, n, i) {
@@ -219,20 +227,13 @@ function keys(a, o, n, i) {
219227
return n
220228
}
221229
222-
function __error(t) {
223-
printf "%s: unexpected token %s\n", __ARGV0, t >"/dev/stderr"
224-
exit 1
230+
function __terror(t) {
231+
__error("unexpected token " t)
225232
}
226233
227234
function __get_token(t) {
228-
if (getline t == -1) {
229-
printf "%s: read error\n", __ARGV0 >"/dev/stderr"
230-
exit 1
231-
}
232-
if (t == "") {
233-
printf "%s: unexpected EOF\n", __ARGV0 >"/dev/stderr"
234-
exit 1
235-
}
235+
if (getline t == -1) __error("read error")
236+
if (t == "") __error("unexpected EOF")
236237
return t
237238
}
238239
@@ -243,13 +244,13 @@ function __parse_array(path, i, sep, raw_value, value) {
243244
while (sep != "]") {
244245
value = __get_token()
245246
if (value == "]") {
246-
if (sep) __error(value)
247+
if (sep) __terror(value)
247248
raw_value = raw_value value
248249
break
249250
}
250251
value = __parse_value(value, __getpath(path, ++i))
251252
sep = __get_token()
252-
if (sep != "," && sep != "]") __error(sep)
253+
if (sep != "," && sep != "]") __terror(sep)
253254
raw_value = raw_value value sep
254255
}
255256
_[__getpath(path, "length")] = i
@@ -288,7 +289,7 @@ function __parse_value(value, path, raw_value, start, type) {
288289
type = "number"
289290
}
290291
else {
291-
__error(value)
292+
__terror(value)
292293
}
293294
if (path == "" && path == 0)
294295
_[0] = value
@@ -311,18 +312,18 @@ function __parse_object(path, sep, i, raw_value, key, colon, value, raw_key) {
311312
while (sep != "}") {
312313
key = __get_token()
313314
if (key == "}") {
314-
if (sep) __error(key)
315+
if (sep) __terror(key)
315316
raw_value = raw_value key
316317
break
317318
}
318-
if (length(key) < 2 || substr(key, 1, 1) != "\"") __error(key)
319+
if (length(key) < 2 || substr(key, 1, 1) != "\"") __terror(key)
319320
raw_key = key
320321
key = substr(key, 2, length(key) - 2)
321322
colon = __get_token()
322-
if (colon != ":") __error(colon)
323+
if (colon != ":") __terror(colon)
323324
value = __parse_value(__get_token(), __getpath(path, key))
324325
sep = __get_token()
325-
if (sep != "," && sep != "}") __error(sep)
326+
if (sep != "," && sep != "}") __terror(sep)
326327
raw_value = raw_value raw_key colon value sep
327328
++i
328329
_[__getpath(path, __KEYS SUBSEP i)] = key

src/jawk.awk

Lines changed: 39 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -130,31 +130,41 @@ function __hextodec(h) {
130130
return 256 * __HEX[substr(h, 1, 2)] + __HEX[substr(h, 3)]
131131
}
132132

133-
function __unescape(s, i, s2, c, u, h) {
134-
i = match(s, /\\([bfnrt"\\\/]|u[0-9a-fA-F]{4})/)
135-
if (!i) return s
136-
s2 = ""
137-
while (i) {
138-
c = substr(s, RSTART, RLENGTH)
133+
function __error(msg) {
134+
printf "%s: %s\n", __ARGV0, msg >"/dev/stderr"
135+
exit 1
136+
}
137+
138+
function __unescape(s, out, i, c, u, h, l) {
139+
out = ""
140+
while ((i = match(s, /\\/))) {
141+
c = substr(s, i, 2)
139142
if (c in __UNESCAPE) u = __UNESCAPE[c]
140-
else {
143+
else if (match(substr(s, i), /^\\u[0-9a-fA-F]{4}/)) {
144+
c = substr(s, i, RLENGTH)
141145
h = __hextodec(substr(c, 3))
142-
# high surrogate pair
146+
# high surrogate
143147
# 0xd800 - 0xdbff
144148
if (h >= 55296 && h <= 56319) {
145-
c = substr(s, RSTART + RLENGTH, 6)
146-
RLENGTH += 6
147-
h = 65536 + ((h - 55296) * 1024) + \
148-
(__hextodec(substr(c, 3)) - 56320)
149+
if (!match( \
150+
substr(s, i+length(c)),
151+
/^\\u[0-9a-fA-F]{4}/ \
152+
))
153+
__error("unpaired high surrogate " c)
154+
c = c substr(s, i+length(c), RLENGTH)
155+
l = __hextodec(substr(c, 9))
156+
# low surrogate
157+
# 0xdc00 - 0xdfff
158+
if (l < 56320 || l > 57343)
159+
__error("invalid surrogate pair " c)
160+
h = 65536 + ((h - 55296) * 1024) + (l - 56320)
149161
}
150162
u = __utf8enc(h)
151-
}
152-
s2 = s2 substr(s, 1, RSTART - 1) u
153-
s = substr(s, RSTART + RLENGTH)
154-
i = match(s, /\\([bfnrt"\\\/]|u[0-9a-fA-F]{4})/)
163+
} else __error("invalid escape sequence " c)
164+
out = out substr(s, 1, i-1) u
165+
s = substr(s, i+length(c))
155166
}
156-
s2 = s2 s
157-
return s2
167+
return out s
158168
}
159169

160170
function keys(a, o, n, i) {
@@ -185,20 +195,13 @@ function keys(a, o, n, i) {
185195
return n
186196
}
187197

188-
function __error(t) {
189-
printf "%s: unexpected token %s\n", __ARGV0, t >"/dev/stderr"
190-
exit 1
198+
function __terror(t) {
199+
__error("unexpected token " t)
191200
}
192201

193202
function __get_token(t) {
194-
if (getline t == -1) {
195-
printf "%s: read error\n", __ARGV0 >"/dev/stderr"
196-
exit 1
197-
}
198-
if (t == "") {
199-
printf "%s: unexpected EOF\n", __ARGV0 >"/dev/stderr"
200-
exit 1
201-
}
203+
if (getline t == -1) __error("read error")
204+
if (t == "") __error("unexpected EOF")
202205
return t
203206
}
204207

@@ -209,13 +212,13 @@ function __parse_array(path, i, sep, raw_value, value) {
209212
while (sep != "]") {
210213
value = __get_token()
211214
if (value == "]") {
212-
if (sep) __error(value)
215+
if (sep) __terror(value)
213216
raw_value = raw_value value
214217
break
215218
}
216219
value = __parse_value(value, __getpath(path, ++i))
217220
sep = __get_token()
218-
if (sep != "," && sep != "]") __error(sep)
221+
if (sep != "," && sep != "]") __terror(sep)
219222
raw_value = raw_value value sep
220223
}
221224
_[__getpath(path, "length")] = i
@@ -255,7 +258,7 @@ function __parse_value(value, path, raw_value, start, type) {
255258
type = "number"
256259
}
257260
else {
258-
__error(value)
261+
__terror(value)
259262
}
260263
if (path == "" && path == 0)
261264
_[0] = value
@@ -279,18 +282,18 @@ function __parse_object(path, sep, i, raw_value, key, colon, value, raw_key) {
279282
while (sep != "}") {
280283
key = __get_token()
281284
if (key == "}") {
282-
if (sep) __error(key)
285+
if (sep) __terror(key)
283286
raw_value = raw_value key
284287
break
285288
}
286-
if (length(key) < 2 || substr(key, 1, 1) != "\"") __error(key)
289+
if (length(key) < 2 || substr(key, 1, 1) != "\"") __terror(key)
287290
raw_key = key
288291
key = substr(key, 2, length(key) - 2)
289292
colon = __get_token()
290-
if (colon != ":") __error(colon)
293+
if (colon != ":") __terror(colon)
291294
value = __parse_value(__get_token(), __getpath(path, key))
292295
sep = __get_token()
293-
if (sep != "," && sep != "}") __error(sep)
296+
if (sep != "," && sep != "}") __terror(sep)
294297
raw_value = raw_value raw_key colon value sep
295298
++i
296299
_[__getpath(path, __KEYS SUBSEP i)] = key

0 commit comments

Comments
 (0)