Skip to content

Commit 68dec50

Browse files
author
David Kilzer
committed
tidy-html5:tidy_fuzzer: Heap-buffer-overflow in prvTidyEncodeCharToUTF8Bytes
<https://issues.oss-fuzz.com/issues/42498297>

Found by oss-fuzz.

Fixes potential out-of-bounds write in both NormalizeSpaces() and DowngradeTypography().
Adds assert() statements to catch more bugs with fuzzing.

* src/clean.c:
(NormalizeSpaces):
(DowngradeTypography):
- Use a temporary buffer when calling PutUTF8() to avoid a heap buffer overflow write and to avoid clobbering data in-place.
- Handle all possible return values after calling PutUTF8().

* src/utf8.c:
(DecodeUTF8BytesToChar):
(GetUTF8):
(PutUTF8):
- Add assert() statements to catch bugs during fuzzing.
1 parent d08ddc2 commit 68dec50

File tree

2 files changed

+58
-7
lines changed

2 files changed

+58
-7
lines changed

src/clean.c

+54-6
Original file line number | Diff line number | Diff line change
@@ -1821,18 +1821,42 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
18211821

18221822
for (i = node->start; i < node->end; ++i)
18231823
{
1824+
uint utf8BytesRead = 1, utf8BytesWritten = 0;
1825+
tmbchar tempbuf[10] = {0};
1826+
tmbstr result = NULL;
18241827
c = (byte) lexer->lexbuf[i];
18251828

18261829
/* look for UTF-8 multibyte character */
18271830
if ( c > 0x7F )
1828-
i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1831+
utf8BytesRead += TY_(GetUTF8)( lexer->lexbuf + i, &c );
18291832

18301833
if ( c == 160 )
18311834
c = ' ';
18321835

1833-
p = TY_(PutUTF8)(p, c);
1836+
result = TY_(PutUTF8)(&tempbuf[0], c);
1837+
utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0;
1838+
if ( utf8BytesWritten == 0 ) {
1839+
(lexer->lexbuf + i)[0] = (tmbchar) c;
1840+
++p;
1841+
}
1842+
else if ( utf8BytesRead >= utf8BytesWritten ) {
1843+
memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten);
1844+
i += utf8BytesRead - 1; /* Offset ++i in for loop. */
1845+
p += utf8BytesWritten;
1846+
} else {
1847+
/* Error; keep this byte and move to the next. */
1848+
if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) {
1849+
#if 1 && defined(_DEBUG)
1850+
fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten);
1851+
fprintf(stderr, "i = %d, c = %u\n", i, c);
1852+
#endif
1853+
assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */
1854+
}
1855+
++p;
1856+
}
18341857
}
1835-
node->end = p - lexer->lexbuf;
1858+
intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0;
1859+
node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end;
18361860
}
18371861

18381862
node = node->next;
@@ -2504,10 +2528,13 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
25042528

25052529
for (i = node->start; i < node->end; ++i)
25062530
{
2531+
uint utf8BytesRead = 1, utf8BytesWritten = 0;
2532+
tmbchar tempbuf[10] = {0};
2533+
tmbstr result = NULL;
25072534
c = (unsigned char) lexer->lexbuf[i];
25082535

25092536
if (c > 0x7F)
2510-
i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2537+
utf8BytesRead += TY_(GetUTF8)(lexer->lexbuf + i, &c);
25112538

25122539
if (c >= 0x2013 && c <= 0x201E)
25132540
{
@@ -2530,10 +2557,31 @@ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
25302557
}
25312558
}
25322559

2533-
p = TY_(PutUTF8)(p, c);
2560+
result = TY_(PutUTF8)(&tempbuf[0], c);
2561+
utf8BytesWritten = (result > &tempbuf[0]) ? (uint)(result - &tempbuf[0]) : 0;
2562+
if ( utf8BytesWritten == 0 ) {
2563+
(lexer->lexbuf + i)[0] = (tmbchar) c;
2564+
++p;
2565+
}
2566+
else if ( utf8BytesRead >= utf8BytesWritten ) {
2567+
memmove(&(lexer->lexbuf + i)[0], &tempbuf[0], utf8BytesWritten);
2568+
i += utf8BytesRead - 1; /* Offset ++i in for loop. */
2569+
p += utf8BytesWritten;
2570+
} else {
2571+
/* Error; keep this byte and move to the next. */
2572+
if ( c != 0xFFFD && utf8BytesRead != utf8BytesWritten ) {
2573+
#if 1 && defined(_DEBUG)
2574+
fprintf(stderr, "utf8BytesRead = %u, utf8BytesWritten = %u\n", utf8BytesRead, utf8BytesWritten);
2575+
fprintf(stderr, "i = %d, c = %u\n", i, c);
2576+
#endif
2577+
assert( utf8BytesRead == utf8BytesWritten ); /* Can't extend buffer. */
2578+
}
2579+
++p;
2580+
}
25342581
}
25352582

2536-
node->end = p - lexer->lexbuf;
2583+
intptr_t pos = (p > lexer->lexbuf) ? (p - lexer->lexbuf) : 0;
2584+
node->end = (pos >= node->start && pos <= node->end) ? (uint)pos : node->end;
25372585
}
25382586

25392587
if (node->content)

src/utf8.c

+4-1
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,7 @@ int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
321321
}
322322
#endif
323323

324+
assert(bytes > 0);
324325
*count = bytes;
325326
*c = n;
326327
if ( hasError )
@@ -443,7 +444,8 @@ uint TY_(GetUTF8)( ctmbstr str, uint *ch )
443444
}
444445

445446
*ch = n;
446-
return bytes - 1;
447+
assert(bytes > 0);
448+
return (bytes > 0) ? (bytes - 1) : 0;
447449
}
448450

449451
/* store char c as UTF-8 encoded byte stream */
@@ -464,6 +466,7 @@ tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
464466
count = 3;
465467
}
466468

469+
assert(count >= 0);
467470
buf += count;
468471
return buf;
469472
}

0 commit comments

Comments
 (0)