Skip to content

Commit eef6a6e

Browse files
committed
Charset: Replace polyfill wp_has_noncharacters() with direct PCRE version.
Found during fuzzing work on the HTML API and adjacent code.

The previous version of this function used a Unicode PCRE to detect noncharacter code points, but that invocation failed if the input string contained sequences of invalid UTF-8 bytes.

This patch replaces the Unicode PCRE with a mapped sequence of raw bytes. This version works in environments without Unicode support and it works when invalid bytes are present, making it possible to remove the fallback function as well.

Developed in: WordPress#12148
Discussed in: https://core.trac.wordpress.org/ticket/65372

Follow-up to [61000].

Props dmsnell, jonsurrell.
See #65372.

git-svn-id: https://develop.svn.wordpress.org/trunk@62485 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 6c363d4 commit eef6a6e

3 files changed

Lines changed: 49 additions & 116 deletions

File tree

src/wp-includes/compat-utf8.php

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,7 @@ function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_
404404
* Fallback support for determining if a string contains Unicode noncharacters.
405405
*
406406
* @since 6.9.0
407+
* @deprecated 7.1.0
407408
* @access private
408409
*
409410
* @see \wp_has_noncharacters()
@@ -412,17 +413,9 @@ function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_
412413
* @return bool Whether noncharacters were found in the string.
413414
*/
414415
function _wp_has_noncharacters_fallback( string $text ): bool {
415-
$at = 0;
416-
$invalid_length = 0;
417-
$has_noncharacters = false;
418-
$end = strlen( $text );
419-
420-
while ( $at < $end && ! $has_noncharacters ) {
421-
_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
422-
$at += $invalid_length;
423-
}
416+
_deprecated_function( __FUNCTION__, '7.1.0' );
424417

425-
return $has_noncharacters;
418+
return wp_has_noncharacters( $text );
426419
}
427420

428421
/**

src/wp-includes/utf8.php

Lines changed: 36 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -134,44 +134,40 @@ function wp_scrub_utf8( $text ) {
134134
}
135135
endif;
136136

137-
if ( _wp_can_use_pcre_u() ) :
138-
/**
139-
* Returns whether the given string contains Unicode noncharacters.
140-
*
141-
* XML recommends against using noncharacters and HTML forbids their
142-
* use in attribute names. Unicode recommends that they not be used
143-
* in open exchange of data.
144-
*
145-
* Noncharacters are code points within the following ranges:
146-
* - U+FDD0–U+FDEF
147-
* - U+FFFE–U+FFFF
148-
* - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
149-
*
150-
* @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
151-
* @see https://www.w3.org/TR/xml/#charsets
152-
* @see https://html.spec.whatwg.org/#attributes-2
153-
*
154-
* @since 6.9.0
155-
*
156-
* @param string $text Are there noncharacters in this string?
157-
* @return bool Whether noncharacters were found in the string.
158-
*/
159-
function wp_has_noncharacters( string $text ): bool {
160-
return 1 === preg_match(
161-
'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
162-
$text
163-
);
164-
}
165-
else :
166-
/**
167-
* Fallback function for detecting noncharacters in a text.
168-
*
169-
* @ignore
170-
* @private
171-
*
172-
* @since 6.9.0
137+
/**
138+
* Returns whether the given string contains Unicode noncharacters.
139+
*
140+
* XML recommends against using noncharacters and HTML forbids their
141+
* use in attribute names. Unicode recommends that they not be used
142+
* in open exchange of data.
143+
*
144+
* Noncharacters are code points within the following ranges:
145+
* - U+FDD0–U+FDEF
146+
* - U+FFFE–U+FFFF
147+
* - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
148+
*
149+
* @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
150+
* @see https://www.w3.org/TR/xml/#charsets
151+
* @see https://html.spec.whatwg.org/#attributes-2
152+
*
153+
* @since 6.9.0
154+
*
155+
* @param string $text Are there noncharacters in this string?
156+
* @return bool Whether noncharacters were found in the string.
157+
*/
158+
function wp_has_noncharacters( string $text ): bool {
159+
/*
160+
* Match the UTF-8 byte sequences directly so malformed UTF-8 elsewhere
161+
* in the subject does not cause PCRE's Unicode mode to reject the string.
173162
*/
174-
function wp_has_noncharacters( string $text ): bool {
175-
return _wp_has_noncharacters_fallback( $text );
176-
}
177-
endif;
163+
return 1 === preg_match(
164+
'~
165+
# U+FDD0-U+FDEF, U+FFFE-U+FFFF
166+
\xEF(?:\xB7[\x90-\xAF]|\xBF[\xBE\xBF])
167+
|
168+
# U+nFFFE/U+nFFFF
169+
(?:\xF0[\x9F\xAF\xBF]|[\xF1-\xF3][\x8F\x9F\xAF\xBF]|\xF4\x8F)\xBF[\xBE\xBF]
170+
~x',
171+
$text
172+
);
173+
}

tests/phpunit/tests/unicode/wpHasNoncharacters.php

Lines changed: 10 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -41,34 +41,24 @@ public function test_detects_non_characters( string $noncharacter ) {
4141
}
4242

4343
/**
44-
* Ensures that a noncharacter inside a string will be properly detected
45-
* using the fallback function when Unicode PCRE support is missing.
44+
* Ensures that invalid UTF-8 does not prevent noncharacter detection.
4645
*
47-
* @ticket 63863
48-
*
49-
* @dataProvider data_noncharacters
50-
*
51-
* @param string $noncharacter Noncharacter as a UTF-8 string.
46+
* @ticket 65372
5247
*/
53-
public function test_fallback_detects_non_characters( string $noncharacter ) {
54-
$this->assertTrue(
55-
_wp_has_noncharacters_fallback( $noncharacter ),
56-
'Failed to detect entire string as noncharacter.'
57-
);
58-
48+
public function test_detects_non_characters_when_string_contains_invalid_utf8() {
5949
$this->assertTrue(
60-
_wp_has_noncharacters_fallback( "{$noncharacter} and more." ),
61-
'Failed to detect noncharacter prefix.'
50+
wp_has_noncharacters( "Invalid byte \xF1 before \u{FDD0}." ),
51+
'Failed to detect noncharacter after invalid UTF-8.'
6252
);
6353

6454
$this->assertTrue(
65-
_wp_has_noncharacters_fallback( "Some text and then a {$noncharacter} and more." ),
66-
'Failed to detect medial noncharacter.'
55+
wp_has_noncharacters( "Noncharacter \u{10FFFF} before invalid byte \xF1." ),
56+
'Failed to detect noncharacter before invalid UTF-8.'
6757
);
6858

69-
$this->assertTrue(
70-
_wp_has_noncharacters_fallback( "Some text and a {$noncharacter}." ),
71-
'Failed to detect noncharacter suffix.'
59+
$this->assertFalse(
60+
wp_has_noncharacters( "Invalid byte \xF1 without noncharacters." ),
61+
'Falsely detected noncharacter in invalid UTF-8.'
7262
);
7363
}
7464

@@ -117,52 +107,6 @@ static function ( $c ) {
117107
}
118108
}
119109

120-
/**
121-
* Ensures that Unicode characters are not falsely detect as noncharacters
122-
* using the fallback function when Unicode PCRE support is missing.
123-
*
124-
* @ticket 63863
125-
*/
126-
public function test_fallback_avoids_false_positives() {
127-
// Get all the noncharacters in one long string, each surrounded on both sides by null bytes.
128-
$noncharacters = implode(
129-
"\x00",
130-
array_map(
131-
static function ( $c ) {
132-
return "\x00{$c}";
133-
},
134-
array_column( array_values( iterator_to_array( self::data_noncharacters() ) ), 0 )
135-
)
136-
) . "\x00";
137-
138-
$this->assertFalse(
139-
_wp_has_noncharacters_fallback( "\x00" ),
140-
'Falsely detected noncharacter in U+0000'
141-
);
142-
143-
for ( $code_point = 1; $code_point <= 0x10FFFF; $code_point++ ) {
144-
// Surrogate halves are invalid UTF-8.
145-
if ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) {
146-
continue;
147-
}
148-
149-
$char = mb_chr( $code_point );
150-
$hex_char = strtoupper( str_pad( dechex( $code_point ), 4, '0', STR_PAD_LEFT ) );
151-
152-
if ( str_contains( $noncharacters, $char ) ) {
153-
$this->assertTrue(
154-
_wp_has_noncharacters_fallback( $char ),
155-
"Failed to detect noncharacter as test verification for U+{$hex_char}"
156-
);
157-
} else {
158-
$this->assertFalse(
159-
_wp_has_noncharacters_fallback( $char ),
160-
"Falsely detected noncharacter in U+{$hex_char}."
161-
);
162-
}
163-
}
164-
}
165-
166110
/**
167111
* Data provider
168112
*

0 commit comments

Comments
 (0)