Skip to content

Commit d61a343

Browse files
author
Sajjad Hossain Sagor
committed
Modified Regex Pattern To Encode Emoji Unicodes Better
1 parent fbec019 commit d61a343

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

src/bpe.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ public function __construct()
90 90
// - we are special casing a few common apostrophe constructs ('s, 't, 're, ...) and making those into separate tokens
91 91
// - we then separate out strings into consecutive chunks of 1) letters, 2) numbers, 3) non-letter-numbers, 4) whitespaces
92 92

93-
$this->regex_pattern = "/'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/";
93+
$this->regex_pattern = "/(?:\\\\u[a-f0-9]+)+|\'[stdm]|\'[rv]e|\'ll| ?\p{L}+| ?\p{N}+| ?(?!\\\\u[a-f0-9]+\b)[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/m";
94 94

95 95
$this->cache = [];
96 96
}

0 commit comments

Comments
 (0)