Skip to content

Commit 22874dc

Browse files
committed
Added support for translation of Unicode classes.
1 parent b5da6f9 commit 22874dc

File tree

3 files changed

+101
-0
lines changed

3 files changed

+101
-0
lines changed

lib/new.cc

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "./wrapped_re2.h"
22
#include "./util.h"
33

4+
#include <map>
45
#include <memory>
56
#include <string>
67
#include <unordered_set>
@@ -18,6 +19,47 @@ inline bool isHexadecimal(char ch)
1819
return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
1920
}
2021

22+
// Maps the long names of Unicode General_Category values, as written inside
// JavaScript `\p{...}` / `\P{...}` escapes, to their short one- or two-letter
// abbreviations. translateRegExp() consults this table and emits the short
// form; names that are not found here are passed through unchanged.
//
// Besides the primary long names, the secondary aliases listed for
// General_Category in Unicode's PropertyValueAliases.txt (and accepted by
// ECMAScript) are included: Combining_Mark, digit, punct and cntrl.
//
// NOTE: deliberately left non-const, because the lookup in translateRegExp()
// uses operator[], which is not available on a const std::map.
static std::map<std::string, std::string> unicodeClasses = {
	// Letters
	{"Uppercase_Letter", "Lu"},
	{"Lowercase_Letter", "Ll"},
	{"Titlecase_Letter", "Lt"},
	{"Cased_Letter", "LC"},
	{"Modifier_Letter", "Lm"},
	{"Other_Letter", "Lo"},
	{"Letter", "L"},
	// Marks
	{"Nonspacing_Mark", "Mn"},
	{"Spacing_Mark", "Mc"},
	{"Enclosing_Mark", "Me"},
	{"Mark", "M"},
	{"Combining_Mark", "M"},
	// Numbers
	{"Decimal_Number", "Nd"},
	{"digit", "Nd"},
	{"Letter_Number", "Nl"},
	{"Other_Number", "No"},
	{"Number", "N"},
	// Punctuation
	{"Connector_Punctuation", "Pc"},
	{"Dash_Punctuation", "Pd"},
	{"Open_Punctuation", "Ps"},
	{"Close_Punctuation", "Pe"},
	{"Initial_Punctuation", "Pi"},
	{"Final_Punctuation", "Pf"},
	{"Other_Punctuation", "Po"},
	{"Punctuation", "P"},
	{"punct", "P"},
	// Symbols
	{"Math_Symbol", "Sm"},
	{"Currency_Symbol", "Sc"},
	{"Modifier_Symbol", "Sk"},
	{"Other_Symbol", "So"},
	{"Symbol", "S"},
	// Separators
	{"Space_Separator", "Zs"},
	{"Line_Separator", "Zl"},
	{"Paragraph_Separator", "Zp"},
	{"Separator", "Z"},
	// Other
	{"Control", "Cc"},
	{"cntrl", "Cc"},
	{"Format", "Cf"},
	{"Surrogate", "Cs"},
	{"Private_Use", "Co"},
	{"Unassigned", "Cn"},
	{"Other", "C"},
};
62+
2163
static bool translateRegExp(const char *data, size_t size, bool multiline, std::vector<char> &buffer)
2264
{
2365
std::string result;
@@ -98,6 +140,36 @@ static bool translateRegExp(const char *data, size_t size, bool multiline, std::
98140
result += "\\u";
99141
i += 2;
100142
continue;
143+
case 'p':
144+
case 'P':
145+
if (i + 2 < size) {
146+
if (data[i + 2] == '{') {
147+
size_t j = i + 3;
148+
while (j < size && data[j] != '}') ++j;
149+
if (j < size) {
150+
result += "\\";
151+
result += data[i + 1];
152+
std::string name(data + i + 3, j - i - 3);
153+
if (unicodeClasses.find(name) != unicodeClasses.end()) {
154+
name = unicodeClasses[name];
155+
}
156+
if (name.size() == 1) {
157+
result += name;
158+
} else {
159+
result += "{";
160+
result += name;
161+
result += "}";
162+
}
163+
i = j + 1;
164+
changed = true;
165+
continue;
166+
}
167+
}
168+
}
169+
result += "\\";
170+
result += data[i + 1];
171+
i += 2;
172+
continue;
101173
default:
102174
result += "\\";
103175
size_t sym_size = getUtf8CharSize(ch);

tests/test_unicode_classes.js

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
'use strict';

var unit = require('heya-unit');
var RE2 = require('../re2');

// Tests for the translation of Unicode property escapes (\p{...} / \P{...}).
//
// The assertion expressions are passed as strings through t.TEST() and then
// eval'ed in the local scope, so the local variable must be named `re2`
// exactly as it is spelled inside those strings.

unit.add(module, [
  // Positive classes: short names are used as-is, long names are translated.
  function test_unicodeClasses(t) {
    'use strict';

    let re2 = new RE2(/\p{L}/u);
    eval(t.TEST("re2.test('a') === true"));
    eval(t.TEST("re2.test('1') === false"));

    re2 = new RE2(/\p{Letter}/u);
    eval(t.TEST("re2.test('a') === true"));
    eval(t.TEST("re2.test('1') === false"));

    re2 = new RE2(/\p{Lu}/u);
    eval(t.TEST("re2.test('A') === true"));
    eval(t.TEST("re2.test('a') === false"));

    re2 = new RE2(/\p{Uppercase_Letter}/u);
    eval(t.TEST("re2.test('A') === true"));
    eval(t.TEST("re2.test('a') === false"));
  },
  // Negated classes: \P{...} goes through the same translation as \p{...}
  // and must match exactly the characters the positive class rejects.
  function test_unicodeClassesNegated(t) {
    'use strict';

    let re2 = new RE2(/\P{L}/u);
    eval(t.TEST("re2.test('1') === true"));
    eval(t.TEST("re2.test('a') === false"));

    re2 = new RE2(/\P{Letter}/u);
    eval(t.TEST("re2.test('1') === true"));
    eval(t.TEST("re2.test('a') === false"));

    re2 = new RE2(/\P{Lu}/u);
    eval(t.TEST("re2.test('a') === true"));
    eval(t.TEST("re2.test('A') === false"));

    re2 = new RE2(/\P{Uppercase_Letter}/u);
    eval(t.TEST("re2.test('a') === true"));
    eval(t.TEST("re2.test('A') === false"));
  }
]);

tests/tests.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ require('./test_symbols');
1717
require('./test_prototype');
1818
require('./test_new');
1919
require('./test_groups');
20+
require('./test_unicode_classes');
2021

2122
unit.run();

0 commit comments

Comments
 (0)