Skip to content

Commit 57f6867

Browse files
committed
Create script to make character regexes
1 parent 9c1e0e5 commit 57f6867

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

advertools/_regex_helpers.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Collect Unicode characters into per-category lists for regex building.

Every code point is scanned once and its official Unicode name is inspected.
A character is appended to a category list (``APOSTROPHE``, ``BRACKET``,
``COLON``, ...) when its name contains the category keyword and is not one of
the explicitly excluded look-alikes (letters, digits, ideographs, currency
signs whose names merely contain the keyword).

Lists are filled in ascending code-point order. Lists that the scan does not
fill are left empty.
"""
import re
import sys
from unicodedata import name

APOSTROPHE = []
BRACKET = []
COLON = []
COMMA = []
CURRENCY = []
CURRENCY_RAW = []
EXCLAMATION_MARK_RAW = []
FULL_STOP = []
HASHTAG = []
HASHTAG_RAW = []
MENTION = []
MENTION_RAW = []
PAREN = []
QUESTION_MARK_RAW = []
QUOTE = []
SENTENCE_END = []
URL = []
URL_RAW = []
WORD_DELIM = []

# Compiled once: letters such as "LATIN SMALL LETTER S WITH COMMA BELOW" are
# not commas.
_LATIN_LETTER = re.compile(r"LATIN (SMALL|CAPITAL) LETTER")

# Characters whose names contain "APOSTROPHE" but which are not apostrophes:
# U+0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE and U+E0027 TAG
# APOSTROPHE. Written as escapes so the single-code-point comparison cannot be
# broken by Unicode normalization of this source file.
_NOT_APOSTROPHE = {"\u0149", "\U000e0027"}

# U+20A1 COLON SIGN is the colon currency sign, not punctuation.
_COLON_CURRENCY_SIGN = "\u20a1"

# chr() only accepts 0..sys.maxunicode, so there is nothing to scan beyond it.
for _code_point in range(sys.maxunicode + 1):
    _char = chr(_code_point)
    # Unassigned code points, controls and surrogates have no name; the
    # default avoids using exceptions as per-character control flow.
    _name = name(_char, "")
    if not _name:
        continue

    if "APOSTROPHE" in _name and _char not in _NOT_APOSTROPHE:
        APOSTROPHE.append(_char)
    if (
        "BRACKET" in _name
        and "IDEOGRAPH" not in _name
        and "TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S" not in _name
    ):
        BRACKET.append(_char)
    if "COLON" in _name and _char != _COLON_CURRENCY_SIGN:
        COLON.append(_char)
    if (
        "COMMA" in _name
        and _LATIN_LETTER.match(_name) is None
        and not _name.startswith("DIGIT")
    ):
        COMMA.append(_char)
    if "EXCLAMATION" in _name:
        EXCLAMATION_MARK_RAW.append(_char)
    # "DIGIT ONE FULL STOP" / "NUMBER TEN FULL STOP" are enumerators, not
    # sentence punctuation.
    if "FULL STOP" in _name and not _name.startswith(("DIGIT", "NUMBER")):
        FULL_STOP.append(_char)
    if "QUOT" in _name and _name != "YI SYLLABLE QUOT":
        QUOTE.append(_char)
    if "CURRENC" in _name:
        CURRENCY.append(_char)
    # Keep "... PARENTHESIS" characters, drop "PARENTHESIZED ..." symbols.
    if "PAREN" in _name and not _name.startswith("PARENTHESIZED"):
        PAREN.append(_char)
    if "QUESTION" in _name and "IDEOGRAPH" not in _name:
        QUESTION_MARK_RAW.append(_char)

0 commit comments

Comments
 (0)