# Mapping of code points that should be stripped from text.  Every key is a
# single character and every value is the empty string, so the table can be
# fed to ``str.translate`` (via ``str.maketrans``) or applied with
# ``str.replace``.
#
# U+000A (Line Feed) is deliberately absent from the C0 control entries.
UNICODE_TO_REMOVE = {
    # C0 control characters
    "\u0000": "",  # Null
    "\u0001": "",  # Start of Heading
    "\u0002": "",  # Start of Text
    "\u0003": "",  # End of Text
    "\u0004": "",  # End of Transmission
    "\u0005": "",  # Enquiry
    "\u0006": "",  # Acknowledge
    "\u0007": "",  # Bell
    "\u0008": "",  # Backspace
    "\u0009": "",  # Horizontal Tab
    "\u000B": "",  # Vertical Tab
    "\u000C": "",  # Form Feed
    "\u000D": "",  # Carriage Return
    "\u000E": "",  # Shift Out
    "\u000F": "",  # Shift In
    "\u0010": "",  # Data Link Escape
    "\u0011": "",  # Device Control 1
    "\u0012": "",  # Device Control 2
    "\u0013": "",  # Device Control 3
    "\u0014": "",  # Device Control 4
    "\u0015": "",  # Negative Acknowledge
    "\u0016": "",  # Synchronous Idle
    "\u0017": "",  # End of Transmission Block
    "\u0018": "",  # Cancel
    "\u0019": "",  # End of Medium
    "\u001A": "",  # Substitute
    "\u001B": "",  # Escape
    "\u001C": "",  # File Separator
    "\u001D": "",  # Group Separator
    "\u001E": "",  # Record Separator
    "\u001F": "",  # Unit Separator
    "\u007F": "",  # Delete
    # C1 control characters
    "\u0080": "",  # Padding Character
    "\u0081": "",  # High Octet Preset
    "\u0082": "",  # Break Permitted Here
    "\u0083": "",  # No Break Here
    "\u0084": "",  # Index
    "\u0085": "",  # Next Line
    "\u0086": "",  # Start of Selected Area
    "\u0087": "",  # End of Selected Area
    "\u0088": "",  # Character Tabulation Set
    "\u0089": "",  # Character Tabulation with Justification
    "\u008A": "",  # Line Tabulation Set
    "\u008B": "",  # Partial Line Forward
    "\u008C": "",  # Partial Line Backward
    "\u008D": "",  # Reverse Line Feed
    "\u008E": "",  # Single-Shift Two
    "\u008F": "",  # Single-Shift Three
    "\u0090": "",  # Device Control String
    "\u0091": "",  # Private Use 1
    "\u0092": "",  # Private Use 2
    "\u0093": "",  # Set Transmit State
    "\u0094": "",  # Cancel Character
    "\u0095": "",  # Message Waiting
    "\u0096": "",  # Start of Guarded Area
    "\u0097": "",  # End of Guarded Area
    "\u0098": "",  # Start of String
    "\u0099": "",  # Single Graphic Character Introducer
    "\u009A": "",  # Single Character Introducer
    "\u009B": "",  # Control Sequence Introducer
    "\u009C": "",  # String Terminator
    "\u009D": "",  # Operating System Command
    "\u009E": "",  # Privacy Message
    "\u009F": "",  # Application Program Command
    # Invisible / formatting characters
    "\u00A0": "",  # No-Break Space
    "\u00AD": "",  # Soft Hyphen
    "\u061C": "",  # Arabic Letter Mark
    "\u115f": "",  # Hangul Choseong Filler
    "\u1160": "",  # Hangul Jungseong Filler
    "\u1680": "",  # Ogham Space Mark
    "\u17B4": "",  # Khmer Vowel Inherent AQ
    "\u17B5": "",  # Khmer Vowel Inherent AA
    "\u180B": "",  # Mongolian Free Variation Selector One
    "\u180C": "",  # Mongolian Free Variation Selector Two
    "\u180D": "",  # Mongolian Free Variation Selector Three
    "\u180E": "",  # Mongolian Vowel Separator
    "\u2000": "",  # En Quad
    "\u2001": "",  # Em Quad
    "\u2002": "",  # En Space
    "\u2003": "",  # Em Space
    "\u2004": "",  # Three-Per-Em Space
    "\u2005": "",  # Four-Per-Em Space
    "\u2006": "",  # Six-Per-Em Space
    "\u2007": "",  # Figure Space
    "\u2008": "",  # Punctuation Space
    "\u2009": "",  # Thin Space
    "\u200A": "",  # Hair Space
    "\u200B": "",  # Zero Width Space
    "\u200C": "",  # Zero Width Non-Joiner
    "\u200D": "",  # Zero Width Joiner
    "\u200E": "",  # Left-to-Right Mark
    "\u200F": "",  # Right-to-Left Mark
    "\u202A": "",  # Left-to-Right Embedding
    "\u202B": "",  # Right-to-Left Embedding
    "\u202C": "",  # Pop Directional Formatting
    "\u202D": "",  # Left-to-Right Override
    "\u202E": "",  # Right-to-Left Override
    "\u202F": "",  # Narrow No-Break Space
    "\u2060": "",  # Word Joiner
    "\u2061": "",  # Function Application
    "\u2062": "",  # Invisible Times
    "\u2063": "",  # Invisible Separator
    "\u2064": "",  # Invisible Plus
    "\u2066": "",  # Left-to-Right Isolate
    "\u2067": "",  # Right-to-Left Isolate
    "\u2068": "",  # First Strong Isolate
    "\u2069": "",  # Pop Directional Isolate
    "\u206A": "",  # Inhibit Symmetric Swapping
    "\u206B": "",  # Activate Symmetric Swapping
    "\u206C": "",  # Inhibit Arabic Form Shaping
    "\u206D": "",  # Activate Arabic Form Shaping
    "\u206E": "",  # National Digit Shapes
    "\u206F": "",  # Nominal Digit Shapes
    "\u3164": "",  # Hangul Filler
    "\uFEFF": "",  # Zero Width No-Break Space (Byte Order Mark)
    "\uFFA0": "",  # Halfwidth Hangul Filler
    "\uFFFC": "",  # Object Replacement Character
    "\uFFFE": "",  # Noncharacter (byte-swapped Byte Order Mark)
    "\uFFFF": "",  # Noncharacter
    # NOTE(review): the Unicode charts appear to place EGYPTIAN HIEROGLYPH
    # Z015B at U+133FC, not U+1307B -- confirm which code point is intended.
    "\U0001307B": "",  # Egyptian Hieroglyph Z015B
    "\U0001BCA0": "",  # Shorthand Format Letter Overlap
}
# The original single range(0xE0000, 0xF8FF + 1) was empty (start > stop), so
# neither the tag characters nor the BMP private-use characters were ever
# added.  They are two disjoint blocks and are listed separately here.
UNICODE_TO_REMOVE.update({chr(cp): "" for cp in range(0xE000, 0xF8FF + 1)})  # BMP Private Use Area
UNICODE_TO_REMOVE.update({chr(cp): "" for cp in range(0xE0000, 0xE007F + 1)})  # Tags block
UNICODE_TO_REMOVE.update({chr(cp): "" for cp in range(0xF0000, 0xFFFFF + 1)})  # Plane 15 (Supplementary PUA-A)
UNICODE_TO_REMOVE.update({chr(cp): "" for cp in range(0x100000, 0x10FFFF + 1)})  # Plane 16 (Supplementary PUA-B)
UNICODE_TO_REMOVE.update({chr(cp): "" for cp in range(0x1D100, 0x1D1FF + 1)})  # Musical symbols
# Mapping of whitespace-like code points to the replacement text that should
# be substituted for them.  Keys are single characters; values are the
# normalised replacement (which may be the character itself when it is meant
# to be kept unchanged).
#
# NOTE(review): U+0009, U+2000-U+200A and U+200D are also keys of
# UNICODE_TO_REMOVE in this file; which table takes precedence depends on the
# consuming code, which is not visible here -- confirm the intended order.
TO_REPLACE = {
    "\u0020": " ",  # Space
    "\u0009": "\t",  # Horizontal Tab
    "\u000A": "\n",  # Line Feed
    "\u034F": "\u034F",  # Combining Grapheme Joiner (kept as-is)
    "\u2028": "\n",  # Line Separator
    "\u2029": "\n\n",  # Paragraph Separator
    "\u2000": " ",  # En Quad
    "\u2001": " ",  # Em Quad
    "\u2002": " ",  # En Space
    "\u2003": " ",  # Em Space
    "\u2004": " ",  # Three-Per-Em Space
    "\u2005": " ",  # Four-Per-Em Space
    "\u2006": " ",  # Six-Per-Em Space
    "\u2007": " ",  # Figure Space
    "\u2008": " ",  # Punctuation Space
    "\u2009": " ",  # Thin Space
    "\u200A": " ",  # Hair Space
    "\u205F": " ",  # Medium Mathematical Space
    "\u3000": " ",  # Ideographic Space
    "\u2800": "\u2800",  # Braille Pattern Blank (kept as-is)
    "\u200D": "\u200D",  # Zero Width Joiner (kept as-is)
}
# Disabled identity mappings, kept for reference.  The supplementary variation
# selectors live at U+E0100-U+E01EF; the previously written 0x1E0100-0x1E01EF
# is above U+10FFFF and would make chr() raise ValueError if re-enabled.
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x02B0, 0x02FF + 1)})  # Modifier letters
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x0300, 0x036F + 1)})  # Combining Diacritical Marks
# TO_REPLACE.update({chr(char): chr(char) for char in range(0xFE00, 0xFE0F + 1)})  # Variation Selectors
# TO_REPLACE.update({chr(char): chr(char) for char in range(0xE0100, 0xE01EF + 1)})  # Variation Selectors Supplement
if __name__ == '__main__':
    # Manual smoke test: build a sample string containing one of each code
    # point handled by this module, then print its length and the string
    # itself so the result can be inspected by eye.
    text = (
        "string"
        # C0 controls (without Line Feed) and Delete
        "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000B\u000C\u000D\u000E\u000F"
        "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F"
        "\u007F"
        # C1 controls
        "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F"
        "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F"
        # Invisible / formatting characters
        "\u00A0\u00AD\u061C\u115f\u1160\u1680\u17B4\u17B5\u180B\u180C\u180D\u180E"
        "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D\u200E\u200F"
        "\u202A\u202B\u202C\u202D\u202E\u202F"
        "\u2060\u2061\u2062\u2063\u2064\u2066\u2067\u2068\u2069\u206A\u206B\u206C\u206D\u206E\u206F"
        "\u3164\uFEFF\uFFA0\uFFFC\uFFFE\uFFFF\U0001307B\U0001BCA0"
    )
    # range(0xE0000, 0xF8FF + 1) was empty (start > stop); the tag and BMP
    # private-use blocks are disjoint and are therefore added separately.
    text += "".join(chr(cp) for cp in range(0xE000, 0xF8FF + 1))  # BMP Private Use Area
    text += "".join(chr(cp) for cp in range(0xE0000, 0xE007F + 1))  # Tags block
    text += "".join(chr(cp) for cp in range(0xF0000, 0xFFFFF + 1))  # Plane 15 (Supplementary PUA-A)
    text += "".join(chr(cp) for cp in range(0x100000, 0x10FFFF + 1))  # Plane 16 (Supplementary PUA-B)
    text += "".join(chr(cp) for cp in range(0x1D100, 0x1D1FF + 1))  # Musical symbols
    # Whitespace-like characters that are replaced rather than removed
    text += (
        "\u0020\u0009\u000A\u034F\u2028\u2029"
        "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A"
        "\u205F\u3000\u2800\u200D"
    )
    print(len(text))
    print(text)