20
20
try:
    from unidecode import unidecode
except ImportError:
    # Only a missing optional dependency should trigger the fallback; a bare
    # ``except`` would also hide unrelated failures raised during the import.
    from unicodedata import normalize

    def unidecode(x):
        """
        Standard-library stand-in used when ``unidecode`` is not installed.

        Decomposes ``x`` with NFD normalization so accented letters split into
        a base letter plus combining marks, then drops every non-ASCII code
        point. Characters with no ASCII base letter are removed, not
        transliterated.

        Args:
            x (str): text to convert

        Returns:
            str: ASCII-only version of ``x``
        """
        return normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
@@ -52,8 +52,6 @@ def fix_bad_unicode(text, normalization="NFC"):
52
52
if 'NFKC', additional normalizations are applied that can change
53
53
the meanings of characters, e.g. ellipsis characters will be replaced
54
54
with three periods
55
- Returns:
56
- str
57
55
"""
58
56
# trying to fix backslash-replaced strings (via https://stackoverflow.com/a/57192592/4028896)
59
57
try :
@@ -126,27 +124,37 @@ def _normalize_whitespace(*kwargs):
126
124
127
125
128
126
def replace_urls(text, replace_with="<URL>"):
    """
    Replace all URLs in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to search
        replace_with (str): substitute for every match of
            ``constants.URL_REGEX``

    Returns:
        str: ``text`` with each match substituted
    """
    return constants.URL_REGEX.sub(replace_with, text)
131
131
132
132
133
133
def replace_emails(text, replace_with="<EMAIL>"):
    """
    Replace all emails in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to search
        replace_with (str): substitute for every match of
            ``constants.EMAIL_REGEX``

    Returns:
        str: ``text`` with each match substituted
    """
    return constants.EMAIL_REGEX.sub(replace_with, text)
136
138
137
139
138
140
def replace_phone_numbers(text, replace_with="<PHONE>"):
    """
    Replace all phone numbers in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to search
        replace_with (str): substitute for every match of
            ``constants.PHONE_REGEX``

    Returns:
        str: ``text`` with each match substituted
    """
    return constants.PHONE_REGEX.sub(replace_with, text)
141
145
142
146
143
147
def replace_numbers(text, replace_with="<NUMBER>"):
    """
    Replace all numbers in ``text`` str with ``replace_with`` str.

    Args:
        text (str): raw text to search
        replace_with (str): substitute for every match of
            ``constants.NUMBERS_REGEX``

    Returns:
        str: ``text`` with each match substituted
    """
    return constants.NUMBERS_REGEX.sub(replace_with, text)
146
152
147
153
148
154
def replace_digits(text, replace_with="0"):
    """
    Replace all digits in ``text`` str with ``replace_with`` str,
    e.g., 123.34 to 000.00

    Each digit is substituted individually, so separators such as ``.`` or
    ``,`` and the overall length of a number are preserved when
    ``replace_with`` is a single character.

    Args:
        text (str): raw text to search
        replace_with (str): substitute for every single digit

    Returns:
        str: ``text`` with each digit substituted
    """
    return re.sub(r"\d", replace_with, text)
151
159
152
160
@@ -159,8 +167,6 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
159
167
their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
160
168
otherwise, pass in a string with which to replace all symbols
161
169
(e.g. "*CURRENCY*")
162
- Returns:
163
- str
164
170
"""
165
171
if replace_with is None :
166
172
for k , v in constants .CURRENCIES .items ():
@@ -171,6 +177,9 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
171
177
172
178
173
179
def replace_punct (text , replace_with = " " ):
180
+ """
181
+ Replace punctuations from ``text`` with whitespaces (or other tokens).
182
+ """
174
183
return text .translate (
175
184
dict .fromkeys (
176
185
(i for i in range (sys .maxunicode ) if category (chr (i )).startswith ("P" )),
@@ -181,11 +190,7 @@ def replace_punct(text, replace_with=" "):
181
190
182
191
def remove_punct(text):
    """
    Remove punctuations from ``text``.

    The work is done by ``str.translate`` with the table
    ``constants.PUNCT_TRANSLATE_UNICODE``.
    NOTE(review): the table is defined outside this view; presumably it maps
    every punctuation code point to ``None`` so that it is deleted -- confirm.

    Args:
        text (str): raw text

    Returns:
        str: ``text`` translated through the punctuation table
    """
    return text.translate(constants.PUNCT_TRANSLATE_UNICODE)
191
196
@@ -252,9 +257,6 @@ def clean(
252
257
253
258
Returns:
254
259
str: input ``text`` processed according to function args
255
- Warning:
256
- These changes may negatively affect subsequent NLP analysis performed
257
- on the text, so choose carefully, and preprocess at your own risk!
258
260
"""
259
261
260
262
if text is None :
0 commit comments