Skip to content

Commit 692eb4b

Browse files
committed
Improve documentation
1 parent bffc5a3 commit 692eb4b

File tree

3 files changed

+25
-19
lines changed

3 files changed

+25
-19
lines changed

cleantext/clean.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
try:
2121
from unidecode import unidecode
2222

23-
except:
23+
except ImportError:
2424
from unicodedata import normalize
2525

2626
unidecode = lambda x: normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
@@ -52,8 +52,6 @@ def fix_bad_unicode(text, normalization="NFC"):
5252
if 'NFKC', additional normalizations are applied that can change
5353
the meanings of characters, e.g. ellipsis characters will be replaced
5454
with three periods
55-
Returns:
56-
str
5755
"""
5856
# trying to fix backslash-replaced strings (via https://stackoverflow.com/a/57192592/4028896)
5957
try:
@@ -126,27 +124,37 @@ def _normalize_whitespace(*kwargs):
126124

127125

128126
def replace_urls(text, replace_with="<URL>"):
129-
"""Replace all URLs in ``text`` str with ``replace_with`` str."""
127+
"""
128+
Replace all URLs in ``text`` str with ``replace_with`` str.
129+
"""
130130
return constants.URL_REGEX.sub(replace_with, text)
131131

132132

133133
def replace_emails(text, replace_with="<EMAIL>"):
134-
"""Replace all emails in ``text`` str with ``replace_with`` str."""
134+
"""
135+
Replace all emails in ``text`` str with ``replace_with`` str.
136+
"""
135137
return constants.EMAIL_REGEX.sub(replace_with, text)
136138

137139

138140
def replace_phone_numbers(text, replace_with="<PHONE>"):
139-
"""Replace all phone numbers in ``text`` str with ``replace_with`` str."""
141+
"""
142+
Replace all phone numbers in ``text`` str with ``replace_with`` str.
143+
"""
140144
return constants.PHONE_REGEX.sub(replace_with, text)
141145

142146

143147
def replace_numbers(text, replace_with="<NUMBER>"):
144-
"""Replace all numbers in ``text`` str with ``replace_with`` str."""
148+
"""
149+
Replace all numbers in ``text`` str with ``replace_with`` str.
150+
"""
145151
return constants.NUMBERS_REGEX.sub(replace_with, text)
146152

147153

148154
def replace_digits(text, replace_with="0"):
149-
"""Replace all digits in ``text`` str with ``replace_with`` str, i.e., 123.34 to 000.00"""
155+
"""
156+
Replace all digits in ``text`` str with ``replace_with`` str, e.g., 123.34 to 000.00
157+
"""
150158
return re.sub(r"\d", replace_with, text)
151159

152160

@@ -159,8 +167,6 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
159167
their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
160168
otherwise, pass in a string with which to replace all symbols
161169
(e.g. "*CURRENCY*")
162-
Returns:
163-
str
164170
"""
165171
if replace_with is None:
166172
for k, v in constants.CURRENCIES.items():
@@ -171,6 +177,9 @@ def replace_currency_symbols(text, replace_with="<CUR>"):
171177

172178

173179
def replace_punct(text, replace_with=" "):
180+
"""
181+
Replace punctuation in ``text`` with whitespace (or another token).
182+
"""
174183
return text.translate(
175184
dict.fromkeys(
176185
(i for i in range(sys.maxunicode) if category(chr(i)).startswith("P")),
@@ -181,11 +190,7 @@ def replace_punct(text, replace_with=" "):
181190

182191
def remove_punct(text):
183192
"""
184-
Replace punctuations from ``text`` with whitespaces.
185-
Args:
186-
text (str): raw text
187-
Returns:
188-
str
193+
Remove punctuation from ``text``.
189194
"""
190195
return text.translate(constants.PUNCT_TRANSLATE_UNICODE)
191196

@@ -252,9 +257,6 @@ def clean(
252257
253258
Returns:
254259
str: input ``text`` processed according to function args
255-
Warning:
256-
These changes may negatively affect subsequent NLP analysis performed
257-
on the text, so choose carefully, and preprocess at your own risk!
258260
"""
259261

260262
if text is None:

cleantext/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
2+
Constant symbols and compiled RegExs used for cleaning.
33
"""
44

55
import re

cleantext/specials.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Language-specific edge case handling.
3+
"""
4+
15
import unicodedata
26

37
# add new languages here

0 commit comments

Comments
 (0)