Skip to content

Commit e193920

Browse files
committed
Modernize project tooling and Python version support
Replace black + pylint with ruff for linting and formatting. Update Python requirement to >=3.9 and classifiers to 3.9-3.13. Migrate to poetry-core build backend and modern dev dependency groups. Update CI actions to v4/v5 with ruff lint step and Python 3.9-3.13 matrix. Fix bare except, lambda assignment, deprecated typing imports, and duplicate test name.
1 parent 7a0b4f5 commit e193920

File tree

9 files changed: 148 additions and 1,230 deletions.

.github/workflows/test.yml

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,24 @@
11
name: Test
22

3-
on: [push, workflow_dispatch]
3+
on: [push, pull_request, workflow_dispatch]
44

55
jobs:
6-
build:
6+
lint:
7+
runs-on: ubuntu-latest
8+
steps:
9+
- uses: actions/checkout@v4
10+
- uses: actions/setup-python@v5
11+
with:
12+
python-version: "3.12"
13+
- run: pip install ruff
14+
- run: ruff check cleantext/ tests/
15+
- run: ruff format --check cleantext/ tests/
16+
17+
test:
718
runs-on: ubuntu-latest
819
strategy:
920
matrix:
10-
python-version: [3.7, 3.8, 3.9]
11-
# Python 3.10 is not working due to a bug with numpy
12-
# python-version: [3.7, 3.8, 3.9, "3.10"]
21+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
1322
extras:
1423
[
1524
"",
@@ -19,17 +28,11 @@ jobs:
1928
]
2029

2130
steps:
22-
- uses: actions/checkout@v2
23-
- uses: actions/setup-python@v1
31+
- uses: actions/checkout@v4
32+
- uses: actions/setup-python@v5
2433
with:
2534
python-version: ${{ matrix.python-version }}
26-
- uses: Gr1N/setup-poetry@v7
27-
- uses: actions/cache@v2
28-
with:
29-
path: ~/.cache/pypoetry/virtualenvs
30-
key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.extras }}-poetry-${{ hashFiles('poetry.lock') }}
31-
32-
- name: Install depedencies with all extras combinations
33-
run: poetry install --no-dev ${{ matrix.extras }} && poetry run pip install pytest
35+
- run: pip install poetry
36+
- run: poetry install --only main ${{ matrix.extras }} && poetry run pip install pytest
3437

3538
- run: poetry run pytest

cleantext/clean.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,12 @@
3030
except ImportError:
3131
from unicodedata import normalize
3232

33-
unidecode = lambda x: normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
33+
def unidecode(x):
34+
return normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
35+
3436
log.warning(
35-
"Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results."
37+
"Since the GPL-licensed package `unidecode` is not installed, "
38+
"using Python's `unicodedata` package which yields worse results."
3639
)
3740

3841

@@ -63,7 +66,7 @@ def fix_bad_unicode(text, normalization="NFC"):
6366
# trying to fix backslash-replaced strings (via https://stackoverflow.com/a/57192592/4028896)
6467
try:
6568
text = text.encode("latin", "backslashreplace").decode("unicode-escape")
66-
except:
69+
except Exception:
6770
pass
6871

6972
return fix_text(text, normalization=normalization)
@@ -100,9 +103,7 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
100103
return text
101104

102105

103-
def normalize_whitespace(
104-
text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False
105-
):
106+
def normalize_whitespace(text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False):
106107
"""
107108
Given ``text`` str, replace one or more spacings with a single space, and one
108109
or more line breaks with a single newline. Also strip leading/trailing whitespace.
@@ -114,13 +115,9 @@ def normalize_whitespace(
114115
text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
115116
else:
116117
if keep_two_line_breaks:
117-
text = constants.NONBREAKING_SPACE_REGEX.sub(
118-
" ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
119-
)
118+
text = constants.NONBREAKING_SPACE_REGEX.sub(" ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text))
120119
else:
121-
text = constants.NONBREAKING_SPACE_REGEX.sub(
122-
" ", constants.LINEBREAK_REGEX.sub(r"\n", text)
123-
)
120+
text = constants.NONBREAKING_SPACE_REGEX.sub(" ", constants.LINEBREAK_REGEX.sub(r"\n", text))
124121

125122
return text.strip()
126123

@@ -235,7 +232,8 @@ def clean(
235232
lang="en",
236233
):
237234
"""
238-
Normalize various aspects of a raw text. A convenience function for applying all other preprocessing functions in one go.
235+
Normalize various aspects of a raw text. A convenience function for applying all other
236+
preprocessing functions in one go.
239237
Args:
240238
text (str): raw text to preprocess
241239
fix_unicode (bool): if True, fix "broken" unicode such as
@@ -262,7 +260,8 @@ def clean(
262260
replace_with_digit (str): special DIGIT token, default "0",
263261
replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
264262
replace_with_punct (str): replace punctuations with this token, default "",
265-
lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
263+
lang (str): special language-depended preprocessing.
264+
Besides the default English ('en'), only German ('de') is supported
266265
267266
Returns:
268267
str: input ``text`` processed according to function args
@@ -302,8 +301,6 @@ def clean(
302301
text = text.lower()
303302

304303
if normalize_whitespace:
305-
text = _normalize_whitespace(
306-
text, no_line_breaks, strip_lines, keep_two_line_breaks
307-
)
304+
text = _normalize_whitespace(text, no_line_breaks, strip_lines, keep_two_line_breaks)
308305

309306
return text

cleantext/constants.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@
2323
"₴": "UAH",
2424
"₹": "INR",
2525
}
26-
CURRENCY_REGEX = re.compile(
27-
"({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
28-
)
26+
CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys())))
2927

3028
PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
3129
(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")),
@@ -65,7 +63,8 @@
6563
# r"(?:(?:https?|ftp)://)" <-- alt?
6664
r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
6765
# user:pass authentication
68-
r"(?:\S+(?::\S*)?@)?" r"(?:"
66+
r"(?:\S+(?::\S*)?@)?"
67+
r"(?:"
6968
# IP address exclusion
7069
# private & local networks
7170
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
@@ -85,7 +84,10 @@
8584
# domain name
8685
r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
8786
# TLD identifier
88-
r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
87+
r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))"
88+
r"|"
89+
r"(?:(localhost))"
90+
r")"
8991
# port number
9092
r"(?::\d{2,5})?"
9193
# resource path

cleantext/sklearn.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Pipeline transformer for scikit-learn to clean text
33
"""
44

5-
from typing import Any, List, Union
5+
from typing import Any, Union
66

77
import pandas as pd
88
from sklearn.base import BaseEstimator, TransformerMixin
@@ -74,7 +74,7 @@ def fit(self, X: Any):
7474
"""
7575
return self
7676

77-
def transform(self, X: Union[List[str], pd.Series]) -> Union[List[str], pd.Series]:
77+
def transform(self, X: Union[list[str], pd.Series]) -> Union[list[str], pd.Series]:
7878
"""
7979
Normalize various aspects of each item in raw text array-like.
8080
Args:

cleantext/specials.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,7 @@ def save_replace(text, lang, back=False):
2828
possibilities = (
2929
specials[lang]["case_sensitive"]
3030
+ [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
31-
+ [
32-
[norm(x[0].upper()), x[1].upper()]
33-
for x in specials[lang]["case_insensitive"]
34-
]
31+
+ [[norm(x[0].upper()), x[1].upper()] for x in specials[lang]["case_insensitive"]]
3532
)
3633
for pattern, target in possibilities:
3734
if back:

0 commit comments

Comments
 (0)