jfilter
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 18 additions & 15 deletions
diff --git a/cleantext/clean.py b/cleantext/clean.py
Lines changed: 14 additions & 17 deletions
diff --git a/cleantext/constants.py b/cleantext/constants.py
Lines changed: 7 additions & 5 deletions
diff --git a/cleantext/sklearn.py b/cleantext/sklearn.py
Lines changed: 2 additions & 2 deletions
diff --git a/cleantext/specials.py b/cleantext/specials.py
Lines changed: 1 addition & 4 deletions
@@ -1,15 +1,24 @@
 name: Test
 
-on: [push, workflow_dispatch]
+on: [push, pull_request, workflow_dispatch]
 
 jobs:
-    build:
+    lint:
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-python@v5
+              with:
+                  python-version: "3.12"
+            - run: pip install ruff
+            - run: ruff check cleantext/ tests/
+            - run: ruff format --check cleantext/ tests/
+
+    test:
         runs-on: ubuntu-latest
         strategy:
             matrix:
-                python-version: [3.7, 3.8, 3.9]
-                # Python 3.10 is not working due to a bug with numpy
-                # python-version: [3.7, 3.8, 3.9, "3.10"]
+                python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
                 extras:
                     [
                         "",
@@ -19,17 +28,11 @@ jobs:
                     ]
 
         steps:
-            - uses: actions/checkout@v2
-            - uses: actions/setup-python@v1
+            - uses: actions/checkout@v4
+            - uses: actions/setup-python@v5
               with:
                   python-version: ${{ matrix.python-version }}
-            - uses: Gr1N/setup-poetry@v7
-            - uses: actions/cache@v2
-              with:
-                  path: ~/.cache/pypoetry/virtualenvs
-                  key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.extras }}-poetry-${{ hashFiles('poetry.lock') }}
-
-            - name: Install depedencies with all extras combinations
-              run: poetry install --no-dev ${{ matrix.extras }} && poetry run pip install pytest
+            - run: pip install poetry
+            - run: poetry install --only main ${{ matrix.extras }} && poetry run pip install pytest
 
             - run: poetry run pytest
@@ -30,9 +30,12 @@
 except ImportError:
     from unicodedata import normalize
 
-    unidecode = lambda x: normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
+    def unidecode(x):
+        return normalize("NFD", x).encode("ASCII", "ignore").decode("utf-8")
+
     log.warning(
-        "Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results."
+        "Since the GPL-licensed package `unidecode` is not installed, "
+        "using Python's `unicodedata` package which yields worse results."
     )
 
 
@@ -63,7 +66,7 @@ def fix_bad_unicode(text, normalization="NFC"):
     # trying to fix backslash-replaced strings (via https://stackoverflow.com/a/57192592/4028896)
     try:
         text = text.encode("latin", "backslashreplace").decode("unicode-escape")
-    except:
+    except Exception:
         pass
 
     return fix_text(text, normalization=normalization)
@@ -100,9 +103,7 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
     return text
 
 
-def normalize_whitespace(
-    text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False
-):
+def normalize_whitespace(text, no_line_breaks=False, strip_lines=True, keep_two_line_breaks=False):
     """
     Given ``text`` str, replace one or more spacings with a single space, and one
     or more line breaks with a single newline. Also strip leading/trailing whitespace.
@@ -114,13 +115,9 @@ def normalize_whitespace(
         text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
     else:
         if keep_two_line_breaks:
-            text = constants.NONBREAKING_SPACE_REGEX.sub(
-                " ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
-            )
+            text = constants.NONBREAKING_SPACE_REGEX.sub(" ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text))
         else:
-            text = constants.NONBREAKING_SPACE_REGEX.sub(
-                " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
-            )
+            text = constants.NONBREAKING_SPACE_REGEX.sub(" ", constants.LINEBREAK_REGEX.sub(r"\n", text))
 
     return text.strip()
 
@@ -235,7 +232,8 @@ def clean(
     lang="en",
 ):
     """
-    Normalize various aspects of a raw text. A convenience function for applying all other preprocessing functions in one go.
+    Normalize various aspects of a raw text. A convenience function for applying all other
+    preprocessing functions in one go.
     Args:
         text (str): raw text to preprocess
         fix_unicode (bool): if True, fix "broken" unicode such as
@@ -262,7 +260,8 @@ def clean(
         replace_with_digit (str): special DIGIT token, default "0",
         replace_with_currency_symbol (str): special CURRENCY token, default "<CUR>",
         replace_with_punct (str): replace punctuations with this token, default "",
-        lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
+        lang (str): special language-depended preprocessing.
+            Besides the default English ('en'), only German ('de') is supported
 
     Returns:
         str: input ``text`` processed according to function args
@@ -302,8 +301,6 @@ def clean(
         text = text.lower()
 
     if normalize_whitespace:
-        text = _normalize_whitespace(
-            text, no_line_breaks, strip_lines, keep_two_line_breaks
-        )
+        text = _normalize_whitespace(text, no_line_breaks, strip_lines, keep_two_line_breaks)
 
     return text
@@ -23,9 +23,7 @@
     "₴": "UAH",
     "₹": "INR",
 }
-CURRENCY_REGEX = re.compile(
-    "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
-)
+CURRENCY_REGEX = re.compile("({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys())))
 
 PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
     (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")),
@@ -65,7 +63,8 @@
     # r"(?:(?:https?|ftp)://)"  <-- alt?
     r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
     # user:pass authentication
-    r"(?:\S+(?::\S*)?@)?" r"(?:"
+    r"(?:\S+(?::\S*)?@)?"
+    r"(?:"
     # IP address exclusion
     # private & local networks
     r"(?!(?:10|127)(?:\.\d{1,3}){3})"
@@ -85,7 +84,10 @@
     # domain name
     r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
     # TLD identifier
-    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
+    r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))"
+    r"|"
+    r"(?:(localhost))"
+    r")"
     # port number
     r"(?::\d{2,5})?"
     # resource path
 
@@ -2,7 +2,7 @@
 Pipeline transformer for scikit-learn to clean text
 """
 
-from typing import Any, List, Union
+from typing import Any, Union
 
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -74,7 +74,7 @@ def fit(self, X: Any):
         """
         return self
 
-    def transform(self, X: Union[List[str], pd.Series]) -> Union[List[str], pd.Series]:
+    def transform(self, X: Union[list[str], pd.Series]) -> Union[list[str], pd.Series]:
         """
         Normalize various aspects of each item in raw text array-like.
         Args:
 
@@ -28,10 +28,7 @@ def save_replace(text, lang, back=False):
     possibilities = (
         specials[lang]["case_sensitive"]
         + [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
-        + [
-            [norm(x[0].upper()), x[1].upper()]
-            for x in specials[lang]["case_insensitive"]
-        ]
+        + [[norm(x[0].upper()), x[1].upper()] for x in specials[lang]["case_insensitive"]]
     )
     for pattern, target in possibilities:
         if back: