test passed

astorfi · astorfi · commit 76dcedcdb92d · 2024-11-23T05:03:26.000-05:00
diff --git a/.coverage b/.coverage
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,3 @@
 [tool.isort]
 profile = "black"
-line_length = 88
+line_length = 88
diff --git a/src/preprocessing/cleaning.py b/src/preprocessing/cleaning.py
@@ -1,12 +1,15 @@
 import re
-
 import pandas as pd
 
-
 class DataCleaning:
     def __init__(self, df):
         self.df = df
 
+    @staticmethod
+    def clean_text(text):
+        # Lowercase, collapse non-word runs into single spaces, and strip the ends
+        return re.sub(r'\W+', ' ', text.lower()).strip()
+
     def remove_duplicates(self):
         # Remove duplicate rows
         self.df = self.df.drop_duplicates()
@@ -16,14 +19,14 @@ def remove_biases(self):
         biased_phrases = ["offensive term 1", "offensive term 2"]
         self.df = self.df[~self.df['text'].str.contains('|'.join(biased_phrases), case=False)]
 
-    def clean_text(self):
-        # Basic cleaning such as removing unwanted characters
-        self.df['text'] = self.df['text'].apply(lambda x: re.sub(r'\W+', ' ', x.lower()))
+    def clean_all_text(self):
+        # Apply the clean_text method to every row in the 'text' column
+        self.df['text'] = self.df['text'].apply(DataCleaning.clean_text)
 
     def get_cleaned_data(self):
         self.remove_duplicates()
         self.remove_biases()
-        self.clean_text()
+        self.clean_all_text()
         return self.df
 
 if __name__ == "__main__":
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -1,5 +1,8 @@
-from src.preprocessing.cleaning import clean_text
-
+from src.preprocessing.cleaning import DataCleaning
 
 def test_clean_text():
-    assert clean_text("Hello!!!") == "hello"
+    # clean_text lowercases, drops punctuation, collapses whitespace, and trims the ends
+    assert DataCleaning.clean_text("Hello!!!") == "hello"
+    assert DataCleaning.clean_text("Python   is    GREAT!!!") == "python is great"
+    assert DataCleaning.clean_text("12345!!") == "12345"
+    assert DataCleaning.clean_text("   Mixed CASE with @#$ Special!   ") == "mixed case with special"