Skip to content

Commit 76dcedc

Browse files
committed
test passed
1 parent 4221356 commit 76dcedc

File tree

4 files changed

+16
-10
lines changed

4 files changed

+16
-10
lines changed

.coverage

0 Bytes
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
[tool.isort]
22
profile = "black"
3-
line_length = 88
3+
line_length = 88

src/preprocessing/cleaning.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,15 @@
11
import re
2-
32
import pandas as pd
43

5-
64
class DataCleaning:
75
def __init__(self, df):
86
self.df = df
97

8+
@staticmethod
9+
def clean_text(text):
10+
# Basic cleaning such as removing unwanted characters and converting to lowercase
11+
return re.sub(r'\W+', ' ', text.lower()).strip()
12+
1013
def remove_duplicates(self):
1114
# Remove duplicate rows
1215
self.df = self.df.drop_duplicates()
@@ -16,14 +19,14 @@ def remove_biases(self):
1619
biased_phrases = ["offensive term 1", "offensive term 2"]
1720
self.df = self.df[~self.df['text'].str.contains('|'.join(biased_phrases), case=False)]
1821

19-
def clean_text(self):
20-
# Basic cleaning such as removing unwanted characters
21-
self.df['text'] = self.df['text'].apply(lambda x: re.sub(r'\W+', ' ', x.lower()))
22+
def clean_all_text(self):
23+
# Apply the clean_text method to every row in the 'text' column
24+
self.df['text'] = self.df['text'].apply(DataCleaning.clean_text)
2225

2326
def get_cleaned_data(self):
2427
self.remove_duplicates()
2528
self.remove_biases()
26-
self.clean_text()
29+
self.clean_all_text()
2730
return self.df
2831

2932
if __name__ == "__main__":

tests/test_preprocessing.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
1-
from src.preprocessing.cleaning import clean_text
2-
1+
from src.preprocessing.cleaning import DataCleaning
32

43
def test_clean_text():
5-
assert clean_text("Hello!!!") == "hello"
4+
# Testing basic cleaning: punctuation removal and lowercasing
5+
assert DataCleaning.clean_text("Hello!!!") == "hello"
6+
assert DataCleaning.clean_text("Python is GREAT!!!") == "python is great"
7+
assert DataCleaning.clean_text("12345!!") == "12345"
8+
assert DataCleaning.clean_text(" Mixed CASE with @#$ Special! ") == "mixed case with special"

0 commit comments

Comments
 (0)