File tree Expand file tree Collapse file tree 4 files changed +16
-10
lines changed Expand file tree Collapse file tree 4 files changed +16
-10
lines changed Original file line number Diff line number Diff line change 11[tool .isort ]
22profile = " black"
3- line_length = 88
3+ line_length = 88
Original file line number Diff line number Diff line change 11import re
2-
32import pandas as pd
43
5-
64class DataCleaning :
75 def __init__ (self , df ):
86 self .df = df
97
8+ @staticmethod
9+ def clean_text (text ):
10+ # Basic cleaning such as removing unwanted characters and converting to lowercase
11+ return re .sub (r'\W+' , ' ' , text .lower ()).strip ()
12+
1013 def remove_duplicates (self ):
1114 # Remove duplicate rows
1215 self .df = self .df .drop_duplicates ()
@@ -16,14 +19,14 @@ def remove_biases(self):
1619 biased_phrases = ["offensive term 1" , "offensive term 2" ]
1720 self .df = self .df [~ self .df ['text' ].str .contains ('|' .join (biased_phrases ), case = False )]
1821
19- def clean_text (self ):
20- # Basic cleaning such as removing unwanted characters
21- self .df ['text' ] = self .df ['text' ].apply (lambda x : re . sub ( r'\W+' , ' ' , x . lower ()) )
22+ def clean_all_text (self ):
23+ # Apply the clean_text method to every row in the 'text' column
24+ self .df ['text' ] = self .df ['text' ].apply (DataCleaning . clean_text )
2225
2326 def get_cleaned_data (self ):
2427 self .remove_duplicates ()
2528 self .remove_biases ()
26- self .clean_text ()
29+ self .clean_all_text ()
2730 return self .df
2831
2932if __name__ == "__main__" :
Original file line number Diff line number Diff line change 1- from src .preprocessing .cleaning import clean_text
2-
1+ from src .preprocessing .cleaning import DataCleaning
32
43def test_clean_text ():
5- assert clean_text ("Hello!!!" ) == "hello"
4+ # Testing basic cleaning: punctuation removal and lowercasing
5+ assert DataCleaning .clean_text ("Hello!!!" ) == "hello"
6+ assert DataCleaning .clean_text ("Python is GREAT!!!" ) == "python is great"
7+ assert DataCleaning .clean_text ("12345!!" ) == "12345"
8+ assert DataCleaning .clean_text (" Mixed CASE with @#$ Special! " ) == "mixed case with special"
You can’t perform that action at this time.
0 commit comments