@@ -60,11 +60,8 @@ def read_input_file(self):
def read_txt_file(self):
    """Read the UTF-8 text file at ``self.input_filename`` and return its cleaned lines.

    The raw file content is passed through ``self.clean_text`` and the
    cleaned text is then split into lines.

    Returns:
        list[str]: The lines of the cleaned text.
    """
    # newline="" disables newline translation, so clean_text() receives exactly
    # the characters the legacy codecs.open() call used to deliver.
    with open(self.input_filename, "r", encoding="utf-8", newline="") as infile:
        content = infile.read()

    # The count reflects the raw file, before any cleaning is applied.
    self.logger.debug("Read %d lines from %s", len(content.splitlines()), self.input_filename)
    return self.clean_text(content).splitlines()
6966
7067 def read_doc_file (self ):
@@ -78,16 +75,29 @@ def read_rtf_file(self):
7875 return self .clean_text (plain_text ).splitlines ()
7976
def clean_text(self, text):
    """Normalise raw text: drop non-printable characters, collapse blank lines, strip lines.

    Steps, in order:
      1. Remove every character for which ``str.isprintable()`` is false,
         except newlines and U+2005 (four-per-em space).
      2. Replace each run of two or more newlines with a single newline.
      3. Strip leading/trailing whitespace from every line.

    A debug message is logged for each step that actually changed something.

    Args:
        text: The text to clean.

    Returns:
        str: The cleaned text, with lines joined by ``"\\n"``.
    """
    # Remove any non-printable characters except newlines and U+2005
    kept = "".join(ch for ch in text if ch.isprintable() or ch in ("\n", "\u2005"))
    removed = len(text) - len(kept)
    if removed:
        self.logger.debug("Removed %d non-printable characters", removed)

    # Replace multiple newlines with a single newline
    collapsed = re.sub(r"\n{2,}", "\n", kept)
    extra_newlines = kept.count("\n") - collapsed.count("\n")
    if extra_newlines:
        self.logger.debug("Consolidated %d extra newlines", extra_newlines)

    # Remove leading/trailing whitespace from each line
    raw_lines = collapsed.splitlines()
    stripped_lines = [line.strip() for line in raw_lines]

    # Compare each line with its stripped form directly. Re-splitting the joined
    # text would drop trailing lines that became empty and under-count changes.
    changed_lines = sum(1 for before, after in zip(raw_lines, stripped_lines) if before != after)
    if changed_lines:
        self.logger.debug("Stripped whitespace from %d lines", changed_lines)

    return "\n".join(stripped_lines)
92102
93103 def find_best_split_point (self , line ):
@@ -147,8 +157,6 @@ def replace_non_printable_spaces(self, text):
147157 Replace non-printable space-like characters, tabs, and other whitespace with regular spaces,
148158 excluding newline characters.
149159 """
150- self .logger .debug (f"Replacing non-printable spaces in: { repr (text )} " )
151-
152160 # Log each character and its Unicode code point
153161 # for i, char in enumerate(text):
154162 # self.logger.debug(f"Character at position {i}: {repr(char)} (Unicode: U+{ord(char):04X})")
@@ -159,35 +167,29 @@ def replace_non_printable_spaces(self, text):
159167 # Replace matched characters with a regular space
160168 cleaned_text = re .sub (space_pattern , " " , text )
161169
162- # Log the result of the replacement
163- self .logger .debug (f"Text after replacing non-printable spaces: { repr (cleaned_text )} " )
164-
165170 # Remove leading/trailing spaces and collapse multiple spaces into one, preserving newlines
166171 final_text = re .sub (r" +" , " " , cleaned_text ).strip ()
167172
168- # Log the final result
169- self .logger .debug (f"Final text after cleaning: { repr (final_text )} " )
170-
171173 return final_text
172174
def clean_punctuation_spacing(self, text):
    """
    Remove unnecessary spaces before punctuation marks.

    Whitespace directly preceding a comma, period, exclamation mark, question
    mark, colon, or semicolon is removed. Newlines are never removed, so a line
    that starts with punctuation (e.g. "...") stays on its own line instead of
    being merged into the previous one.

    Args:
        text: The text to clean; may span multiple lines.

    Returns:
        str: The text with whitespace before punctuation removed.
    """
    self.logger.debug("Cleaning punctuation spacing")
    # [^\S\n] matches any whitespace character except the newline.
    return re.sub(r"[^\S\n]+([,.!?:;])", r"\1", text)
182184
def fix_commas_inside_quotes(self, text):
    """
    Move commas inside quotes to after the closing quote.

    Each double-quoted span on a single line is examined as a whole. If its
    content ends with a comma (optionally followed by whitespace), the comma is
    moved to just after the closing quote; any trailing whitespace stays inside
    the quotes. Text between two quoted spans is never treated as quoted, so
    ``"a" b, "c"`` is left unchanged.

    Args:
        text: The text to fix.

    Returns:
        str: The text with trailing in-quote commas moved outside the quotes.
    """
    self.logger.debug("Fixing commas inside quotes")

    def move_comma(match):
        inner = match.group(1)
        body = inner.rstrip()
        if not body.endswith(","):
            # Nothing to move: keep the quoted span exactly as it was.
            return match.group(0)
        trailing_ws = inner[len(body):]
        return f'"{body[:-1]}{trailing_ws}",'

    # Consuming complete quote pairs left to right keeps a closing quote from
    # being mistaken for the opening quote of the next span.
    return re.sub(r'"([^"\n]*)"', move_comma, text)
192194
193195 def process_line (self , line ):
@@ -307,7 +309,7 @@ def process(self):
307309 processed_lyrics_text = self .clean_punctuation_spacing (processed_lyrics_text )
308310
309311 self .processed_lyrics_text = processed_lyrics_text
310-
312+
311313 # Try to copy to clipboard, but don't fail if it's not available
312314 try :
313315 pyperclip .copy (processed_lyrics_text )
0 commit comments