@@ -159,29 +159,27 @@ def _fix_german_umlauts(self, text: str) -> str:
159159
160160 # Replace '"a' with 'ä' and '"A' with 'Ä'
161161 text = re .sub (r'"a' , 'ä' , text )
162+ text = re .sub (r'¨a' , 'ä' , text )
163+ text = re .sub (r'“a' , 'ä' , text )
162164 text = re .sub (r'"A' , 'Ä' , text )
165+ text = re .sub (r'¨A' , 'Ä' , text )
166+ text = re .sub (r'“A' , 'Ä' , text )
163167
164168 # Replace '"o' with 'ö' and '"O' with 'Ö'
165169 text = re .sub (r'"o' , 'ö' , text )
170+ text = re .sub (r'¨o' , 'ö' , text )
171+ text = re .sub (r'“o' , 'ö' , text )
166172 text = re .sub (r'"O' , 'Ö' , text )
173+ text = re .sub (r'¨O' , 'Ö' , text )
174+ text = re .sub (r'“O' , 'Ö' , text )
167175
168176 # Replace '"u' with 'ü' and '"U' with 'Ü'
169177 text = re .sub (r'"u' , 'ü' , text )
170- text = re .sub (r'"U' , 'Ü' , text )
171-
172- # Replace '¨a' with 'ä' and '¨A' with 'Ä'
173- text = re .sub (r'¨a' , 'ä' , text )
174- text = re .sub (r'¨A' , 'Ä' , text )
175-
176- # Replace '¨o' with 'ö' and '¨O' with 'Ö'
177- text = re .sub (r'¨o' , 'ö' , text )
178- text = re .sub (r'¨O' , 'Ö' , text )
179-
180- # Replace '¨u' with 'ü' and '¨U' with 'Ü'
181178 text = re .sub (r'¨u' , 'ü' , text )
179+ text = re .sub (r'“u' , 'ü' , text )
180+ text = re .sub (r'"U' , 'Ü' , text )
182181 text = re .sub (r'¨U' , 'Ü' , text )
183-
184-
182+ text = re .sub (r'“U' , 'Ü' , text )
185183
186184 return text
187185
@@ -235,7 +233,7 @@ def _clean_extracted_text(self, text: str) -> str:
235233
236234 # Step 6: Fix common spacing issues
237235 text = re .sub (r'\s+([.,;:!?])' , r'\1' , text ) # Remove space before punctuation
238-
236+
239237 # Protect URLs and email addresses to avoid inserting spaces within them
240238 protected = []
241239 def _mask (match ):
@@ -388,6 +386,7 @@ def extract_bibliography_section(self, pdf_path: str) -> Optional[str]:
388386 for page_num in range (len (doc )):
389387 page = doc [page_num ]
390388 page_text = page .get_text ()
389+ page_text = self ._clean_extracted_text (page_text )
391390
392391 # Look for bibliography section headers
393392 bib_headers = [
0 commit comments