@@ -101,7 +101,7 @@ def _find_best_substring_match(
101101 expected_len = len (expected_heading )
102102 best_score = 0.0
103103 substring_threshold = max (threshold - 10 , 60 )
104- best_match : tuple [str , int , int ] | None = None
104+ best_match : tuple [str , int , int , float ] | None = None
105105
106106 if utils .detect_text_case (line_stripped ) == "all_caps" :
107107 full_line_score = fuzz .ratio (expected_lower , line_lower .strip ())
@@ -111,10 +111,12 @@ def _find_best_substring_match(
111111 actual_end = actual_start + len (line_stripped )
112112 return line_stripped , actual_start , actual_end
113113
114- # Try different window sizes around the expected length
115- for window_size in range (
116- max (expected_len - 5 , 1 ), min (expected_len + 10 , len (line ) + 1 )
117- ):
114+ # Try different window sizes around the expected length with extended range
115+ # to handle cases where actual text is significantly shorter or longer
116+ min_window = max (expected_len - 20 , 1 )
117+ max_window = min (expected_len + 30 , len (line ) + 1 )
118+
119+ for window_size in range (min_window , max_window ):
118120 for start in range (len (line ) - window_size + 1 ):
119121 end = start + window_size
120122 substring = line [start :end ]
@@ -124,15 +126,19 @@ def _find_best_substring_match(
124126
125127 if score > best_score :
126128 best_score = score
127- best_match = (substring .strip (), start , end )
129+ substring_stripped = substring .strip ()
130+ strip_offset = len (substring ) - len (substring .lstrip ())
131+ best_match = (
132+ substring_stripped ,
133+ start + strip_offset ,
134+ start + strip_offset + len (substring_stripped ),
135+ score ,
136+ )
128137
129- if best_match and best_score >= substring_threshold :
130- matched_text , _ , _ = best_match
131- if matched_text :
132- actual_start = line .find (matched_text )
133- if actual_start != - 1 :
134- actual_end = actual_start + len (matched_text )
135- return matched_text , actual_start , actual_end
138+ if best_match and best_match [3 ] >= substring_threshold :
139+ matched_text , char_start , char_end , _ = best_match
140+ if matched_text and char_start >= 0 and char_end <= len (line ):
141+ return matched_text , char_start , char_end
136142
137143 return None
138144
0 commit comments