@@ -76,69 +76,48 @@ def _find_best_substring_match(
7676) -> tuple [str , int , int ] | None :
7777 """Finds the best matching substring in a line for the expected heading.
7878
79- Uses a sliding window approach to find the substring that best matches the expected
80- heading text. For all-caps standalone lines, prefers the full line to avoid
81- incorrect inline detection.
82-
83- The substring extraction uses fuzz.ratio (full string comparison) which is typically
84- stricter than the partial_ratio used for line screening. To maintain consistency,
85- the effective threshold is reduced by 10 points with a minimum floor of 60 to
86- prevent overly permissive matches.
79+ Uses rapidfuzz's partial_ratio_alignment to find the substring that best matches
80+ the expected heading text. For all-caps standalone lines, prefers the full line
81+ to avoid incorrect inline detection.
8782
8883 Args:
8984 line: Line of text to search.
9085 expected_heading: Expected heading text (original case).
9186 expected_lower: Expected heading text (lowercase).
92- threshold: Minimum fuzzy match score from partial_ratio screening. The actual
93- threshold used is max(threshold - 10, 60) to account for ratio being
94- stricter than partial_ratio.
87+ threshold: Minimum fuzzy match score to accept.
9588
9689 Returns:
9790 Tuple of (matched_substring, char_start, char_end) if found, None otherwise.
9891 """
9992 line_lower = line .lower ()
10093 line_stripped = line .strip ()
101- expected_len = len (expected_heading )
102- best_score = 0.0
103- substring_threshold = max (threshold - 10 , 60 )
104- best_match : tuple [str , int , int , float ] | None = None
10594
10695 if utils .detect_text_case (line_stripped ) == "all_caps" :
10796 full_line_score = fuzz .ratio (expected_lower , line_lower .strip ())
108- if full_line_score >= substring_threshold :
97+ if full_line_score >= threshold :
10998 actual_start = line .find (line_stripped )
11099 if actual_start != - 1 :
111100 actual_end = actual_start + len (line_stripped )
112101 return line_stripped , actual_start , actual_end
113102
114- # Try different window sizes around the expected length with extended range
115- # to handle cases where actual text is significantly shorter or longer
116- min_window = max (expected_len - 20 , 1 )
117- max_window = min (expected_len + 30 , len (line ) + 1 )
118-
119- for window_size in range (min_window , max_window ):
120- for start in range (len (line ) - window_size + 1 ):
121- end = start + window_size
122- substring = line [start :end ]
123- substring_lower = line_lower [start :end ]
124-
125- score = fuzz .ratio (expected_lower , substring_lower )
126-
127- if score > best_score :
128- best_score = score
129- substring_stripped = substring .strip ()
130- strip_offset = len (substring ) - len (substring .lstrip ())
131- best_match = (
132- substring_stripped ,
133- start + strip_offset ,
134- start + strip_offset + len (substring_stripped ),
135- score ,
136- )
103+ alignment = fuzz .partial_ratio_alignment (expected_lower , line_lower )
104+
105+ if alignment is None or alignment .score < threshold :
106+ return None
107+
108+ raw_match = line [alignment .dest_start : alignment .dest_end ]
109+ matched_text = raw_match .strip ()
110+
111+ if not matched_text :
112+ return None
113+
114+ # Calculate adjusted positions accounting for leading whitespace stripped
115+ strip_offset = len (raw_match ) - len (raw_match .lstrip ())
116+ char_start = alignment .dest_start + strip_offset
117+ char_end = char_start + len (matched_text )
137118
138- if best_match and best_match [3 ] >= substring_threshold :
139- matched_text , char_start , char_end , _ = best_match
140- if matched_text and char_start >= 0 and char_end <= len (line ):
141- return matched_text , char_start , char_end
119+ if char_start >= 0 and char_end <= len (line ):
120+ return matched_text , char_start , char_end
142121
143122 return None
144123
0 commit comments