Skip to content

Commit 916ec0b

Browse files
committed
Refactor substring matching to use rapidfuzz's partial_ratio_alignment to simplify threshold handling.
1 parent 7c19a81 commit 916ec0b

1 file changed

Lines changed: 22 additions & 43 deletions

File tree

src/headhunter/matcher.py

Lines changed: 22 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -76,69 +76,48 @@ def _find_best_substring_match(
7676
) -> tuple[str, int, int] | None:
7777
"""Finds the best matching substring in a line for the expected heading.
7878
79-
Uses a sliding window approach to find the substring that best matches the expected
80-
heading text. For all-caps standalone lines, prefers the full line to avoid
81-
incorrect inline detection.
82-
83-
The substring extraction uses fuzz.ratio (full string comparison) which is typically
84-
stricter than the partial_ratio used for line screening. To maintain consistency,
85-
the effective threshold is reduced by 10 points with a minimum floor of 60 to
86-
prevent overly permissive matches.
79+
Uses rapidfuzz's partial_ratio_alignment to find the substring that best matches
80+
the expected heading text. For all-caps standalone lines, prefers the full line
81+
to avoid incorrect inline detection.
8782
8883
Args:
8984
line: Line of text to search.
9085
expected_heading: Expected heading text (original case).
9186
expected_lower: Expected heading text (lowercase).
92-
threshold: Minimum fuzzy match score from partial_ratio screening. The actual
93-
threshold used is max(threshold - 10, 60) to account for ratio being
94-
stricter than partial_ratio.
87+
threshold: Minimum fuzzy match score to accept.
9588
9689
Returns:
9790
Tuple of (matched_substring, char_start, char_end) if found, None otherwise.
9891
"""
9992
line_lower = line.lower()
10093
line_stripped = line.strip()
101-
expected_len = len(expected_heading)
102-
best_score = 0.0
103-
substring_threshold = max(threshold - 10, 60)
104-
best_match: tuple[str, int, int, float] | None = None
10594

10695
if utils.detect_text_case(line_stripped) == "all_caps":
10796
full_line_score = fuzz.ratio(expected_lower, line_lower.strip())
108-
if full_line_score >= substring_threshold:
97+
if full_line_score >= threshold:
10998
actual_start = line.find(line_stripped)
11099
if actual_start != -1:
111100
actual_end = actual_start + len(line_stripped)
112101
return line_stripped, actual_start, actual_end
113102

114-
# Try different window sizes around the expected length with extended range
115-
# to handle cases where actual text is significantly shorter or longer
116-
min_window = max(expected_len - 20, 1)
117-
max_window = min(expected_len + 30, len(line) + 1)
118-
119-
for window_size in range(min_window, max_window):
120-
for start in range(len(line) - window_size + 1):
121-
end = start + window_size
122-
substring = line[start:end]
123-
substring_lower = line_lower[start:end]
124-
125-
score = fuzz.ratio(expected_lower, substring_lower)
126-
127-
if score > best_score:
128-
best_score = score
129-
substring_stripped = substring.strip()
130-
strip_offset = len(substring) - len(substring.lstrip())
131-
best_match = (
132-
substring_stripped,
133-
start + strip_offset,
134-
start + strip_offset + len(substring_stripped),
135-
score,
136-
)
103+
alignment = fuzz.partial_ratio_alignment(expected_lower, line_lower)
104+
105+
if alignment is None or alignment.score < threshold:
106+
return None
107+
108+
raw_match = line[alignment.dest_start : alignment.dest_end]
109+
matched_text = raw_match.strip()
110+
111+
if not matched_text:
112+
return None
113+
114+
# Calculate adjusted positions accounting for leading whitespace stripped
115+
strip_offset = len(raw_match) - len(raw_match.lstrip())
116+
char_start = alignment.dest_start + strip_offset
117+
char_end = char_start + len(matched_text)
137118

138-
if best_match and best_match[3] >= substring_threshold:
139-
matched_text, char_start, char_end, _ = best_match
140-
if matched_text and char_start >= 0 and char_end <= len(line):
141-
return matched_text, char_start, char_end
119+
if char_start >= 0 and char_end <= len(line):
120+
return matched_text, char_start, char_end
142121

143122
return None
144123

0 commit comments

Comments
 (0)