Skip to content

Commit 7c19a81

Browse files
committed
Improve substring matching by extending window size range and including score in best match tuple
1 parent: ca4c3bb · commit: 7c19a81

1 file changed

Lines changed: 19 additions & 13 deletions

File tree

src/headhunter/matcher.py

Lines changed: 19 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ def _find_best_substring_match(
101101
expected_len = len(expected_heading)
102102
best_score = 0.0
103103
substring_threshold = max(threshold - 10, 60)
104-
best_match: tuple[str, int, int] | None = None
104+
best_match: tuple[str, int, int, float] | None = None
105105

106106
if utils.detect_text_case(line_stripped) == "all_caps":
107107
full_line_score = fuzz.ratio(expected_lower, line_lower.strip())
@@ -111,10 +111,12 @@ def _find_best_substring_match(
111111
actual_end = actual_start + len(line_stripped)
112112
return line_stripped, actual_start, actual_end
113113

114-
# Try different window sizes around the expected length
115-
for window_size in range(
116-
max(expected_len - 5, 1), min(expected_len + 10, len(line) + 1)
117-
):
114+
# Try different window sizes around the expected length with extended range
115+
# to handle cases where actual text is significantly shorter or longer
116+
min_window = max(expected_len - 20, 1)
117+
max_window = min(expected_len + 30, len(line) + 1)
118+
119+
for window_size in range(min_window, max_window):
118120
for start in range(len(line) - window_size + 1):
119121
end = start + window_size
120122
substring = line[start:end]
@@ -124,15 +126,19 @@ def _find_best_substring_match(
124126

125127
if score > best_score:
126128
best_score = score
127-
best_match = (substring.strip(), start, end)
129+
substring_stripped = substring.strip()
130+
strip_offset = len(substring) - len(substring.lstrip())
131+
best_match = (
132+
substring_stripped,
133+
start + strip_offset,
134+
start + strip_offset + len(substring_stripped),
135+
score,
136+
)
128137

129-
if best_match and best_score >= substring_threshold:
130-
matched_text, _, _ = best_match
131-
if matched_text:
132-
actual_start = line.find(matched_text)
133-
if actual_start != -1:
134-
actual_end = actual_start + len(matched_text)
135-
return matched_text, actual_start, actual_end
138+
if best_match and best_match[3] >= substring_threshold:
139+
matched_text, char_start, char_end, _ = best_match
140+
if matched_text and char_start >= 0 and char_end <= len(line):
141+
return matched_text, char_start, char_end
136142

137143
return None
138144

0 commit comments

Comments
 (0)