Skip to content

Commit b4dee39

Browse files
committed
Adds fuzzy heading matching and extraction capabilities.
Introduces fuzzy matching and extraction functionality for headings, enabling validation and recovery of headings even when they are embedded within content or use non-standard Markdown syntax. Extends the API and data models to support expected heading lists, configurable match thresholds, and hierarchical rebuilding after extraction. Batch processing now aggregates match statistics, computes average match rates, and tracks missing and matched headings across documents. Improves heading metadata, restructures tokenization for consistency, and adds utility functions and tests for the new functionality.
1 parent 7dffdd2 commit b4dee39

18 files changed

Lines changed: 1910 additions & 259 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
experiments/
22

3+
# VS Code settings
4+
.vscode/
5+
36
# MacOS system files
47
.DS_Store
58

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ license = "LGPL-2.1-only"
1010
readme = "README.md"
1111
requires-python = ">=3.12"
1212
dependencies = [
13-
"pandas>=2.3.3"
13+
"pandas>=2.3.3",
14+
"rapidfuzz>=3.14.3"
1415
]
1516

1617
[dependency-groups]

src/headhunter/api.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ def process_text(
1414
text: str,
1515
config: _config.ParserConfig | dict[str, int | str] | None = None,
1616
metadata: dict[str, object] | None = None,
17+
expected_headings: list[str] | None = None,
18+
match_threshold: int = 80,
1719
) -> models.ParsedText:
1820
"""Processes a single markdown text string.
1921
@@ -22,6 +24,10 @@ def process_text(
2224
config: Parser configuration. Can be a ParserConfig object or a dictionary
2325
of configuration parameters. If None, uses default configuration.
2426
metadata: Optional metadata to attach to the parsed document.
27+
expected_headings: Optional list of expected heading strings to match.
28+
If provided, performs fuzzy matching and extraction.
29+
match_threshold: Minimum fuzzy match score (0-100) for heading matching.
30+
Defaults to 80. Only used if expected_headings is provided.
2531
2632
Returns:
2733
ParsedText object containing tokens, hierarchy, and warnings.
@@ -46,7 +52,7 @@ def process_text(
4652

4753
all_warnings = tokenizer_warnings + hierarchy_warnings
4854

49-
return models.ParsedText(
55+
parsed_text = models.ParsedText(
5056
text=text,
5157
config=config,
5258
metadata=metadata,
@@ -55,6 +61,11 @@ def process_text(
5561
warnings=all_warnings,
5662
)
5763

64+
if expected_headings:
65+
parsed_text = parsed_text.match_headings(expected_headings, match_threshold)
66+
67+
return parsed_text
68+
5869
except Exception as e:
5970
# Wrap in ParsingError - traceback will be captured by caller
6071
logger.error(f"Fatal error during parsing: {str(e)}", exc_info=True)
@@ -71,6 +82,8 @@ def process_batch_df(
7182
id_column: str | None = None,
7283
metadata_columns: list[str] | None = None,
7384
config: _config.ParserConfig | dict[str, int | str] | None = None,
85+
expected_headings: list[str] | None = None,
86+
match_threshold: int = 80,
7487
) -> models.ParsedBatch:
7588
"""Processes a batch of markdown documents from a DataFrame.
7689
@@ -84,6 +97,10 @@ def process_batch_df(
8497
as document metadata. Defaults to None.
8598
config: Parser configuration. Can be a ParserConfig object or a dictionary
8699
of configuration parameters. If None, uses default configuration.
100+
expected_headings: Optional list of expected heading strings to match
101+
across all documents. If provided, performs fuzzy matching.
102+
match_threshold: Minimum fuzzy match score (0-100) for heading matching.
103+
Defaults to 80. Only used if expected_headings is provided.
87104
88105
Returns:
89106
ParsedBatch object containing successfully parsed documents
@@ -142,7 +159,7 @@ def process_batch_df(
142159
documents.append(parsed_doc)
143160

144161
except models.ParsingError as e:
145-
doc_id = doc_metadata.get("id", "unknown")
162+
doc_id = doc_metadata["id"]
146163
logger.warning(f"Parsing error for doc_id {doc_id} at row {idx}: {str(e)}")
147164
tb = traceback.format_exc()
148165
error_dict = {
@@ -157,7 +174,7 @@ def process_batch_df(
157174

158175
except Exception as e:
159176
# Unexpected error - still collect it
160-
doc_id = doc_metadata.get("id", "unknown")
177+
doc_id = doc_metadata["id"]
161178
logger.error(f"Unexpected error for doc_id {doc_id} at row {idx}: {str(e)}")
162179
tb = traceback.format_exc()
163180
error_dict = {
@@ -170,12 +187,23 @@ def process_batch_df(
170187
}
171188
errors.append(error_dict)
172189

190+
all_warnings: list[str] = []
191+
for doc in documents:
192+
doc_id = str(doc.metadata["id"])
193+
for warning in doc.warnings:
194+
all_warnings.append(f"[{doc_id}] {warning}")
195+
173196
batch = models.ParsedBatch(
174197
documents=documents,
175198
config=config,
176199
errors=errors,
200+
warnings=all_warnings,
177201
metadata_columns=metadata_columns,
178202
)
203+
204+
if expected_headings:
205+
batch = batch.match_headings(expected_headings, match_threshold)
206+
179207
logger.info(
180208
f"Batch processing complete: {len(documents)} successful, {len(errors)} errors"
181209
)

src/headhunter/config.py

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import dataclasses
44
import logging
55
import re
6+
import typing
67

78

89
def get_logger(name: str) -> logging.Logger:
@@ -33,21 +34,33 @@ class ParserConfig:
3334
"""Configuration for markdown parsing.
3435
3536
Attributes:
36-
heading_hash_pattern: Regex pattern for hash-style headings
37-
(e.g., # Heading).
38-
heading_asterisk_pattern: Regex pattern for asterisk-style headings
39-
(e.g., **Bold**).
40-
inline_colon_pattern: Regex pattern for inline headings with colon
41-
(e.g., **Label:** value).
4237
heading_max_words: Maximum number of words to consider a line as a heading.
38+
heading_hash_pattern: Regex pattern for hash-style headings (e.g., # Heading).
39+
heading_asterisk_pattern: Regex pattern for asterisk-style headings (e.g.,
40+
**Bold**).
41+
inline_colon_pattern: Regex pattern for inline headings with colon (e.g.,
42+
**Label:** value).
43+
match_hash_pattern: Relaxed regex pattern for matching hash headings anywhere
44+
in text (used in matcher).
45+
match_asterisk_pattern: Relaxed regex pattern for matching asterisk headings
46+
anywhere in text (used in matcher).
47+
match_inline_colon_pattern: Relaxed regex pattern for matching inline colon
48+
headings anywhere in text (used in matcher).
4349
"""
4450

51+
heading_max_words: int = 10
52+
4553
heading_hash_pattern: re.Pattern[str] = re.compile(r"^(#{1,6})\s*(.*)")
4654
heading_asterisk_pattern: re.Pattern[str] = re.compile(r"^(\*{1,3})\s*(.*?)\s*\1$")
4755
inline_colon_pattern: re.Pattern[str] = re.compile(
4856
r"^(\*{1,3})\s*(.*?):\s*\1\s*(.+)$|^(\*{1,3})\s*(.*?)\s*\4:\s*(.+)$"
4957
)
50-
heading_max_words: int = 10
58+
59+
match_hash_pattern: re.Pattern[str] = re.compile(r"(#{1,6})\s+(.+?)(?:\s|$)")
60+
match_asterisk_pattern: re.Pattern[str] = re.compile(r"(\*{1,3})\s*(.+?)\s*\1")
61+
match_inline_colon_pattern: re.Pattern[str] = re.compile(
62+
r"(\*{1,3})\s*(.+?):\s*\1|^(\*{1,3})\s*(.+?)\s*\3:"
63+
)
5164

5265
@classmethod
5366
def from_dict(cls, config_dict: dict[str, int | str]) -> "ParserConfig":
@@ -57,31 +70,40 @@ def from_dict(cls, config_dict: dict[str, int | str]) -> "ParserConfig":
5770
config_dict: Dictionary with configuration parameters. Supported keys:
5871
- heading_max_words (int): Maximum words in a heading
5972
- heading_hash_pattern (str): Regex pattern for hash headings
60-
- heading_asterisk_pattern (str): Regex pattern for asterisk headings
61-
- inline_colon_pattern (str): Regex pattern for inline headings
73+
- heading_asterisk_pattern (str): Regex for asterisk headings
74+
- inline_colon_pattern (str): Regex for inline headings
75+
- match_hash_pattern (str): Regex for matcher hash headings
76+
- match_asterisk_pattern (str): Regex for matcher asterisk headings
77+
- match_inline_colon_pattern (str): Regex for matcher inline headings
6278
6379
Returns:
6480
ParserConfig instance with custom parameters merged with defaults.
6581
"""
6682
logger = get_logger(__name__)
67-
valid_params = {f.name for f in cls.__dataclass_fields__.values()}
68-
kwargs: dict[str, object] = {}
83+
valid_params = {f.name for f in dataclasses.fields(cls)}
84+
unknown_params = [key for key in config_dict.keys() if key not in valid_params]
85+
kwargs: dict[str, typing.Any] = {}
86+
87+
if unknown_params:
88+
valid_param_names = ", ".join(sorted(valid_params))
89+
unknown_param_names = ", ".join(f"'{key}'" for key in unknown_params)
90+
logger.warning(
91+
"Unknown custom configuration parameter(s) will be ignored: "
92+
f"{unknown_param_names}. Valid parameters are: {valid_param_names}"
93+
)
6994

7095
for key, value in config_dict.items():
7196
if key in valid_params:
7297
if key in [
7398
"heading_hash_pattern",
7499
"heading_asterisk_pattern",
75100
"inline_colon_pattern",
101+
"match_hash_pattern",
102+
"match_asterisk_pattern",
103+
"match_inline_colon_pattern",
76104
] and isinstance(value, str):
77105
kwargs[key] = re.compile(value)
78106
elif key == "heading_max_words" and isinstance(value, int):
79107
kwargs[key] = value
80-
else:
81-
valid_param_names = ", ".join(sorted(valid_params))
82-
logger.warning(
83-
f"Unknown configuration parameter will be ignored: '{key}'. "
84-
f"Valid parameters are: {valid_param_names}"
85-
)
86-
87-
return cls(**kwargs) # type: ignore[arg-type]
108+
109+
return cls(**kwargs)

0 commit comments

Comments
 (0)