childmindresearch
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 1 deletion
diff --git a/src/headhunter/api.py b/src/headhunter/api.py
Lines changed: 31 additions & 3 deletions
diff --git a/src/headhunter/config.py b/src/headhunter/config.py
Lines changed: 41 additions & 19 deletions
@@ -1,5 +1,8 @@
 experiments/
 
+# VS Code settings
+.vscode/
+
 # MacOS system files
 .DS_Store
 
 
@@ -10,7 +10,8 @@ license = "LGPL-2.1-only"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
-  "pandas>=2.3.3"
+  "pandas>=2.3.3",
+  "rapidfuzz>=3.14.3"
 ]
 
 [dependency-groups]
 
@@ -14,6 +14,8 @@ def process_text(
     text: str,
     config: _config.ParserConfig | dict[str, int | str] | None = None,
     metadata: dict[str, object] | None = None,
+    expected_headings: list[str] | None = None,
+    match_threshold: int = 80,
 ) -> models.ParsedText:
     """Processes a single markdown text string.
 
@@ -22,6 +24,10 @@ def process_text(
         config: Parser configuration. Can be a ParserConfig object or a dictionary
             of configuration parameters. If None, uses default configuration.
         metadata: Optional metadata to attach to the parsed document.
+        expected_headings: Optional list of expected heading strings to match.
+            If provided, performs fuzzy matching and extraction.
+        match_threshold: Minimum fuzzy match score (0-100) for heading matching.
+            Defaults to 80. Only used if expected_headings is provided.
 
     Returns:
         ParsedText object containing tokens, hierarchy, and warnings.
@@ -46,7 +52,7 @@ def process_text(
 
         all_warnings = tokenizer_warnings + hierarchy_warnings
 
-        return models.ParsedText(
+        parsed_text = models.ParsedText(
             text=text,
             config=config,
             metadata=metadata,
@@ -55,6 +61,11 @@ def process_text(
             warnings=all_warnings,
         )
 
+        if expected_headings:
+            parsed_text = parsed_text.match_headings(expected_headings, match_threshold)
+
+        return parsed_text
+
     except Exception as e:
         # Wrap in ParsingError - traceback will be captured by caller
         logger.error(f"Fatal error during parsing: {str(e)}", exc_info=True)
@@ -71,6 +82,8 @@ def process_batch_df(
     id_column: str | None = None,
     metadata_columns: list[str] | None = None,
     config: _config.ParserConfig | dict[str, int | str] | None = None,
+    expected_headings: list[str] | None = None,
+    match_threshold: int = 80,
 ) -> models.ParsedBatch:
     """Processes a batch of markdown documents from a DataFrame.
 
@@ -84,6 +97,10 @@ def process_batch_df(
             as document metadata. Defaults to None.
         config: Parser configuration. Can be a ParserConfig object or a dictionary
             of configuration parameters. If None, uses default configuration.
+        expected_headings: Optional list of expected heading strings to match
+            across all documents. If provided, performs fuzzy matching.
+        match_threshold: Minimum fuzzy match score (0-100) for heading matching.
+            Defaults to 80. Only used if expected_headings is provided.
 
     Returns:
         ParsedBatch object containing successfully parsed documents
@@ -142,7 +159,7 @@ def process_batch_df(
             documents.append(parsed_doc)
 
         except models.ParsingError as e:
-            doc_id = doc_metadata.get("id", "unknown")
+            doc_id = doc_metadata["id"]
             logger.warning(f"Parsing error for doc_id {doc_id} at row {idx}: {str(e)}")
             tb = traceback.format_exc()
             error_dict = {
@@ -157,7 +174,7 @@ def process_batch_df(
 
         except Exception as e:
             # Unexpected error - still collect it
-            doc_id = doc_metadata.get("id", "unknown")
+            doc_id = doc_metadata["id"]
             logger.error(f"Unexpected error for doc_id {doc_id} at row {idx}: {str(e)}")
             tb = traceback.format_exc()
             error_dict = {
@@ -170,12 +187,23 @@ def process_batch_df(
             }
             errors.append(error_dict)
 
+    all_warnings: list[str] = []
+    for doc in documents:
+        doc_id = str(doc.metadata["id"])
+        for warning in doc.warnings:
+            all_warnings.append(f"[{doc_id}] {warning}")
+
     batch = models.ParsedBatch(
         documents=documents,
         config=config,
         errors=errors,
+        warnings=all_warnings,
         metadata_columns=metadata_columns,
     )
+
+    if expected_headings:
+        batch = batch.match_headings(expected_headings, match_threshold)
+
     logger.info(
         f"Batch processing complete: {len(documents)} successful, {len(errors)} errors"
     )
 
@@ -3,6 +3,7 @@
 import dataclasses
 import logging
 import re
+import typing
 
 
 def get_logger(name: str) -> logging.Logger:
@@ -33,21 +34,33 @@ class ParserConfig:
     """Configuration for markdown parsing.
 
     Attributes:
-        heading_hash_pattern: Regex pattern for hash-style headings
-            (e.g., # Heading).
-        heading_asterisk_pattern: Regex pattern for asterisk-style headings
-            (e.g., **Bold**).
-        inline_colon_pattern: Regex pattern for inline headings with colon
-            (e.g., **Label:** value).
         heading_max_words: Maximum number of words to consider a line as a heading.
+        heading_hash_pattern: Regex pattern for hash-style headings (e.g., # Heading).
+        heading_asterisk_pattern: Regex pattern for asterisk-style headings (e.g.,
+            **Bold**).
+        inline_colon_pattern: Regex pattern for inline headings with colon (e.g.,
+            **Label:** value).
+        match_hash_pattern: Relaxed regex pattern for matching hash headings anywhere
+            in text (used in matcher).
+        match_asterisk_pattern: Relaxed regex pattern for matching asterisk headings
+            anywhere in text (used in matcher).
+        match_inline_colon_pattern: Relaxed regex pattern for matching inline colon
+            headings anywhere in text (used in matcher).
     """
 
+    heading_max_words: int = 10
+
     heading_hash_pattern: re.Pattern[str] = re.compile(r"^(#{1,6})\s*(.*)")
     heading_asterisk_pattern: re.Pattern[str] = re.compile(r"^(\*{1,3})\s*(.*?)\s*\1$")
     inline_colon_pattern: re.Pattern[str] = re.compile(
         r"^(\*{1,3})\s*(.*?):\s*\1\s*(.+)$|^(\*{1,3})\s*(.*?)\s*\4:\s*(.+)$"
     )
-    heading_max_words: int = 10
+
+    match_hash_pattern: re.Pattern[str] = re.compile(r"(#{1,6})\s+(.+?)(?:\s|$)")
+    match_asterisk_pattern: re.Pattern[str] = re.compile(r"(\*{1,3})\s*(.+?)\s*\1")
+    match_inline_colon_pattern: re.Pattern[str] = re.compile(
+        r"(\*{1,3})\s*(.+?):\s*\1|^(\*{1,3})\s*(.+?)\s*\3:"
+    )
 
     @classmethod
     def from_dict(cls, config_dict: dict[str, int | str]) -> "ParserConfig":
@@ -57,31 +70,40 @@ def from_dict(cls, config_dict: dict[str, int | str]) -> "ParserConfig":
             config_dict: Dictionary with configuration parameters. Supported keys:
                 - heading_max_words (int): Maximum words in a heading
                 - heading_hash_pattern (str): Regex pattern for hash headings
-                - heading_asterisk_pattern (str): Regex pattern for asterisk headings
-                - inline_colon_pattern (str): Regex pattern for inline headings
+                - heading_asterisk_pattern (str): Regex for asterisk headings
+                - inline_colon_pattern (str): Regex for inline headings
+                - match_hash_pattern (str): Regex for matcher hash headings
+                - match_asterisk_pattern (str): Regex for matcher asterisk headings
+                - match_inline_colon_pattern (str): Regex for matcher inline headings
 
         Returns:
             ParserConfig instance with custom parameters merged with defaults.
         """
         logger = get_logger(__name__)
-        valid_params = {f.name for f in cls.__dataclass_fields__.values()}
-        kwargs: dict[str, object] = {}
+        valid_params = {f.name for f in dataclasses.fields(cls)}
+        unknown_params = [key for key in config_dict.keys() if key not in valid_params]
+        kwargs: dict[str, typing.Any] = {}
+
+        if unknown_params:
+            valid_param_names = ", ".join(sorted(valid_params))
+            unknown_param_names = ", ".join(f"'{key}'" for key in unknown_params)
+            logger.warning(
+                "Unknown custom configuration parameter(s) will be ignored: "
+                f"{unknown_param_names}. Valid parameters are: {valid_param_names}"
+            )
 
         for key, value in config_dict.items():
             if key in valid_params:
                 if key in [
                     "heading_hash_pattern",
                     "heading_asterisk_pattern",
                     "inline_colon_pattern",
+                    "match_hash_pattern",
+                    "match_asterisk_pattern",
+                    "match_inline_colon_pattern",
                 ] and isinstance(value, str):
                     kwargs[key] = re.compile(value)
                 elif key == "heading_max_words" and isinstance(value, int):
                     kwargs[key] = value
-            else:
-                valid_param_names = ", ".join(sorted(valid_params))
-                logger.warning(
-                    f"Unknown configuration parameter will be ignored: '{key}'. "
-                    f"Valid parameters are: {valid_param_names}"
-                )
-
-        return cls(**kwargs)  # type: ignore[arg-type]
+
+        return cls(**kwargs)
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,8 @@ license = "LGPL-2.1-only"`
`10`	`10`	`readme = "README.md"`
`11`	`11`	`requires-python = ">=3.12"`
`12`	`12`	`dependencies = [`
`13`		`- "pandas>=2.3.3"`
	`13`	`+ "pandas>=2.3.3",`
	`14`	`+ "rapidfuzz>=3.14.3"`
`14`	`15`	`]`
`15`	`16`
`16`	`17`	`[dependency-groups]`