childmindresearch
diff --git a/‎README.md‎
Lines changed: 27 additions & 5 deletions b/‎README.md‎
Lines changed: 27 additions & 5 deletions
diff --git a/‎src/headhunter/models.py‎
Lines changed: 26 additions & 0 deletions b/‎src/headhunter/models.py‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎src/headhunter/regenerate.py‎
Lines changed: 75 additions & 0 deletions b/‎src/headhunter/regenerate.py‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎tests/conftest.py‎
Lines changed: 85 additions & 30 deletions b/‎tests/conftest.py‎
Lines changed: 85 additions & 30 deletions
diff --git a/‎…ts/fixtures/sample_data_match_parsed.csv‎ ‎…xpected_csv/sample_data_match_parsed.csv‎tests/fixtures/sample_data_match_parsed.csv renamed to tests/fixtures/expected_csv/sample_data_match_parsed.csv b/‎…ts/fixtures/sample_data_match_parsed.csv‎ ‎…xpected_csv/sample_data_match_parsed.csv‎tests/fixtures/sample_data_match_parsed.csv renamed to tests/fixtures/expected_csv/sample_data_match_parsed.csv
diff --git a/‎tests/fixtures/sample_data_parsed.csv‎ ‎…ures/expected_csv/sample_data_parsed.csv‎tests/fixtures/sample_data_parsed.csv renamed to tests/fixtures/expected_csv/sample_data_parsed.csv b/‎tests/fixtures/sample_data_parsed.csv‎ ‎…ures/expected_csv/sample_data_parsed.csv‎tests/fixtures/sample_data_parsed.csv renamed to tests/fixtures/expected_csv/sample_data_parsed.csv
diff --git a/‎tests/fixtures/match.json‎ ‎tests/fixtures/expected_json/match.json‎tests/fixtures/match.json renamed to tests/fixtures/expected_json/match.json b/‎tests/fixtures/match.json‎ ‎tests/fixtures/expected_json/match.json‎tests/fixtures/match.json renamed to tests/fixtures/expected_json/match.json
diff --git a/‎tests/fixtures/mixed.json‎ ‎tests/fixtures/expected_json/mixed.json‎tests/fixtures/mixed.json renamed to tests/fixtures/expected_json/mixed.json b/‎tests/fixtures/mixed.json‎ ‎tests/fixtures/expected_json/mixed.json‎tests/fixtures/mixed.json renamed to tests/fixtures/expected_json/mixed.json
diff --git a/‎tests/fixtures/expected_markdown/doc1.md‎
Lines changed: 14 additions & 0 deletions b/‎tests/fixtures/expected_markdown/doc1.md‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎tests/fixtures/expected_markdown/doc2.md‎
Lines changed: 16 additions & 0 deletions b/‎tests/fixtures/expected_markdown/doc2.md‎
Lines changed: 16 additions & 0 deletions
@@ -13,7 +13,7 @@ A parser for extracting headings and hierarchical structure from Markdown files.
 
 - Parse multiple heading formats (hash `#`, asterisk `**`, inline with colon, all-caps)
 - Build hierarchical structure from headings
-- **Fuzzy heading matching** to extract expected headings from improperly formatted documents
+- **Fuzzy heading matching** to extract expected headings from improperly formatted documents, even with typos or spelling variations
 - Process single documents or batches from DataFrames
 - Export results to DataFrame, JSON, or tree visualizations
 - Configurable parsing rules and word limits
@@ -48,6 +48,10 @@ parsed_result.to_json("output.json")
 # Print tree visualization
 print(parsed_result.to_tree())
 
+# Regenerate clean markdown from parsed structure
+regenerated_md = parsed_result.to_markdown()
+print(regenerated_md)
+
 # View results in a pandas DataFrame
 df_parsed = parsed_result.to_dataframe()
 print(df_parsed)
@@ -117,11 +121,11 @@ df_parsed.to_csv("parsed_data.csv")
 ```python
 import headhunter
 
-# Document where headings are embedded inline or lack proper formatting
+# Document where headings are embedded inline, lack proper formatting, or have typos
 messy_doc = """
 This document has ## Heading 1 embedded in text without line breaks.
 Then we have **heading 2** in bold but inline.
-**Inline Heading:** with content on the same line.
+**Inline Haedign:** with content on the same line.
 """
 
 # Specify expected headings to extract via fuzzy matching
@@ -137,7 +141,7 @@ print(parsed.metadata)  # includes: matched_count, expected_count, match_percent
 
 ## How Hierarchy is Built
 
-Headhunter recognizes different heading styles in Markdown and builds a hierarchical structure by assigning levels to each heading. The following rules govern this process:
+`headhunter` recognizes different heading styles in Markdown and builds a hierarchical structure by assigning levels to each heading. The following rules govern this process:
 
 ### Basic Principles
 
@@ -199,7 +203,7 @@ Different heading styles can be mixed in the same document. When switching from
 
 ## Fuzzy Heading Matching
 
-When documents have inconsistent formatting, such as headings embedded inline within text, missing markdown markers, or improper line breaks, headhunter can use fuzzy matching to extract expected headings.
+When documents have inconsistent formatting, such as headings embedded inline within text, missing markdown markers, or improper line breaks, `headhunter` can use fuzzy matching to extract expected headings.
 
 **How it works:**
 
@@ -217,3 +221,21 @@ Provide a list of `expected_headings` to `process_text()` or `process_batch_df()
   - 80-100: Strict matching, reduces false positives
   - 60-79: Moderate matching, allows more variation
   - Below 60: Lenient matching, may produce unexpected matches
+
+## Markdown Regeneration
+
+After parsing a document, `headhunter` can regenerate clean, standardized Markdown from the parsed structure. This is useful for:
+
+- **Cleaning up messy documents**: Convert inconsistent formatting into standard Markdown
+- **Standardizing format**: Ensure all documents use the same heading style
+- **Post-processing extracted headings**: Apply fuzzy matching to extract headings, then export the cleaned result
+
+### How Regeneration Works
+
+The `to_markdown()` method converts the parsed hierarchical structure back into Markdown:
+
+- **Standard headings**: Converted to hash format (`#`, `##`, `###`, etc.) based on hierarchical level
+- **Inline headings**: Preserved as bold format with colon (`**Heading:** content`)
+- **YAML front matter**: Metadata is included as YAML front matter at the top of the document
+- **Consistent spacing**: Single blank lines between sections for readability
+- **Case preservation**: Original text case is maintained (including ALL CAPS)
@@ -287,6 +287,20 @@ def to_tree(self, show_line_numbers: bool = True, show_type: bool = True) -> str
             self.hierarchy, show_line_numbers, show_type, self.metadata
         )
 
+    def to_markdown(self) -> str:
+        """Regenerates clean Markdown from the parsed structure.
+
+        Converts the hierarchical structure back into properly formatted Markdown,
+        using hash (#) syntax for standard headings and bold (**) format for inline
+        colon headings. Includes YAML front matter if metadata exists.
+
+        This method only delegates: it passes this document's ``hierarchy`` and
+        ``metadata`` to ``headhunter.regenerate.to_markdown``.
+
+        Returns:
+            Regenerated Markdown string.
+        """
+        # Imported at call time rather than at module top. NOTE(review):
+        # presumably this avoids a circular import, since headhunter.regenerate
+        # itself imports HierarchyContext from headhunter.models -- confirm.
+        from headhunter import regenerate
+
+        return regenerate.to_markdown(self.hierarchy, self.metadata)
+
     def to_dataframe(self) -> pd.DataFrame:
         """Converts the document to a pandas DataFrame.
 
@@ -451,6 +465,18 @@ def to_tree(
             self.documents, output_dir, show_line_numbers, show_type
         )
 
+    def to_markdown(self) -> dict[str, str]:
+        """Regenerates Markdown for every document in the batch.
+
+        Each document is keyed by the string form of its ``metadata["id"]``. If
+        two documents share an ID, the later one replaces the earlier entry.
+
+        Returns:
+            Dictionary mapping document IDs to their regenerated Markdown strings.
+
+        Raises:
+            KeyError: If a document's metadata has no ``"id"`` entry.
+        """
+        return {
+            str(document.metadata["id"]): document.to_markdown()
+            for document in self.documents
+        }
+
     def to_dataframe(self) -> pd.DataFrame:
         """Combines all documents into a single pandas DataFrame.
 
 
@@ -0,0 +1,75 @@
+"""Markdown regeneration from parsed hierarchical structures."""
+
+from headhunter.models import HierarchyContext
+
+
+def to_markdown(
+    hierarchy: list[HierarchyContext],
+    metadata: dict[str, object] | None = None,
+) -> str:
+    """Regenerates Markdown from parsed hierarchical structure.
+
+    This function converts a parsed document structure back into clean, properly
+    formatted Markdown. It processes the hierarchy linearly, converting headings
+    and content blocks according to the following rules:
+
+    - YAML front matter is generated from metadata if provided (and non-empty)
+    - Standard headings use hash (#) format based on hierarchical level; the
+      number of hashes is clamped to the 1-6 range that Markdown supports
+    - Inline headings use bold (**text:**) format
+    - An inline heading is merged onto one line with the entry that follows it
+      when that entry is a content token exactly one level deeper
+    - Content blocks are preserved as-is with single blank line spacing
+    - Original text case is preserved (including ALL CAPS)
+
+    Args:
+        hierarchy: List of HierarchyContext objects representing the document structure.
+        metadata: Optional metadata dictionary to include as YAML front matter.
+
+    Returns:
+        Regenerated Markdown string with YAML front matter (if metadata provided)
+        and properly formatted headings and content. The result always ends with
+        exactly one trailing newline.
+    """
+    lines: list[str] = []
+
+    if metadata:
+        # Values are rendered with str() as-is; no YAML quoting or escaping is
+        # applied to them.
+        lines.append("---")
+        lines.extend(f"{key}: {value}" for key, value in metadata.items())
+        lines.extend(("---", ""))
+
+    total = len(hierarchy)
+    i = 0
+    while i < total:
+        ctx = hierarchy[i]
+        token = ctx.token
+        consumed = 1
+
+        if token.type != "heading":
+            # Anything that is not a heading is emitted verbatim as content.
+            rendered = token.content
+        elif not (token.metadata and token.metadata.is_inline):
+            # Clamp on both sides: more than six hashes is not a valid heading,
+            # and zero hashes would emit a plain line with a stray leading space.
+            hashes = "#" * max(1, min(ctx.level, 6))
+            rendered = f"{hashes} {token.content}"
+        else:
+            following = hierarchy[i + 1] if i + 1 < total else None
+            if (
+                following is not None
+                and following.token.type == "content"
+                and following.level == ctx.level + 1
+            ):
+                # Fold the direct child content onto the heading's own line and
+                # skip over it so it is not emitted a second time.
+                rendered = f"**{token.content}:** {following.token.content}"
+                consumed = 2
+            else:
+                rendered = f"**{token.content}:**"
+
+        # Every rendered entry is followed by a single blank separator line.
+        lines.extend((rendered, ""))
+        i += consumed
+
+    # Drop the trailing separator(s) and finish with exactly one newline.
+    return "\n".join(lines).rstrip() + "\n"
@@ -7,43 +7,89 @@
 import pandas as pd
 import pytest
 
+# Sample data fixtures for tests
+
 
 @pytest.fixture
 def sample_mixed_markdown() -> str:
     """Sample markdown text with mixed heading styles for testing."""
-    return (pathlib.Path(__file__).parent / "fixtures" / "mixed.md").read_text()
+    return (
+        pathlib.Path(__file__).parent / "fixtures" / "sample_data" / "mixed.md"
+    ).read_text()
+
+
+@pytest.fixture
+def sample_dataframe() -> pd.DataFrame:
+    """Input DataFrame of markdown documents for batch processing tests."""
+    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
+    csv_path = fixtures_dir / "sample_data" / "sample_data.csv"
+    return pd.read_csv(csv_path)
+
+
+@pytest.fixture
+def sample_match_markdown() -> str:
+    """Sample markdown text for matcher testing.
+
+    Read from ``fixtures/sample_data/match.md`` next to this file.
+    """
+    path = pathlib.Path(__file__).parent / "fixtures" / "sample_data" / "match.md"
+    # Decode explicitly as UTF-8 so the fixture text does not depend on the
+    # platform's default locale encoding.
+    return path.read_text(encoding="utf-8")
+
+
+@pytest.fixture
+def sample_dataframe_match() -> pd.DataFrame:
+    """Input DataFrame of markdown documents for batch tests that use the matcher."""
+    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
+    csv_path = fixtures_dir / "sample_data" / "sample_data_match.csv"
+    return pd.read_csv(csv_path)
+
+
+# Expected output fixtures for tests
+## JSON outputs
 
 
 @pytest.fixture
 def sample_mixed_json() -> dict:
     """Expected JSON output for mixed markdown fixture."""
-    with open(pathlib.Path(__file__).parent / "fixtures" / "mixed.json") as f:
+    with open(
+        pathlib.Path(__file__).parent / "fixtures" / "expected_json" / "mixed.json"
+    ) as f:
         return json.load(f)
 
 
 @pytest.fixture
-def sample_match_markdown() -> str:
-    """Sample markdown text for matcher testing."""
-    return (pathlib.Path(__file__).parent / "fixtures" / "match.md").read_text()
+def expected_json_files() -> dict[str, dict]:
+    """Expected JSON output files for batch processing tests."""
+    json_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_json"
+    result = {}
+    for json_file in sorted(json_dir.glob("doc*.json")):
+        with open(json_file) as f:
+            result[json_file.name] = json.load(f)
+    return result
 
 
 @pytest.fixture
 def sample_match_json() -> dict:
     """Expected JSON output for match markdown fixture."""
-    with open(pathlib.Path(__file__).parent / "fixtures" / "match.json") as f:
+    with open(
+        pathlib.Path(__file__).parent / "fixtures" / "expected_json" / "match.json"
+    ) as f:
         return json.load(f)
 
 
-@pytest.fixture
-def sample_dataframe() -> pd.DataFrame:
-    """Sample DataFrame with markdown content for batch processing tests."""
-    return pd.read_csv(pathlib.Path(__file__).parent / "fixtures" / "sample_data.csv")
+## CSV outputs
 
 
 @pytest.fixture
 def sample_dataframe_parsed() -> pd.DataFrame:
     """Expected parsed output for sample_dataframe."""
-    path = pathlib.Path(__file__).parent / "fixtures" / "sample_data_parsed.csv"
+    path = (
+        pathlib.Path(__file__).parent
+        / "fixtures"
+        / "expected_csv"
+        / "sample_data_parsed.csv"
+    )
     df = pd.read_csv(path)
 
     # Convert string representations of lists back to actual lists
@@ -53,18 +99,15 @@ def sample_dataframe_parsed() -> pd.DataFrame:
     return df
 
 
-@pytest.fixture
-def sample_dataframe_match() -> pd.DataFrame:
-    """Sample DataFrame with markdown content for batch processing with matcher."""
-    return pd.read_csv(
-        pathlib.Path(__file__).parent / "fixtures" / "sample_data_match.csv"
-    )
-
-
 @pytest.fixture
 def sample_dataframe_match_parsed() -> pd.DataFrame:
     """Expected parsed output for sample_dataframe_match with matcher."""
-    path = pathlib.Path(__file__).parent / "fixtures" / "sample_data_match_parsed.csv"
+    path = (
+        pathlib.Path(__file__).parent
+        / "fixtures"
+        / "expected_csv"
+        / "sample_data_match_parsed.csv"
+    )
     df = pd.read_csv(path)
 
     # Convert string representations of lists back to actual lists
@@ -76,22 +119,34 @@ def sample_dataframe_match_parsed() -> pd.DataFrame:
     return df
 
 
-@pytest.fixture
-def expected_json_files() -> dict[str, dict]:
-    """Expected JSON output files for batch processing tests."""
-    json_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_json"
-    result = {}
-    for json_file in sorted(json_dir.glob("*.json")):
-        with open(json_file) as f:
-            result[json_file.name] = json.load(f)
-    return result
+## Tree outputs
 
 
 @pytest.fixture
 def expected_tree_files() -> dict[str, str]:
     """Expected tree output files for batch processing tests."""
     tree_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_tree"
     result = {}
-    for tree_file in sorted(tree_dir.glob("*.txt")):
+    for tree_file in sorted(tree_dir.glob("doc*.txt")):
         result[tree_file.name] = tree_file.read_text()
     return result
+
+
+## Markdown outputs
+
+
+@pytest.fixture
+def expected_markdown_files() -> dict[str, str]:
+    """Expected markdown output files for batch processing tests.
+
+    Returns:
+        Mapping from each ``doc*.md`` file's stem (file name without the ``.md``
+        suffix) in ``fixtures/expected_markdown`` to that file's text.
+    """
+    md_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_markdown"
+    # Sorted for a deterministic insertion order; decoded explicitly as UTF-8 so
+    # the expected text does not depend on the platform's default encoding.
+    return {
+        md_file.stem: md_file.read_text(encoding="utf-8")
+        for md_file in sorted(md_dir.glob("doc*.md"))
+    }
+
+
+@pytest.fixture
+def expected_markdown_match() -> str:
+    """Expected markdown output for match.md fixture with matcher.
+
+    Read from ``fixtures/expected_markdown/match.md`` next to this file.
+    """
+    path = pathlib.Path(__file__).parent / "fixtures" / "expected_markdown" / "match.md"
+    # Decode explicitly as UTF-8 so the expected text does not depend on the
+    # platform's default locale encoding.
+    return path.read_text(encoding="utf-8")
@@ -0,0 +1,14 @@
+---
+row_index: 0
+id: doc1
+category: A
+priority: 1
+---
+
+# Document 1
+
+This is the first document with some content.
+
+## Section 1.1
+
+More details here.
@@ -0,0 +1,16 @@
+---
+row_index: 1
+id: doc2
+category: B
+priority: 2
+---
+
+# Document 2
+
+## Overview
+
+Second document overview.
+
+### Details
+
+Nested content.