childmindresearch
diff --git a/‎src/headhunter/output.py‎
Lines changed: 2 additions & 1 deletion b/‎src/headhunter/output.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/conftest.py‎
Lines changed: 44 additions & 0 deletions b/‎tests/conftest.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎tests/fixtures/expected_json/doc1.json‎
Lines changed: 54 additions & 0 deletions b/‎tests/fixtures/expected_json/doc1.json‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎tests/fixtures/expected_json/doc2.json‎
Lines changed: 71 additions & 0 deletions b/‎tests/fixtures/expected_json/doc2.json‎
Lines changed: 71 additions & 0 deletions
diff --git a/‎tests/fixtures/expected_json/doc3.json‎
Lines changed: 60 additions & 0 deletions b/‎tests/fixtures/expected_json/doc3.json‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎tests/fixtures/expected_json/doc4.json‎
Lines changed: 71 additions & 0 deletions b/‎tests/fixtures/expected_json/doc4.json‎
Lines changed: 71 additions & 0 deletions
@@ -118,6 +118,7 @@ def to_json_file(
 
     with open(filepath, "w", encoding="utf-8") as f:
         json.dump(hierarchical_data, f, indent=indent, ensure_ascii=False)
+        f.write("\n")
 
     logger.debug(f"Exported JSON to {filepath}")
     return str(filepath)
@@ -204,7 +205,7 @@ def to_tree_string(
 
         lines.append(f"{prefix}{label}")
 
-    return "\n".join(lines)
+    return "\n".join(lines) + "\n"
 
 
 def _ensure_output_directory(output_dir: str | pathlib.Path) -> pathlib.Path:
 
@@ -51,3 +51,47 @@ def sample_dataframe_parsed() -> pd.DataFrame:
     df["parent_types"] = df["parent_types"].apply(ast.literal_eval)
 
     return df
+
+
+@pytest.fixture
+def sample_dataframe_match() -> pd.DataFrame:
+    """Load the raw markdown sample used for batch processing with a matcher.
+
+    Returns:
+        The contents of ``fixtures/sample_data_match.csv`` (resolved relative
+        to this file) as parsed by ``pd.read_csv`` with default options.
+    """
+    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
+    csv_path = fixtures_dir / "sample_data_match.csv"
+    return pd.read_csv(csv_path)
+
+
+@pytest.fixture
+def sample_dataframe_match_parsed() -> pd.DataFrame:
+    """Load the expected parsed output for ``sample_dataframe_match``.
+
+    The CSV stores list-valued columns as their string representation, so
+    those columns are converted back with ``ast.literal_eval`` after loading.
+
+    Returns:
+        The contents of ``fixtures/sample_data_match_parsed.csv`` with the
+        list-valued columns restored.
+    """
+    fixtures_dir = pathlib.Path(__file__).parent / "fixtures"
+    parsed = pd.read_csv(fixtures_dir / "sample_data_match_parsed.csv")
+
+    # Columns whose cells hold the string form of a list.
+    list_columns = (
+        "parents",
+        "parent_types",
+        "matched_headings",
+        "missing_headings",
+    )
+    for column in list_columns:
+        parsed[column] = parsed[column].apply(ast.literal_eval)
+
+    return parsed
+
+
+@pytest.fixture
+def expected_json_files() -> dict[str, dict]:
+    """Expected JSON output files for batch processing tests.
+
+    Returns:
+        Mapping of file name (e.g. ``"doc1.json"``) to the parsed content of
+        every ``*.json`` file in ``fixtures/expected_json``, inserted in
+        sorted path order.
+    """
+    json_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_json"
+    # Decode explicitly as UTF-8 so the parsed fixtures do not depend on the
+    # platform's locale encoding.
+    return {
+        json_file.name: json.loads(json_file.read_text(encoding="utf-8"))
+        for json_file in sorted(json_dir.glob("*.json"))
+    }
+
+
+@pytest.fixture
+def expected_tree_files() -> dict[str, str]:
+    """Expected tree output files for batch processing tests.
+
+    Returns:
+        Mapping of file name to the full text of every ``*.txt`` file in
+        ``fixtures/expected_tree``, inserted in sorted path order.
+    """
+    tree_dir = pathlib.Path(__file__).parent / "fixtures" / "expected_tree"
+    # Decode explicitly as UTF-8 so the expected text does not depend on the
+    # platform's locale encoding.
+    return {
+        tree_file.name: tree_file.read_text(encoding="utf-8")
+        for tree_file in sorted(tree_dir.glob("*.txt"))
+    }
@@ -0,0 +1,54 @@
+{
+    "row_index": 0,
+    "id": "doc1",
+    "category": "A",
+    "priority": 1,
+    "sections": [
+        {
+            "type": "heading",
+            "text": "Document 1",
+            "level": 1,
+            "line_number": 1,
+            "metadata": {
+                "marker": "#",
+                "marker_count": 1,
+                "case": "title_case",
+                "is_inline": false,
+                "is_extracted": false,
+                "extraction_position": null,
+                "signature": "#1"
+            },
+            "sections": [
+                {
+                    "type": "content",
+                    "text": "This is the first document with some content.",
+                    "level": 2,
+                    "line_number": 3
+                },
+                {
+                    "type": "heading",
+                    "text": "Section 1.1",
+                    "level": 2,
+                    "line_number": 5,
+                    "metadata": {
+                        "marker": "#",
+                        "marker_count": 2,
+                        "case": "title_case",
+                        "is_inline": false,
+                        "is_extracted": false,
+                        "extraction_position": null,
+                        "signature": "#2"
+                    },
+                    "sections": [
+                        {
+                            "type": "content",
+                            "text": "More details here.",
+                            "level": 3,
+                            "line_number": 7
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
@@ -0,0 +1,71 @@
+{
+    "row_index": 1,
+    "id": "doc2",
+    "category": "B",
+    "priority": 2,
+    "sections": [
+        {
+            "type": "heading",
+            "text": "Document 2",
+            "level": 1,
+            "line_number": 1,
+            "metadata": {
+                "marker": "#",
+                "marker_count": 1,
+                "case": "title_case",
+                "is_inline": false,
+                "is_extracted": false,
+                "extraction_position": null,
+                "signature": "#1"
+            },
+            "sections": [
+                {
+                    "type": "heading",
+                    "text": "Overview",
+                    "level": 2,
+                    "line_number": 3,
+                    "metadata": {
+                        "marker": "#",
+                        "marker_count": 2,
+                        "case": "title_case",
+                        "is_inline": false,
+                        "is_extracted": false,
+                        "extraction_position": null,
+                        "signature": "#2"
+                    },
+                    "sections": [
+                        {
+                            "type": "content",
+                            "text": "Second document overview.",
+                            "level": 3,
+                            "line_number": 5
+                        },
+                        {
+                            "type": "heading",
+                            "text": "Details",
+                            "level": 3,
+                            "line_number": 7,
+                            "metadata": {
+                                "marker": "#",
+                                "marker_count": 3,
+                                "case": "title_case",
+                                "is_inline": false,
+                                "is_extracted": false,
+                                "extraction_position": null,
+                                "signature": "#3"
+                            },
+                            "sections": [
+                                {
+                                    "type": "content",
+                                    "text": "Nested content.",
+                                    "level": 4,
+                                    "line_number": 9
+                                }
+                            ]
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
@@ -0,0 +1,60 @@
+{
+    "row_index": 2,
+    "id": "doc3",
+    "category": "A",
+    "priority": 3,
+    "sections": [
+        {
+            "type": "heading",
+            "text": "Document number",
+            "level": 1,
+            "line_number": 1,
+            "metadata": {
+                "marker": "*",
+                "marker_count": 2,
+                "case": "sentence_case",
+                "is_inline": true,
+                "is_extracted": false,
+                "extraction_position": null,
+                "signature": "*2-inline"
+            },
+            "sections": [
+                {
+                    "type": "content",
+                    "text": "3",
+                    "level": 2,
+                    "line_number": 1
+                }
+            ]
+        },
+        {
+            "type": "heading",
+            "text": "Document type",
+            "level": 1,
+            "line_number": 3,
+            "metadata": {
+                "marker": "*",
+                "marker_count": 2,
+                "case": "title_case",
+                "is_inline": true,
+                "is_extracted": false,
+                "extraction_position": null,
+                "signature": "*2-inline"
+            },
+            "sections": [
+                {
+                    "type": "content",
+                    "text": "Markdown",
+                    "level": 2,
+                    "line_number": 3
+                }
+            ]
+        },
+        {
+            "type": "content",
+            "text": "Simple document with minimal structure.",
+            "level": 1,
+            "line_number": 5
+        }
+    ]
+}
@@ -0,0 +1,71 @@
+{
+    "row_index": 3,
+    "id": "doc4",
+    "category": "C",
+    "priority": 1,
+    "sections": [
+        {
+            "type": "heading",
+            "text": "Document 4",
+            "level": 1,
+            "line_number": 1,
+            "metadata": {
+                "marker": "#",
+                "marker_count": 1,
+                "case": "title_case",
+                "is_inline": false,
+                "is_extracted": false,
+                "extraction_position": null,
+                "signature": "#1"
+            },
+            "sections": [
+                {
+                    "type": "heading",
+                    "text": "Introduction",
+                    "level": 2,
+                    "line_number": 3,
+                    "metadata": {
+                        "marker": "#",
+                        "marker_count": 2,
+                        "case": "title_case",
+                        "is_inline": false,
+                        "is_extracted": false,
+                        "extraction_position": null,
+                        "signature": "#2"
+                    },
+                    "sections": [
+                        {
+                            "type": "content",
+                            "text": "Fourth document.",
+                            "level": 3,
+                            "line_number": 5
+                        }
+                    ]
+                },
+                {
+                    "type": "heading",
+                    "text": "Conclusion",
+                    "level": 2,
+                    "line_number": 7,
+                    "metadata": {
+                        "marker": "#",
+                        "marker_count": 2,
+                        "case": "title_case",
+                        "is_inline": false,
+                        "is_extracted": false,
+                        "extraction_position": null,
+                        "signature": "#2"
+                    },
+                    "sections": [
+                        {
+                            "type": "content",
+                            "text": "Final thoughts.",
+                            "level": 3,
+                            "line_number": 9
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}