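Refactor ParsedBatch.to_markdown to export each document as an individual Markdown file (it now takes output_dir and returns the list of created file paths instead of a dict of Markdown strings), and add the write_markdown and batch_to_markdown_files functions to output.py for single-file and batch export.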

alperkent-cmi · alperkent-cmi · commit b80a7cb6f80b · 2026-01-14T12:19:20.000-05:00
diff --git a/src/headhunter/models.py b/src/headhunter/models.py
@@ -474,17 +474,18 @@ def to_tree(
             self.documents, output_dir, show_line_numbers, show_type
         )
 
-    def to_markdown(self) -> dict[str, str]:
-        """Regenerates Markdown for all documents in the batch.
+    def to_markdown(self, output_dir: str) -> list[str]:
+        """Exports each document to its own Markdown file in the output directory.
+
+        Args:
+            output_dir: Directory path where Markdown files will be saved.
 
         Returns:
-            Dictionary mapping document IDs to their regenerated Markdown strings.
+            List of created file paths.
         """
-        result: dict[str, str] = {}
-        for doc in self.documents:
-            doc_id = str(doc.metadata["id"])
-            result[doc_id] = doc.to_markdown()
-        return result
+        from headhunter import output
+
+        return output.batch_to_markdown_files(self.documents, output_dir)
 
     def to_dataframe(self) -> pd.DataFrame:
         """Combines all documents into a single pandas DataFrame.
diff --git a/src/headhunter/output.py b/src/headhunter/output.py
@@ -280,6 +280,30 @@ def to_markdown(
     return "\n".join(lines).rstrip() + "\n"
 
 
+def write_markdown(
+    hierarchy: list[models.HierarchyContext],
+    filepath: str | pathlib.Path,
+    metadata: dict[str, object] | None = None,
+) -> str:
+    """Exports hierarchy to a Markdown file.
+
+    Args:
+        hierarchy: List of HierarchyContext objects.
+        filepath: Path to output Markdown file.
+        metadata: Optional metadata to include as YAML front matter.
+
+    Returns:
+        Path to the created file as a string.
+    """
+    markdown_content = to_markdown(hierarchy, metadata)
+
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(markdown_content)
+
+    logger.debug(f"Exported Markdown to {filepath}")
+    return str(filepath)
+
+
 def _ensure_output_directory(output_dir: str | pathlib.Path) -> pathlib.Path:
     """Creates output directory if it doesn't exist.
 
@@ -369,6 +393,39 @@ def batch_to_tree_files(
     return created_files
 
 
+def batch_to_markdown_files(
+    documents: list[models.ParsedText],
+    output_dir: str | pathlib.Path,
+) -> list[str]:
+    """Exports each document to an individual Markdown file.
+
+    Args:
+        documents: List of ParsedText objects.
+        output_dir: Directory where Markdown files will be saved.
+
+    Returns:
+        List of created file paths.
+    """
+    output_path = _ensure_output_directory(output_dir)
+
+    logger.debug(
+        f"Exporting {len(documents)} documents to Markdown files in {output_dir}"
+    )
+    created_files: list[str] = []
+
+    for doc in tqdm(documents, desc="Exporting Markdown"):
+        doc_id = str(doc.metadata["id"])
+        filepath = output_path / f"{doc_id}.md"
+        created_file = write_markdown(
+            doc.hierarchy,
+            filepath,
+            metadata=doc.metadata,
+        )
+        created_files.append(str(created_file))
+
+    return created_files
+
+
 def _to_dataframe_rows(
     hierarchy: list[models.HierarchyContext],
     doc_id: str,
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -173,26 +173,29 @@ def test_to_markdown(
 def test_to_markdown_batch(
     sample_dataframe: pd.DataFrame,
     expected_markdown_files: dict[str, str],
+    tmp_path: pathlib.Path,
 ) -> None:
     """Test markdown regeneration for batch processing."""
     content_column = "content"
     id_column = "doc_id"
     metadata_columns = ["category", "priority"]
-    df = sample_dataframe
+    output_dir = tmp_path / "markdown_output"
 
     parsed_batch = process_batch_df(
-        df,
+        sample_dataframe,
         content_column=content_column,
         id_column=id_column,
         metadata_columns=metadata_columns,
     )
+    created_files = parsed_batch.to_markdown(str(output_dir))
 
-    markdown_dict = parsed_batch.to_markdown()
+    assert isinstance(created_files, list)
+    assert len(created_files) == len(sample_dataframe)
 
-    assert isinstance(markdown_dict, dict)
-    assert len(markdown_dict) == len(df)
-    assert set(markdown_dict.keys()) == set(df[id_column])
+    for filepath in created_files:
+        assert pathlib.Path(filepath).exists()
+        assert pathlib.Path(filepath).stem in expected_markdown_files
 
-    for doc_id, markdown in markdown_dict.items():
-        assert doc_id in expected_markdown_files
-        assert markdown == expected_markdown_files[doc_id]
+        with open(filepath, "r", encoding="utf-8") as f:
+            markdown = f.read()
+        assert markdown == expected_markdown_files[pathlib.Path(filepath).stem]