Skip to content

Commit b80a7cb

Browse files
committed
Refactor to_markdown in ParsedBatch to export documents as individual Markdown files and add batch_to_markdown_files function for batch processing.
1 parent ac85e35 commit b80a7cb

3 files changed

Lines changed: 78 additions & 17 deletions

File tree

src/headhunter/models.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -474,17 +474,18 @@ def to_tree(
474474
self.documents, output_dir, show_line_numbers, show_type
475475
)
476476

477-
def to_markdown(self) -> dict[str, str]:
478-
"""Regenerates Markdown for all documents in the batch.
477+
def to_markdown(self, output_dir: str) -> list[str]:
478+
"""Exports each document to its own Markdown file in the output directory.
479+
480+
Args:
481+
output_dir: Directory path where Markdown files will be saved.
479482
480483
Returns:
481-
Dictionary mapping document IDs to their regenerated Markdown strings.
484+
List of created file paths.
482485
"""
483-
result: dict[str, str] = {}
484-
for doc in self.documents:
485-
doc_id = str(doc.metadata["id"])
486-
result[doc_id] = doc.to_markdown()
487-
return result
486+
from headhunter import output
487+
488+
return output.batch_to_markdown_files(self.documents, output_dir)
488489

489490
def to_dataframe(self) -> pd.DataFrame:
490491
"""Combines all documents into a single pandas DataFrame.

src/headhunter/output.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -280,6 +280,30 @@ def to_markdown(
280280
return "\n".join(lines).rstrip() + "\n"
281281

282282

283+
def write_markdown(
284+
hierarchy: list[models.HierarchyContext],
285+
filepath: str | pathlib.Path,
286+
metadata: dict[str, object] | None = None,
287+
) -> str:
288+
"""Exports hierarchy to a Markdown file.
289+
290+
Args:
291+
hierarchy: List of HierarchyContext objects.
292+
filepath: Path to output Markdown file.
293+
metadata: Optional metadata to include as YAML front matter.
294+
295+
Returns:
296+
Path to the created file as a string.
297+
"""
298+
markdown_content = to_markdown(hierarchy, metadata)
299+
300+
with open(filepath, "w", encoding="utf-8") as f:
301+
f.write(markdown_content)
302+
303+
logger.debug(f"Exported Markdown to {filepath}")
304+
return str(filepath)
305+
306+
283307
def _ensure_output_directory(output_dir: str | pathlib.Path) -> pathlib.Path:
284308
"""Creates output directory if it doesn't exist.
285309
@@ -369,6 +393,39 @@ def batch_to_tree_files(
369393
return created_files
370394

371395

396+
def batch_to_markdown_files(
397+
documents: list[models.ParsedText],
398+
output_dir: str | pathlib.Path,
399+
) -> list[str]:
400+
"""Exports each document to an individual Markdown file.
401+
402+
Args:
403+
documents: List of ParsedText objects.
404+
output_dir: Directory where Markdown files will be saved.
405+
406+
Returns:
407+
List of created file paths.
408+
"""
409+
output_path = _ensure_output_directory(output_dir)
410+
411+
logger.debug(
412+
f"Exporting {len(documents)} documents to Markdown files in {output_dir}"
413+
)
414+
created_files: list[str] = []
415+
416+
for doc in tqdm(documents, desc="Exporting Markdown"):
417+
doc_id = str(doc.metadata["id"])
418+
filepath = output_path / f"{doc_id}.md"
419+
created_file = write_markdown(
420+
doc.hierarchy,
421+
filepath,
422+
metadata=doc.metadata,
423+
)
424+
created_files.append(str(created_file))
425+
426+
return created_files
427+
428+
372429
def _to_dataframe_rows(
373430
hierarchy: list[models.HierarchyContext],
374431
doc_id: str,

tests/test_api.py

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -173,26 +173,29 @@ def test_to_markdown(
173173
def test_to_markdown_batch(
174174
sample_dataframe: pd.DataFrame,
175175
expected_markdown_files: dict[str, str],
176+
tmp_path: pathlib.Path,
176177
) -> None:
177178
"""Test markdown regeneration for batch processing."""
178179
content_column = "content"
179180
id_column = "doc_id"
180181
metadata_columns = ["category", "priority"]
181-
df = sample_dataframe
182+
output_dir = tmp_path / "markdown_output"
182183

183184
parsed_batch = process_batch_df(
184-
df,
185+
sample_dataframe,
185186
content_column=content_column,
186187
id_column=id_column,
187188
metadata_columns=metadata_columns,
188189
)
190+
created_files = parsed_batch.to_markdown(str(output_dir))
189191

190-
markdown_dict = parsed_batch.to_markdown()
192+
assert isinstance(created_files, list)
193+
assert len(created_files) == len(sample_dataframe)
191194

192-
assert isinstance(markdown_dict, dict)
193-
assert len(markdown_dict) == len(df)
194-
assert set(markdown_dict.keys()) == set(df[id_column])
195+
for filepath in created_files:
196+
assert pathlib.Path(filepath).exists()
197+
assert pathlib.Path(filepath).stem in expected_markdown_files
195198

196-
for doc_id, markdown in markdown_dict.items():
197-
assert doc_id in expected_markdown_files
198-
assert markdown == expected_markdown_files[doc_id]
199+
with open(filepath, "r", encoding="utf-8") as f:
200+
markdown = f.read()
201+
assert markdown == expected_markdown_files[pathlib.Path(filepath).stem]

0 commit comments

Comments
 (0)