Skip to content

Commit e2a1070

Browse files
committed
Implement structured data processing capabilities
1 parent b80a7cb commit e2a1070

24 files changed

Lines changed: 1165 additions & 43 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ repos:
3232
- id: check-case-conflict
3333
- id: end-of-file-fixer
3434
- id: trailing-whitespace
35+
exclude: tests/fixtures/expected_markdown/patient_003\.md
3536
- id: pretty-format-json
3637
args:
3738
- --autofix

README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ A parser for extracting headings and hierarchical structure from Markdown files.
1515
- Build hierarchical structure from headings
1616
- Fuzzy heading matching to extract expected headings from improperly formatted documents, even with typos or spelling variations
1717
- Process single documents or batches from DataFrames
18+
- Convert structured tabular data (e.g., CSV exports) into hierarchical report format
1819
- Export results to DataFrame, JSON, tree visualizations or regenerated clean Markdown
1920
- Configurable parsing rules and word limits
2021

@@ -139,6 +140,36 @@ parsed = headhunter.process_text(
139140
print(parsed.metadata) # includes: matched_count, expected_count, match_percentage
140141
```
141142

143+
**Process structured data (CSV exports):**
144+
145+
```python
146+
import pandas as pd
147+
import headhunter
148+
149+
# DataFrame with multiple content columns (not markdown)
150+
df = pd.DataFrame(
151+
{
152+
"patient_id": ["P001", "P002"],
153+
"date": ["2025-01-10", "2025-01-11"],
154+
"name": ["John Smith", "Jane Doe"],
155+
"diagnosis": ["Anxiety Disorder", "Depression"],
156+
"notes": ["Initial consultation.", "Follow-up visit."],
157+
}
158+
)
159+
160+
# Convert to parsed structure (column headers become headings)
161+
parsed_batch = headhunter.process_structured_df(
162+
df,
163+
id_column="patient_id",
164+
metadata_columns=["date"],
165+
# rest of the columns auto-detected as content_columns if not specified
166+
)
167+
168+
# All output formats work the same as markdown parsing
169+
parsed_batch.to_markdown("reports/") # Creates report-style markdown files with column headers as inline colon headings and cell values as content
170+
parsed_batch.to_dataframe() # Long-form DataFrame in the same format as markdown parsing
171+
```
172+
142173
## How Hierarchy is Built
143174

144175
`headhunter` recognizes different heading styles in Markdown and builds a hierarchical structure by assigning levels to each heading. The following rules govern this process:
@@ -239,3 +270,22 @@ The `to_markdown()` method converts the parsed hierarchical structure back into
239270
- **YAML front matter**: Metadata is included as YAML front matter at the top of the document
240271
- **Consistent spacing**: Single blank lines between sections for readability
241272
- **Case preservation**: Original text case is maintained (including ALL CAPS)
273+
274+
## Structured Data Processing
275+
276+
In addition to parsing markdown documents, `headhunter` can convert tabular data with multiple content columns (for example, a CSV export loaded into a pandas DataFrame) into the same parsed structure. This makes all of the same downstream analysis logic and output formats available for tabular data.
277+
278+
**Use cases:**
279+
280+
- Convert flat database exports into a long-form DataFrame
281+
- Generate markdown reports from structured data
282+
- Apply consistent analysis pipelines to both markdown and tabular data sources
283+
284+
**How it works:**
285+
286+
`process_structured_df()` treats each row as a separate document and each content column as a section:
287+
288+
- **Column headers** become level-1 headings
289+
- **Cell values** become level-2 content under their respective column heading
290+
- **Empty cells (NaN)** are converted to empty strings
291+
- **Column order** is preserved in the output

src/headhunter/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
""".. include:: ../../README.md""" # noqa: D415
22

3-
from headhunter.api import process_batch_df, process_text
3+
from headhunter.api import process_batch_df, process_structured_df, process_text
44
from headhunter.config import ParserConfig
55
from headhunter.models import ParsedBatch, ParsedText
66

77
__all__ = [
88
"process_text",
99
"process_batch_df",
10+
"process_structured_df",
1011
"ParserConfig",
1112
"ParsedText",
1213
"ParsedBatch",

src/headhunter/api.py

Lines changed: 152 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,29 +108,36 @@ def process_batch_df(
108108
- batch.to_dataframe() for pandas DataFrame
109109
- batch.to_json(output_dir) for JSON files
110110
- batch.to_tree(output_dir) for tree visualizations
111+
- batch.to_markdown(output_dir) for markdown files
111112
112113
Raises:
113114
ValueError: If required columns don't exist in DataFrame.
114115
"""
115116
if content_column not in df.columns:
116-
raise ValueError(
117+
msg = (
117118
f"Column '{content_column}' not found in dataframe. "
118119
f"Available columns: {list(df.columns)}"
119120
)
121+
logger.error(msg)
122+
raise ValueError(msg)
120123

121124
if id_column is not None and id_column not in df.columns:
122-
raise ValueError(
125+
msg = (
123126
f"Column '{id_column}' not found in dataframe. "
124127
f"Available columns: {list(df.columns)}"
125128
)
129+
logger.error(msg)
130+
raise ValueError(msg)
126131

127132
if metadata_columns is not None:
128133
missing_columns = [col for col in metadata_columns if col not in df.columns]
129134
if missing_columns:
130-
raise ValueError(
135+
msg = (
131136
f"Metadata columns {missing_columns} not found. "
132137
f"Available columns: {list(df.columns)}"
133138
)
139+
logger.error(msg)
140+
raise ValueError(msg)
134141

135142
if config is None:
136143
config = _config.ParserConfig()
@@ -209,3 +216,145 @@ def process_batch_df(
209216
)
210217

211218
return batch
219+
220+
221+
def process_structured_df(
    df: pd.DataFrame,
    id_column: str | None = None,
    metadata_columns: list[str] | None = None,
    content_columns: list[str] | None = None,
) -> models.ParsedBatch:
    """Processes a DataFrame with multiple content columns into ParsedBatch.

    Converts "database dump" style CSVs where each column (other than ID and metadata)
    represents a content field with the column header as its parent heading. Each row
    becomes one ParsedText object with a flat hierarchy (level 1 for column headers,
    level 2 for content).

    Rows are processed independently: an exception raised while converting a row is
    logged and recorded in the returned batch's ``errors`` list, and processing
    continues with the next row.

    Args:
        df: Input DataFrame with multiple content columns.
        id_column: Name of column to use as document ID. If None, this function
            stores no ``id`` entry in the document metadata and error records carry
            ``doc_id=None``. NOTE(review): any fallback ID generation would have to
            happen downstream (e.g. in the models) - confirm. Defaults to None.
        metadata_columns: List of column names to include as document metadata (e.g.,
            'date', 'category'). Defaults to None.
        content_columns: List of column names to treat as content fields. If None,
            auto-detects as all columns except id_column and metadata_columns. Column
            order is preserved. Defaults to None.

    Returns:
        ParsedBatch object containing one ParsedText per successfully processed row.
        Use the object's methods to export:
        - batch.to_dataframe() for pandas DataFrame
        - batch.to_json(output_dir) for JSON files
        - batch.to_tree(output_dir) for tree visualizations
        - batch.to_markdown(output_dir) for markdown files

    Raises:
        ValueError: If required columns don't exist in DataFrame, or if no content
            columns are specified or detected.
    """
    if id_column is not None and id_column not in df.columns:
        msg = (
            f"Column '{id_column}' not found in dataframe. "
            f"Available columns: {list(df.columns)}"
        )
        logger.error(msg)
        raise ValueError(msg)

    if metadata_columns is not None:
        missing_columns = [col for col in metadata_columns if col not in df.columns]
        if missing_columns:
            msg = (
                f"Metadata columns {missing_columns} not found. "
                f"Available columns: {list(df.columns)}"
            )
            logger.error(msg)
            raise ValueError(msg)

    if content_columns is None:
        # Auto-detect: every column that is neither the ID nor metadata, in the
        # DataFrame's own column order.
        excluded_columns = set()
        if id_column is not None:
            excluded_columns.add(id_column)
        if metadata_columns is not None:
            excluded_columns.update(metadata_columns)
        content_columns = [col for col in df.columns if col not in excluded_columns]
    else:
        missing_columns = [col for col in content_columns if col not in df.columns]
        if missing_columns:
            msg = (
                f"Content columns {missing_columns} not found. "
                f"Available columns: {list(df.columns)}"
            )
            logger.error(msg)
            raise ValueError(msg)

    if not content_columns:
        msg = (
            "No content columns specified or detected. "
            "At least one content column is required."
        )
        logger.error(msg)
        raise ValueError(msg)

    logger.info(
        f"Starting structured processing of {len(df)} rows with "
        f"{len(content_columns)} content columns"
    )

    documents: list[models.ParsedText] = []
    errors: list[dict[str, str | int | None]] = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        doc_metadata: dict[str, object] = {"row_index": idx}

        if id_column is not None:
            doc_metadata["id"] = row[id_column]

        if metadata_columns is not None:
            for col in metadata_columns:
                doc_metadata[col] = row[col]

        try:
            tokens = parser.structured_row_to_tokens(row, content_columns)

            hierarchies, _ = hierarchy.build_structured_hierarchy(tokens)

            parsed_text = models.ParsedText(
                text="",  # No original markdown text
                config=_config.ParserConfig(),  # Default config
                metadata=doc_metadata,
                tokens=tokens,
                hierarchy=hierarchies,
                warnings=[],
            )
            documents.append(parsed_text)

        except Exception as e:
            # "id" is only present when id_column was given; use .get() so a row
            # failure without an ID column is recorded instead of raising KeyError
            # from inside this handler and aborting the whole batch.
            doc_id = doc_metadata.get("id")
            logger.error(
                f"Error processing structured row {idx} (doc_id: {doc_id}): {str(e)}"
            )
            tb = traceback.format_exc()
            error_dict = {
                "doc_id": doc_id,
                "row_index": idx,
                "error_type": type(e).__name__,
                "message": str(e),
                "line_number": None,
                "traceback": tb,
            }
            errors.append(error_dict)

    batch = models.ParsedBatch(
        documents=documents,
        config=_config.ParserConfig(),
        errors=errors,
        warnings=[],
        metadata_columns=metadata_columns,
    )

    logger.info(
        "Structured processing complete: "
        f"{len(documents)} successful, {len(errors)} errors"
    )

    return batch

src/headhunter/hierarchy.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,9 @@ def _compute_heading_level(
184184
"""
185185
metadata = token.metadata
186186
if metadata is None:
187-
raise ValueError(f"Heading token missing metadata: {token}")
187+
msg = f"Heading token missing metadata: {token}"
188+
logger.error(msg)
189+
raise ValueError(msg)
188190

189191
if metadata.is_all_caps:
190192
return self._compute_all_caps_level(metadata, state, heading_stack)
@@ -331,7 +333,7 @@ def build(
331333
if metadata is not None and not metadata.is_inline:
332334
state.last_heading_level = level
333335

334-
elif token.type == "content":
336+
else: # token.type == "content"
335337
# Calculate level: Use the level of the last heading on stack + 1
336338
# (The stack includes inline headings, while last_heading_level doesn't)
337339
level = heading_stack[-1][0] + 1 if heading_stack else 1
@@ -350,3 +352,60 @@ def build(
350352
)
351353

352354
return context_list, warnings
355+
356+
357+
def build_structured_hierarchy(
    tokens: list[models.Token],
) -> tuple[list[models.HierarchyContext], list[str]]:
    """Builds the two-level hierarchy used for structured (multi-column) rows.

    Each heading token (a column name) is placed at level 1 with no parents. Every
    other token (a cell value) is placed at level 2, parented by the most recently
    seen heading, with parent type "column". A non-heading token that appears before
    any heading still gets level 2 but has no parents.

    Args:
        tokens: Tokens for one row, as produced by structured_row_to_tokens.

    Returns:
        A tuple of (hierarchy_contexts, warnings). hierarchy_contexts holds one
        HierarchyContext per input token, in input order. warnings is empty unless
        no tokens were supplied, in which case it holds a single message.
    """
    if not tokens:
        empty_msg = "No tokens provided for structured hierarchy building"
        logger.debug(empty_msg)
        return [], [empty_msg]

    notes: list[str] = []
    contexts: list[models.HierarchyContext] = []
    column_heading: models.Token | None = None

    for tok in tokens:
        parents: list[str] = []
        parent_types: list[str] = []

        if tok.type == "heading":
            depth = 1
            column_heading = tok
        else:  # content token
            depth = 2
            if column_heading is not None:
                parents.append(column_heading.content)
                parent_types.append("column")

        contexts.append(
            models.HierarchyContext(
                token=tok,
                level=depth,
                parents=parents,
                parent_types=parent_types,
            )
        )

    return contexts, notes

0 commit comments

Comments
 (0)