childmindresearch
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 50 additions & 0 deletions b/‎README.md‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎src/headhunter/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎src/headhunter/__init__.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/headhunter/api.py‎
Lines changed: 152 additions & 3 deletions b/‎src/headhunter/api.py‎
Lines changed: 152 additions & 3 deletions
diff --git a/‎src/headhunter/hierarchy.py‎
Lines changed: 61 additions & 2 deletions b/‎src/headhunter/hierarchy.py‎
Lines changed: 61 additions & 2 deletions
@@ -32,6 +32,7 @@ repos:
   - id: check-case-conflict
   - id: end-of-file-fixer
   - id: trailing-whitespace
+    exclude: tests/fixtures/expected_markdown/patient_003\.md
   - id: pretty-format-json
     args:
     - --autofix
 
@@ -15,6 +15,7 @@ A parser for extracting headings and hierarchical structure from Markdown files.
 - Build hierarchical structure from headings
 - Fuzzy heading matching to extract expected headings from improperly formatted documents, even with typos or spelling variations
 - Process single documents or batches from DataFrames
+- Convert structured tabular data (e.g., CSV exports) into hierarchical report format
 - Export results to DataFrame, JSON, tree visualizations or regenerated clean Markdown
 - Configurable parsing rules and word limits
 
@@ -139,6 +140,36 @@ parsed = headhunter.process_text(
 print(parsed.metadata)  # includes: matched_count, expected_count, match_percentage
 ```
 
+**Process structured data (CSV exports):**
+
+```python
+import pandas as pd
+import headhunter
+
+# DataFrame with multiple content columns (not markdown)
+df = pd.DataFrame(
+    {
+        "patient_id": ["P001", "P002"],
+        "date": ["2025-01-10", "2025-01-11"],
+        "name": ["John Smith", "Jane Doe"],
+        "diagnosis": ["Anxiety Disorder", "Depression"],
+        "notes": ["Initial consultation.", "Follow-up visit."],
+    }
+)
+
+# Convert to parsed structure (column headers become headings)
+parsed_batch = headhunter.process_structured_df(
+    df,
+    id_column="patient_id",
+    metadata_columns=["date"],
+    # content_columns omitted: every remaining column is auto-detected as content
+)
+
+# All output formats work the same as markdown parsing
+parsed_batch.to_markdown("reports/")  # Creates report-style markdown files with column headers as inline colon headings and cell values as content
+parsed_batch.to_dataframe()  # Long-form DataFrame in the same format as markdown parsing
+```
+
 ## How Hierarchy is Built
 
 `headhunter` recognizes different heading styles in Markdown and builds a hierarchical structure by assigning levels to each heading. The following rules govern this process:
@@ -239,3 +270,22 @@ The `to_markdown()` method converts the parsed hierarchical structure back into
 - **YAML front matter**: Metadata is included as YAML front matter at the top of the document
 - **Consistent spacing**: Single blank lines between sections for readability
 - **Case preservation**: Original text case is maintained (including ALL CAPS)
+
+## Structured Data Processing
+
+In addition to parsing Markdown documents, `headhunter` can convert tabular data with multiple content columns (for example, a CSV export loaded into a pandas DataFrame) into the same parsed structure. This makes all the same downstream analysis logic and output formats available for tabular data.
+
+**Use cases:**
+
+- Convert flat database exports into a long-form DataFrame
+- Generate markdown reports from structured data
+- Apply consistent analysis pipelines to both markdown and tabular data sources
+
+**How it works:**
+
+`process_structured_df()` treats each row as a separate document and each content column as a section:
+
+- **Column headers** become level-1 headings
+- **Cell values** become level-2 content under their respective column heading
+- **Empty cells (NaN)** are converted to empty strings
+- **Column order** is preserved in the output
@@ -1,12 +1,13 @@
 """.. include:: ../../README.md"""  # noqa: D415
 
-from headhunter.api import process_batch_df, process_text
+from headhunter.api import process_batch_df, process_structured_df, process_text
 from headhunter.config import ParserConfig
 from headhunter.models import ParsedBatch, ParsedText
 
 __all__ = [
     "process_text",
     "process_batch_df",
+    "process_structured_df",
     "ParserConfig",
     "ParsedText",
     "ParsedBatch",
 
@@ -108,29 +108,36 @@ def process_batch_df(
         - batch.to_dataframe() for pandas DataFrame
         - batch.to_json(output_dir) for JSON files
         - batch.to_tree(output_dir) for tree visualizations
+        - batch.to_markdown(output_dir) for markdown files
 
     Raises:
         ValueError: If required columns don't exist in DataFrame.
     """
     if content_column not in df.columns:
-        raise ValueError(
+        msg = (
             f"Column '{content_column}' not found in dataframe. "
             f"Available columns: {list(df.columns)}"
         )
+        logger.error(msg)
+        raise ValueError(msg)
 
     if id_column is not None and id_column not in df.columns:
-        raise ValueError(
+        msg = (
             f"Column '{id_column}' not found in dataframe. "
             f"Available columns: {list(df.columns)}"
         )
+        logger.error(msg)
+        raise ValueError(msg)
 
     if metadata_columns is not None:
         missing_columns = [col for col in metadata_columns if col not in df.columns]
         if missing_columns:
-            raise ValueError(
+            msg = (
                 f"Metadata columns {missing_columns} not found. "
                 f"Available columns: {list(df.columns)}"
             )
+            logger.error(msg)
+            raise ValueError(msg)
 
     if config is None:
         config = _config.ParserConfig()
@@ -209,3 +216,145 @@ def process_batch_df(
     )
 
     return batch
+
+
+def process_structured_df(
+    df: pd.DataFrame,
+    id_column: str | None = None,
+    metadata_columns: list[str] | None = None,
+    content_columns: list[str] | None = None,
+) -> models.ParsedBatch:
+    """Processes a DataFrame with multiple content columns into ParsedBatch.
+
+    Converts "database dump" style CSVs where each column (other than ID and metadata)
+    represents a content field with the column header as its parent heading. Each row
+    becomes one ParsedText object with a flat hierarchy (level 1 for column headers,
+    level 2 for content).
+
+    Args:
+        df: Input DataFrame with multiple content columns.
+        id_column: Name of column to use as document ID. If None, generates hash from
+            row content. Defaults to None.
+        metadata_columns: List of column names to include as document metadata (e.g.,
+            'date', 'category'). Defaults to None.
+        content_columns: List of column names to treat as content fields. If None,
+            auto-detects as all columns except id_column and metadata_columns. Column
+            order is preserved. Defaults to None.
+
+    Returns:
+        ParsedBatch object containing one ParsedText per row. Use the object's methods
+        to export:
+        - batch.to_dataframe() for pandas DataFrame
+        - batch.to_json(output_dir) for JSON files
+        - batch.to_tree(output_dir) for tree visualizations
+        - batch.to_markdown(output_dir) for markdown files
+
+    Raises:
+        ValueError: If required columns don't exist in DataFrame.
+    """
+    if id_column is not None and id_column not in df.columns:
+        msg = (
+            f"Column '{id_column}' not found in dataframe. "
+            f"Available columns: {list(df.columns)}"
+        )
+        logger.error(msg)
+        raise ValueError(msg)
+
+    if metadata_columns is not None:
+        missing_columns = [col for col in metadata_columns if col not in df.columns]
+        if missing_columns:
+            msg = (
+                f"Metadata columns {missing_columns} not found. "
+                f"Available columns: {list(df.columns)}"
+            )
+            logger.error(msg)
+            raise ValueError(msg)
+
+    if content_columns is None:
+        excluded_columns = set()
+        if id_column is not None:
+            excluded_columns.add(id_column)
+        if metadata_columns is not None:
+            excluded_columns.update(metadata_columns)
+        content_columns = [col for col in df.columns if col not in excluded_columns]
+    else:
+        missing_columns = [col for col in content_columns if col not in df.columns]
+        if missing_columns:
+            msg = (
+                f"Content columns {missing_columns} not found. "
+                f"Available columns: {list(df.columns)}"
+            )
+            logger.error(msg)
+            raise ValueError(msg)
+
+    if not content_columns:
+        msg = (
+            "No content columns specified or detected. "
+            "At least one content column is required."
+        )
+        logger.error(msg)
+        raise ValueError(msg)
+
+    logger.info(
+        f"Starting structured processing of {len(df)} rows with "
+        f"{len(content_columns)} content columns"
+    )
+
+    documents: list[models.ParsedText] = []
+    errors: list[dict[str, str | int | None]] = []
+
+    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
+        doc_metadata: dict[str, object] = {"row_index": idx}
+
+        if id_column is not None:
+            doc_metadata["id"] = row[id_column]
+
+        if metadata_columns is not None:
+            for col in metadata_columns:
+                doc_metadata[col] = row[col]
+
+        try:
+            tokens = parser.structured_row_to_tokens(row, content_columns)
+
+            hierarchies, _ = hierarchy.build_structured_hierarchy(tokens)
+
+            parsed_text = models.ParsedText(
+                text="",  # No original markdown text
+                config=_config.ParserConfig(),  # Default config
+                metadata=doc_metadata,
+                tokens=tokens,
+                hierarchy=hierarchies,
+                warnings=[],
+            )
+            documents.append(parsed_text)
+
+        except Exception as e:
+            doc_id = doc_metadata["id"]
+            logger.error(
+                f"Error processing structured row {idx} (doc_id: {doc_id}): {str(e)}"
+            )
+            tb = traceback.format_exc()
+            error_dict = {
+                "doc_id": doc_id,
+                "row_index": idx,
+                "error_type": type(e).__name__,
+                "message": str(e),
+                "line_number": None,
+                "traceback": tb,
+            }
+            errors.append(error_dict)
+
+    batch = models.ParsedBatch(
+        documents=documents,
+        config=_config.ParserConfig(),
+        errors=errors,
+        warnings=[],
+        metadata_columns=metadata_columns,
+    )
+
+    logger.info(
+        "Structured processing complete: "
+        f"{len(documents)} successful, {len(errors)} errors"
+    )
+
+    return batch
@@ -184,7 +184,9 @@ def _compute_heading_level(
         """
         metadata = token.metadata
         if metadata is None:
-            raise ValueError(f"Heading token missing metadata: {token}")
+            msg = f"Heading token missing metadata: {token}"
+            logger.error(msg)
+            raise ValueError(msg)
 
         if metadata.is_all_caps:
             return self._compute_all_caps_level(metadata, state, heading_stack)
@@ -331,7 +333,7 @@ def build(
                 if metadata is not None and not metadata.is_inline:
                     state.last_heading_level = level
 
-            elif token.type == "content":
+            else:  # token.type == "content"
                 # Calculate level: Use the level of the last heading on stack + 1
                 # (The stack includes inline headings, while last_heading_level doesn't)
                 level = heading_stack[-1][0] + 1 if heading_stack else 1
@@ -350,3 +352,60 @@ def build(
             )
 
         return context_list, warnings
+
+
+def build_structured_hierarchy(
+    tokens: list[models.Token],
+) -> tuple[list[models.HierarchyContext], list[str]]:
+    """Builds flat hierarchical structure from structured (multi-column CSV) tokens.
+
+    For structured data, the hierarchy is flat:
+    - Headings (column names) are level 1
+    - Content (cell values) are level 2, with the column name as parent
+
+    Args:
+        tokens: List of tokens from structured_row_to_tokens (alternating
+            heading/content pairs).
+
+    Returns:
+        A tuple of (hierarchy_contexts, warnings) where hierarchy_contexts is a list of
+        HierarchyContext objects and warnings is a list of warning messages.
+    """
+    warnings: list[str] = []
+
+    if not tokens:
+        warning_msg = "No tokens provided for structured hierarchy building"
+        logger.debug(warning_msg)
+        warnings.append(warning_msg)
+        return [], warnings
+
+    context_list: list[models.HierarchyContext] = []
+    current_heading: models.Token | None = None
+
+    for token in tokens:
+        if token.type == "heading":
+            context = models.HierarchyContext(
+                token=token,
+                level=1,
+                parents=[],
+                parent_types=[],
+            )
+            context_list.append(context)
+            current_heading = token
+        else:  # token.type == "content"
+            if current_heading is not None:
+                parents = [current_heading.content]
+                parent_types = ["column"]
+            else:
+                parents = []
+                parent_types = []
+
+            context = models.HierarchyContext(
+                token=token,
+                level=2,
+                parents=parents,
+                parent_types=parent_types,
+            )
+            context_list.append(context)
+
+    return context_list, warnings