childmindresearch
diff --git a/README.md b/README.md
Lines changed: 62 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 3 additions & 3 deletions
diff --git a/src/headhunter/models.py b/src/headhunter/models.py
Lines changed: 18 additions & 20 deletions
@@ -110,3 +110,65 @@ parsed_batch.to_tree("tree_outputs/")
 # Export parsed data to a CSV file
 df_parsed.to_csv("parsed_data.csv")
 ```
+
+## How Hierarchy is Built
+
+Headhunter recognizes different heading styles in Markdown and builds a hierarchical structure by assigning a level to each heading. The following rules govern this process:
+
+### Basic Principles
+
+- **Headings create structure**: Each heading creates a new section in the document's outline
+- **Content follows headings**: Regular text is always nested under its nearest heading above
+- **First heading starts at level 1**: The first heading in a document becomes the top level
+
+### Rules for Different Heading Types
+
+#### Hash Headings (`#`, `##`, `###`)
+
+These work as expected in standard Markdown:
+
+- More `#` symbols = deeper in the hierarchy
+- `# Title` → level 1
+- `## Subtitle` → level 2
+- `### Sub-subtitle` → level 3
+
+The level increases or decreases based on how many more or fewer `#` symbols are present compared to the previous hash heading.
+
+#### Bold and Italic Headings (`**text**`, `*text*`, `***text***`)
+
+These follow a specific hierarchy from highest to lowest:
+
+1. `**Bold text**` (2 asterisks) = highest level
+2. `***Bold and italic***` (3 asterisks) = middle level
+3. `*Italic text*` (1 asterisk) = lowest level
+
+When switching between these styles, the level adjusts by just one step up or down:
+
+- Going from bold (`**`) to italic (`*`) moves one level deeper
+- Going from italic (`*`) to bold (`**`) moves one level shallower
+- Using the same style consecutively keeps the same level
+
+#### ALL CAPS HEADINGS
+
+When a heading with a hash (`#`) or asterisk (`**`) marker is written in ALL CAPITAL LETTERS, special rules apply:
+
+- The first ALL CAPS heading sets its level based on what came before it
+- Every subsequent ALL CAPS heading uses that same level (they are treated as peers)
+
+Examples:
+
+- `# ALL CAPS HEADING` - Valid heading (hash marker with ALL CAPS text)
+- `**ALL CAPS HEADING**` - Valid heading (asterisk marker with ALL CAPS text)
+- `ALL CAPS HEADING` - Not a heading (no marker, treated as content)
+
+#### Inline Headings (with colons)
+
+When a heading ends with a colon (like `**Name:** Jane Doe`), it works differently:
+
+- The heading itself goes one level deeper than the previous heading
+- The content immediately after it is always treated as the deepest level
+- After that content, we return to the normal hierarchy
+
+### Mixed Heading Styles
+
+Different heading styles can be mixed in the same document. When switching from one style to another, the new heading typically goes one level deeper than the previous one. However, the specific rules for each style (described above) still apply.
@@ -10,8 +10,7 @@ license = "LGPL-2.1-only"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
-  "pandas>=2.3.3",
-  "pyarrow>=22.0.0"
+  "pandas>=2.3.3"
 ]
 
 [dependency-groups]
@@ -26,7 +25,8 @@ dev = [
 docs = ["pdoc>=15.0.0"]
 notebooks = [
   "jupyter>=1.1.1",
-  "ipykernel>=6.29.5"
+  "ipykernel>=6.29.5",
+  "pyarrow>=22.0.0"
 ]
 
 [tool.pytest.ini_options]
 
@@ -137,9 +137,9 @@ def to_dict(self) -> dict[str, object]:
         Returns:
             A nested dictionary representation of the document structure.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        return writer.to_dict(self.hierarchy, self.metadata)
+        return output.to_dict(self.hierarchy, self.metadata)
 
     def to_json(self, filepath: str, indent: int = 2) -> str:
         """Exports the document to a JSON file.
@@ -151,9 +151,9 @@ def to_json(self, filepath: str, indent: int = 2) -> str:
         Returns:
             Path to the created file.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        return writer.to_json_file(self.hierarchy, filepath, self.metadata, indent)
+        return output.to_json_file(self.hierarchy, filepath, self.metadata, indent)
 
     def to_tree(self, show_line_numbers: bool = True, show_type: bool = True) -> str:
         """Generates an ASCII tree visualization of the document structure.
@@ -165,25 +165,23 @@ def to_tree(self, show_line_numbers: bool = True, show_type: bool = True) -> str
         Returns:
             ASCII tree representation as a string.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        # Build metadata heading from document metadata
-        metadata_heading = dict(self.metadata) if self.metadata else None
-        return writer.to_tree_string(
-            self.hierarchy, show_line_numbers, show_type, metadata_heading
+        return output.to_tree_string(
+            self.hierarchy, show_line_numbers, show_type, self.metadata
         )
 
-    def to_dataframe(self) -> list[dict[str, object]]:
-        """Converts the document to row dictionaries.
+    def to_dataframe(self) -> pd.DataFrame:
+        """Converts the document to a pandas DataFrame.
 
         Returns:
-            List of dictionaries representing content rows with
+            DataFrame where each row is a content token with
             hierarchical context.
         """
-        from headhunter import writer
+        from headhunter import output
 
         doc_id = str(self.metadata["id"])
-        return writer.to_dataframe_rows(self.hierarchy, doc_id, self.metadata)
+        return output.to_dataframe(self.hierarchy, doc_id, self.metadata)
 
 
 @dataclasses.dataclass(frozen=True)
@@ -246,9 +244,9 @@ def to_json(self, output_dir: str, indent: int = 2) -> list[str]:
         Returns:
             List of created file paths.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        return writer.batch_to_json_files(self.documents, output_dir, indent)
+        return output.batch_to_json_files(self.documents, output_dir, indent)
 
     def to_tree(
         self, output_dir: str, show_line_numbers: bool = True, show_type: bool = True
@@ -263,9 +261,9 @@ def to_tree(
         Returns:
             List of created file paths.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        return writer.batch_to_tree_files(
+        return output.batch_to_tree_files(
             self.documents, output_dir, show_line_numbers, show_type
         )
 
@@ -276,9 +274,9 @@ def to_dataframe(self) -> pd.DataFrame:
             DataFrame with all content rows from all documents,
             including any metadata columns specified during batch processing.
         """
-        from headhunter import writer
+        from headhunter import output
 
-        return writer.batch_to_dataframe(self.documents, self.metadata_columns)
+        return output.batch_to_dataframe(self.documents, self.metadata_columns)
 
 
 class ParsingError(Exception):