Skip to content

Commit 6963fe5

Browse files
feat: Add Parquet writer (#486)
1 parent 25f3d2d commit 6963fe5

File tree

12 files changed

+1658
-22
lines changed

12 files changed

+1658
-22
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
## Next
44

5+
### Added
6+
7+
- Parquet export (experimental): `ParquetWriter` (extends `KGWriter`), `Neo4jGraphParquetFormatter`, and `FilenameCollisionHandler` for writing knowledge graphs to Parquet (one file per node label and per relationship type).
8+
59
### Changed
610

711
- Updated examples, default values, and documentation to use `gpt-4.1` / `gpt-4.1-mini` instead of deprecated GPT-4* models (e.g. `gpt-4o`, `gpt-4`).

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ experimental = [
5858
"langchain-text-splitters>=0.3.0,<0.4.0",
5959
"neo4j-viz>=0.4.2,<0.5.0",
6060
"llama-index>=0.13.0,<0.14.0",
61+
"pyarrow>=20.0.0", # ParquetWriter, Neo4jGraphParquetFormatter
6162
]
6263
examples = [
6364
"langchain-openai>=0.2.2,<0.3.0",
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright (c) "Neo4j"
2+
# Neo4j Sweden AB [https://neo4j.com]
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# https://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Filename collision handler for Parquet file writing."""
16+
17+
from __future__ import annotations
18+
19+
from pathlib import Path
20+
from typing import Union
21+
22+
23+
class FilenameCollisionHandler:
    """Handles filename collisions by adding numeric suffixes.

    Tracks filename collisions per output path and generates unique filenames
    by appending _n suffixes when the same base filename is requested more
    than once for the same output path.

    Example:

    .. code-block:: python

        handler = FilenameCollisionHandler()
        filename1 = handler.get_unique_filename("Person.parquet", Path("./out"))
        # Returns: "Person.parquet"
        filename2 = handler.get_unique_filename("Person.parquet", Path("./out"))
        # Returns: "Person_1.parquet"
        filename3 = handler.get_unique_filename("Person.parquet", Path("./out"))
        # Returns: "Person_2.parquet"
    """

    # Class-level dictionary to track filename collisions across all instances.
    # Keyed by the (resolved output path, base filename) tuple rather than a
    # plain string concatenation, so distinct path/filename pairs can never be
    # conflated (e.g. ("/out/ab", "c.parquet") vs ("/out/a", "bc.parquet")
    # would produce the same concatenated key).
    _filename_counts: dict[tuple[str, str], int] = {}

    def get_unique_filename(
        self,
        base_filename: str,
        output_path: Union[str, Path],
    ) -> str:
        """Return a unique filename by adding a _n suffix if a collision is detected.

        Args:
            base_filename: The original filename (e.g. "Person.parquet").
            output_path: The output directory path; collisions are tracked per path.

        Returns:
            A unique filename (e.g. "Person.parquet" or "Person_1.parquet").
        """
        # Resolve the path so equivalent spellings ("./out", absolute form)
        # share one counter.
        key = (str(Path(output_path).resolve()), base_filename)

        if key not in self._filename_counts:
            # First request for this path/filename pair: no suffix needed.
            self._filename_counts[key] = 0
            return base_filename

        self._filename_counts[key] += 1
        count = self._filename_counts[key]
        if base_filename.endswith(".parquet"):
            # Insert the suffix before the extension:
            # Person.parquet -> Person_1.parquet.
            name_without_ext = base_filename[: -len(".parquet")]
            return f"{name_without_ext}_{count}.parquet"
        return f"{base_filename}_{count}"

    @classmethod
    def reset(cls) -> None:
        """Clear the collision-tracking state.

        Intended for tests so each run starts with a clean state.
        """
        cls._filename_counts.clear()

src/neo4j_graphrag/experimental/components/kg_writer.py

Lines changed: 240 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,23 @@
1414
# limitations under the License.
1515
from __future__ import annotations
1616

17+
import os
1718
import logging
1819
from abc import abstractmethod
1920
from typing import Any, Generator, Literal, Optional
2021

2122
import neo4j
2223
from pydantic import validate_call
2324

25+
from neo4j_graphrag.experimental.components.filename_collision_handler import (
26+
FilenameCollisionHandler,
27+
)
28+
from neo4j_graphrag.experimental.components.parquet_formatter import (
29+
Neo4jGraphParquetFormatter,
30+
)
31+
from neo4j_graphrag.experimental.components.parquet_output import (
32+
ParquetOutputDestination,
33+
)
2434
from neo4j_graphrag.experimental.components.types import (
2535
LexicalGraphConfig,
2636
Neo4jGraph,
@@ -43,6 +53,24 @@
4353
logger = logging.getLogger(__name__)
4454

4555

56+
def _build_columns_from_schema(
    schema: Any, primary_key_names: list[str]
) -> list[dict[str, Any]]:
    """Describe every field of a PyArrow schema as a column dict.

    Each entry holds the column name, its source type as mapped by
    ``Neo4jGraphParquetFormatter.pyarrow_type_to_type_info``, and whether the
    column is one of the given primary keys.
    """

    def _describe(field: Any) -> dict[str, Any]:
        # Translate the PyArrow type into the formatter's type-info record.
        type_info = Neo4jGraphParquetFormatter.pyarrow_type_to_type_info(field.type)
        return {
            "name": field.name,
            "type": type_info.source_type,
            "is_primary_key": field.name in primary_key_names,
        }

    return [_describe(schema.field(idx)) for idx in range(len(schema))]
72+
73+
4674
def batched(rows: list[Any], batch_size: int) -> Generator[list[Any], None, None]:
4775
index = 0
4876
for i in range(0, len(rows), batch_size):
@@ -53,11 +81,46 @@ def batched(rows: list[Any], batch_size: int) -> Generator[list[Any], None, None
5381
index += 1
5482

5583

84+
def _graph_stats(
85+
graph: Neo4jGraph,
86+
nodes_per_label: Optional[dict[str, int]] = None,
87+
rel_per_type: Optional[dict[str, int]] = None,
88+
input_files_count: int = 0,
89+
input_files_total_size_bytes: int = 0,
90+
) -> dict[str, Any]:
91+
"""Build the statistics dict for writer metadata.
92+
93+
Schema:
94+
node_count, relationship_count, nodes_per_label, rel_per_type,
95+
input_files_count, input_files_total_size_bytes.
96+
"""
97+
if nodes_per_label is None:
98+
nodes_per_label = {}
99+
for node in graph.nodes:
100+
nodes_per_label[node.label] = nodes_per_label.get(node.label, 0) + 1
101+
if rel_per_type is None:
102+
rel_per_type = {}
103+
for rel in graph.relationships:
104+
rel_per_type[rel.type] = rel_per_type.get(rel.type, 0) + 1
105+
return {
106+
"node_count": len(graph.nodes),
107+
"relationship_count": len(graph.relationships),
108+
"nodes_per_label": nodes_per_label,
109+
"rel_per_type": rel_per_type,
110+
"input_files_count": input_files_count,
111+
"input_files_total_size_bytes": input_files_total_size_bytes,
112+
}
113+
114+
56115
class KGWriterModel(DataModel):
57116
"""Data model for the output of the Knowledge Graph writer.
58117
59118
Attributes:
60-
status (Literal["SUCCESS", "FAILURE"]): Whether the write operation was successful.
119+
status: Whether the write operation was successful ("SUCCESS" or "FAILURE").
120+
metadata: Optional dict. When status is SUCCESS, contains at least:
121+
- "statistics": dict with node_count, relationship_count, nodes_per_label,
122+
rel_per_type, input_files_count, input_files_total_size_bytes.
123+
- "files": list of file descriptors with file_path, etc. (ParquetWriter).
61124
"""
62125

63126
status: Literal["SUCCESS", "FAILURE"]
@@ -223,10 +286,184 @@ async def run(
223286
return KGWriterModel(
224287
status="SUCCESS",
225288
metadata={
226-
"node_count": len(graph.nodes),
227-
"relationship_count": len(graph.relationships),
289+
"statistics": _graph_stats(graph),
290+
"files": [],
228291
},
229292
)
230293
except neo4j.exceptions.ClientError as e:
231294
logger.exception(e)
232295
return KGWriterModel(status="FAILURE", metadata={"error": str(e)})
296+
297+
298+
class ParquetWriter(KGWriter):
    """Writes a knowledge graph to Parquet files using Neo4jGraphParquetFormatter.

    Writes one Parquet file per node label and one per (head_label, relationship_type, tail_label)
    to the given destinations, e.g. ``Person.parquet``, ``Person_KNOWS_Person.parquet``.

    Args:
        nodes_dest (ParquetOutputDestination): Destination for node Parquet files.
        relationships_dest (ParquetOutputDestination): Destination for relationship Parquet files.
        collision_handler (FilenameCollisionHandler): Handler for resolving filename collisions.
        prefix (str): Optional filename prefix for all written files. Defaults to "".

    Example:

    .. code-block:: python

        from neo4j_graphrag.experimental.components.filename_collision_handler import FilenameCollisionHandler
        from neo4j_graphrag.experimental.components.kg_writer import ParquetWriter
        from neo4j_graphrag.experimental.components.parquet_output import ParquetOutputDestination
        from neo4j_graphrag.experimental.pipeline import Pipeline

        # Provide your own implementation of ParquetOutputDestination (local, GCS, S3, etc.)
        nodes_dest: ParquetOutputDestination = ...
        relationships_dest: ParquetOutputDestination = ...

        writer = ParquetWriter(
            nodes_dest=nodes_dest,
            relationships_dest=relationships_dest,
            collision_handler=FilenameCollisionHandler(),
        )
        pipeline = Pipeline()
        pipeline.add_component(writer, "writer")
    """

    def __init__(
        self,
        nodes_dest: ParquetOutputDestination,
        relationships_dest: ParquetOutputDestination,
        collision_handler: FilenameCollisionHandler,
        prefix: str = "",
    ) -> None:
        # Plain attribute assignment; destinations are caller-provided
        # implementations of ParquetOutputDestination (local, GCS, S3, ...).
        self.nodes_dest = nodes_dest
        self.relationships_dest = relationships_dest
        self.collision_handler = collision_handler
        self.prefix = prefix

    @validate_call
    async def run(
        self,
        graph: Neo4jGraph,
        lexical_graph_config: LexicalGraphConfig = LexicalGraphConfig(),
        schema: Optional[dict[str, Any]] = None,
    ) -> KGWriterModel:
        """Write the knowledge graph to Parquet files via Neo4jGraphParquetFormatter.

        Args:
            graph (Neo4jGraph): The knowledge graph to write.
            lexical_graph_config (LexicalGraphConfig): Used by the formatter for
                lexical graph labels (e.g. __Entity__) and key properties.
            schema (Optional[dict[str, Any]]): Optional GraphSchema as a dictionary for
                uniqueness constraints and key properties. If not provided, ``__id__`` is used.
        """
        try:
            # NOTE(review): format_graph is assumed to return (per-kind file
            # contents, per-file metadata list, precomputed stats) with
            # data["nodes"] / data["relationships"] mapping filename -> content
            # — confirm against Neo4jGraphParquetFormatter.
            formatter = Neo4jGraphParquetFormatter(schema=schema)
            data, file_metadata, stats = formatter.format_graph(
                graph, lexical_graph_config, prefix=self.prefix
            )

            # Formatter metadata is keyed by the original (pre-collision)
            # filename; lookups below must use `filename`, not the possibly
            # suffixed `unique_filename`.
            meta_by_filename: dict[str, Any] = {m.filename: m for m in file_metadata}
            files: list[dict[str, Any]] = []
            # Maps a node label to the stem of the file it was actually written
            # to (after collision resolution); consumed by the relationship
            # pass below to fill start/end node sources.
            node_label_to_source_name: dict[str, str] = {}

            # --- Node files ---------------------------------------------------
            base_nodes = self.nodes_dest.output_path.rstrip("/")
            for filename, content in data["nodes"].items():
                meta = meta_by_filename[filename]
                unique_filename = self.collision_handler.get_unique_filename(
                    filename, self.nodes_dest.output_path
                )
                await self.nodes_dest.write(content, unique_filename)
                file_path = os.path.join(base_nodes, unique_filename)

                # Strip the ".parquet" extension (8 chars) to get the stem of
                # the file as actually written.
                resolved_stem = (
                    unique_filename[:-8]
                    if unique_filename.endswith(".parquet")
                    else unique_filename
                )
                if meta.node_label is not None:
                    node_label_to_source_name[meta.node_label] = resolved_stem

                columns = _build_columns_from_schema(
                    meta.schema,
                    meta.key_properties or [],
                )
                # Prefer the single node label; fall back to the first of the
                # label list, then to the file stem.
                name = meta.node_label or (
                    meta.labels[0] if meta.labels else resolved_stem
                )
                files.append(
                    {
                        "name": name,
                        "file_path": file_path,
                        "columns": columns,
                        "is_node": True,
                        "labels": meta.labels or [],
                    }
                )

            # --- Relationship files ------------------------------------------
            base_rel = self.relationships_dest.output_path.rstrip("/")
            for filename, content in data["relationships"].items():
                meta = meta_by_filename[filename]
                unique_filename = self.collision_handler.get_unique_filename(
                    filename, self.relationships_dest.output_path
                )
                await self.relationships_dest.write(content, unique_filename)
                file_path = os.path.join(base_rel, unique_filename)

                # Resolve endpoints to the node-file stems recorded in the node
                # pass; fall back to the raw label (or "") when no node file
                # was written for that label.
                start_node_source = node_label_to_source_name.get(
                    meta.relationship_head or "", meta.relationship_head or ""
                )
                end_node_source = node_label_to_source_name.get(
                    meta.relationship_tail or "", meta.relationship_tail or ""
                )
                # Relationship files always key on the "from"/"to" columns.
                columns = _build_columns_from_schema(
                    meta.schema,
                    ["from", "to"],
                )
                # Head_TYPE_Tail when fully known, otherwise the file stem
                # (again stripping the 8-char ".parquet" extension).
                rel_name = (
                    f"{meta.relationship_head}_{meta.relationship_type}_{meta.relationship_tail}"
                    if meta.relationship_head
                    and meta.relationship_type
                    and meta.relationship_tail
                    else unique_filename[:-8]
                    if unique_filename.endswith(".parquet")
                    else unique_filename
                )
                files.append(
                    {
                        "name": rel_name,
                        "file_path": file_path,
                        "columns": columns,
                        "is_node": False,
                        "relationship_type": meta.relationship_type,
                        # Fall back to the synthetic "__id__" key when the
                        # schema declared no key properties for the endpoint.
                        "start_node_source": start_node_source,
                        "start_node_primary_keys": meta.head_node_key_properties
                        or ["__id__"],
                        "end_node_source": end_node_source,
                        "end_node_primary_keys": meta.tail_node_key_properties
                        or ["__id__"],
                    }
                )

            logger.info(
                "Wrote %d node files and %d relationship files",
                len(data["nodes"]),
                len(data["relationships"]),
            )
            # input_files_* are not tracked by this writer; reported as 0.
            statistics = _graph_stats(
                graph,
                nodes_per_label=stats["nodes_per_label"],
                rel_per_type=stats["rel_per_type"],
                input_files_count=0,
                input_files_total_size_bytes=0,
            )
            return KGWriterModel(
                status="SUCCESS",
                metadata={
                    "statistics": statistics,
                    "files": files,
                },
            )
        except Exception as e:
            # Broad catch mirrors the KGWriter failure contract: any error
            # (formatter, destination I/O, metadata lookup) is reported as a
            # FAILURE result rather than raised to the pipeline.
            logger.exception(e)
            return KGWriterModel(status="FAILURE", metadata={"error": str(e)})

0 commit comments

Comments
 (0)