Skip to content

Commit 7272261

Browse files
TeunHuijben and JoOkuma
authored
updated to_geff export function and added cli (#248)
* updated to_geff export function and added cli * Update ultrack/cli/export.py Co-authored-by: Jordão Bragantini <jordao.bragantini@czbiohub.org> * handle Jordaos review --------- Co-authored-by: Jordão Bragantini <jordao.bragantini@czbiohub.org>
1 parent a99bc0f commit 7272261

File tree

3 files changed

+209
-51
lines changed

3 files changed

+209
-51
lines changed

ultrack/cli/export.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,13 @@
1515
tuple_callback,
1616
)
1717
from ultrack.config import MainConfig
18-
from ultrack.core.export import to_ctc, to_trackmate, to_tracks_layer, tracks_to_zarr
18+
from ultrack.core.export import (
19+
to_ctc,
20+
to_geff_from_database,
21+
to_trackmate,
22+
to_tracks_layer,
23+
tracks_to_zarr,
24+
)
1925
from ultrack.core.solve.sqltracking import SQLTracking
2026
from ultrack.imgproc.measure import tracks_properties
2127
from ultrack.utils.data import validate_and_overwrite_path
@@ -187,6 +193,45 @@ def trackmate_cli(
187193
to_trackmate(config, output_path, overwrite)
188194

189195

196+
@click.command("geff")
197+
@click.argument(
198+
"database_path",
199+
type=click.Path(path_type=Path, exists=True),
200+
)
201+
@click.option(
202+
"--output-path",
203+
"-o",
204+
required=False,
205+
type=click.Path(path_type=Path),
206+
default=None,
207+
help=(
208+
"Geff (Graph Exchange File Format) output path. "
209+
"If not provided, saves to same directory as database with '_geff.geff' extension."
210+
),
211+
)
212+
@overwrite_option()
213+
def geff_cli(
214+
database_path: Path,
215+
output_path: Optional[Path],
216+
overwrite: bool,
217+
) -> None:
218+
"""
219+
Exports tracking results to Geff (Graph Exchange File Format) format.
220+
"""
221+
if output_path is None:
222+
# Generate output path from database path
223+
output_path = database_path.parent / f"{database_path.stem}.geff"
224+
else:
225+
# Validate that the output path has a geff extension
226+
output_str = str(output_path)
227+
if not (output_str.endswith(".geff") or output_str.endswith(".geff.zarr")):
228+
raise click.BadParameter(
229+
f"Output path must have a .geff or .geff.zarr extension, got: {output_path}"
230+
)
231+
232+
to_geff_from_database(database_path, output_path, overwrite)
233+
234+
190235
@click.command("lp")
191236
@click.option(
192237
"--output-path",
@@ -229,6 +274,7 @@ def export_cli() -> None:
229274

230275

231276
export_cli.add_command(ctc_cli)
277+
export_cli.add_command(geff_cli)
232278
export_cli.add_command(lp_cli)
233279
export_cli.add_command(trackmate_cli)
234280
export_cli.add_command(zarr_napari_cli)

ultrack/core/export/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from ultrack.core.export.ctc import to_ctc
22
from ultrack.core.export.exporter import export_tracks_by_extension
3-
from ultrack.core.export.geff import to_geff
3+
from ultrack.core.export.geff import to_geff, to_geff_from_database
44
from ultrack.core.export.networkx import to_networkx, tracks_layer_to_networkx
55
from ultrack.core.export.trackmate import to_trackmate, tracks_layer_to_trackmate
66
from ultrack.core.export.tracks_layer import to_tracks_layer

ultrack/core/export/geff.py

Lines changed: 161 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,29 @@
1515
from ultrack.core.database import NO_PARENT, LinkDB, NodeDB, OverlapDB
1616

1717

18-
def to_geff(
19-
config: MainConfig,
18+
# Helper that normalizes pandas/numpy dtypes into plain string names
def dtype_to_str(dtype) -> str:
    """Convert pandas/numpy dtype to string dtype name for PropMetadata."""
    # np.dtype(...) accepts any dtype-like input (numpy dtype, pandas dtype,
    # python type, or string) and its `.name` is the canonical string form
    # ("int64", "float64", "bool", ...), which PropMetadata expects.
    return np.dtype(dtype).name
27+
28+
29+
def to_geff_from_database(
30+
database_path: Union[str, Path],
2031
filename: Union[str, Path],
2132
overwrite: bool = False,
2233
) -> None:
2334
"""
24-
Export tracks to a geff (Graph Exchange File Format) file.
35+
Export tracks to a geff (Graph Exchange File Format) file from a database.
2536
2637
Parameters
2738
----------
28-
config : MainConfig
29-
The configuration object.
39+
database_path : str or Path
40+
The path to the database file.
3041
filename : str or Path
3142
The name of the file to save the tracks to.
3243
overwrite : bool, optional
@@ -46,9 +57,39 @@ def to_geff(
4657
else:
4758
shutil.rmtree(filename)
4859

49-
engine = sqla.create_engine(config.data_config.database_path)
60+
# Convert database_path to SQLAlchemy URL format if needed
61+
database_path_str = str(database_path)
62+
# If it's not already a SQLAlchemy URL (doesn't start with a protocol), assume it's a SQLite file path
63+
if not database_path_str.startswith(
64+
("sqlite://", "postgresql://", "mysql://", "postgresql+psycopg2://")
65+
):
66+
# Convert file path to SQLite URL format
67+
database_path_str = f"sqlite:///{Path(database_path).absolute()}"
68+
engine = sqla.create_engine(database_path_str)
5069
with Session(engine) as session:
51-
node_stmt = session.query(
70+
# Collect nodes data, storing masks and bboxes separately
71+
all_nodes_data = []
72+
all_masks = []
73+
all_bboxes = []
74+
solution_source = []
75+
solution_target = []
76+
77+
for (
78+
node_id,
79+
t,
80+
parent_id,
81+
z,
82+
y,
83+
x,
84+
z_shift,
85+
y_shift,
86+
x_shift,
87+
area,
88+
frontier,
89+
height,
90+
selected,
91+
pickle_obj,
92+
) in session.query(
5293
NodeDB.id,
5394
NodeDB.t,
5495
NodeDB.parent_id,
@@ -63,59 +104,96 @@ def to_geff(
63104
NodeDB.height,
64105
NodeDB.selected,
65106
NodeDB.pickle,
66-
).statement
67-
node_df = pd.read_sql(node_stmt, session.bind, index_col="id")
68-
node_df["id"] = node_df.index
107+
):
108+
node_dict = {
109+
"id": node_id,
110+
"parent_id": parent_id,
111+
"t": t,
112+
"z": z,
113+
"y": y,
114+
"x": x,
115+
"z_shift": z_shift,
116+
"y_shift": y_shift,
117+
"x_shift": x_shift,
118+
"area": area,
119+
"frontier": frontier,
120+
"height": height,
121+
"solution": selected,
122+
}
123+
all_nodes_data.append(node_dict)
124+
# Store masks and bboxes separately
125+
all_masks.append(pickle_obj.mask.astype(np.uint64))
126+
all_bboxes.append(pickle_obj.bbox.astype(np.int64))
127+
128+
# Collect solution edges (parent-child relationships)
129+
if selected and parent_id != NO_PARENT:
130+
solution_source.append(parent_id)
131+
solution_target.append(node_id)
132+
133+
# Create nodes dataframe (only scalar values, no pickle objects)
134+
node_df = pd.DataFrame(all_nodes_data)
135+
node_df.set_index("id", inplace=True)
136+
node_df["solution"] = node_df["solution"].astype(bool)
69137

138+
# Query edges
70139
edge_stmt = session.query(
71140
LinkDB.source_id, LinkDB.target_id, LinkDB.weight
72141
).statement
73142
edge_df = pd.read_sql(edge_stmt, session.bind)
74143

75-
sol_links_df = node_df.loc[
76-
node_df["selected"] & node_df["parent_id"] != NO_PARENT,
77-
["id", "parent_id"],
78-
]
79-
sol_links_df = sol_links_df.rename(
80-
columns={"parent_id": "source_id", "id": "target_id"},
144+
# Add solution column to edges
145+
sol_links_df = pd.DataFrame(
146+
{
147+
"source_id": solution_source,
148+
"target_id": solution_target,
149+
"solution": True,
150+
}
81151
)
82-
sol_links_df["solution"] = True
83-
edge_df = edge_df.merge(sol_links_df, on=["source_id", "target_id"])
84-
edge_df["solution"] = edge_df["solution"].fillna(False)
85-
86-
node_df.rename(columns={"selected": "solution"}, inplace=True)
87-
node_df.drop(["id", "parent_id"], axis=1, inplace=True)
152+
edge_df = edge_df.merge(sol_links_df, on=["source_id", "target_id"], how="left")
153+
edge_df.loc[edge_df["solution"].isna(), "solution"] = False
154+
edge_df["solution"] = edge_df["solution"].astype(bool)
155+
if "weight" in edge_df.columns:
156+
edge_df["weight"] = edge_df["weight"].astype(np.float64)
88157

158+
# Query overlaps
89159
overlap_stmt = session.query(
90160
OverlapDB.node_id,
91161
OverlapDB.ancestor_id,
92162
).statement
93163
overlap_df = pd.read_sql(overlap_stmt, session.bind)
94164

95-
node_props_metadata = {
96-
c: PropMetadata(
165+
# Create node properties metadata
166+
node_props_metadata = {}
167+
for c in node_df.columns:
168+
node_props_metadata[c] = PropMetadata(
97169
identifier=c,
98-
dtype=node_df[c].dtype,
170+
dtype=dtype_to_str(node_df[c].dtype),
99171
)
100-
for c in node_df.columns
101-
if c != "pickle"
102-
}
103172
node_props_metadata["mask"] = PropMetadata(
104173
identifier="mask",
105-
dtype=np.uint64,
174+
dtype="uint64",
106175
varlength=True,
107176
)
108177
node_props_metadata["bbox"] = PropMetadata(
109178
identifier="bbox",
110-
dtype=np.int64,
179+
dtype="int64",
111180
)
112181

113-
edge_ids = edge_df[["source_id", "target_id"]].to_numpy(dtype=np.uint64)
182+
# Prepare edge IDs and properties
183+
edge_ids = np.column_stack(
184+
[
185+
edge_df["source_id"].to_numpy(dtype=np.uint64),
186+
edge_df["target_id"].to_numpy(dtype=np.uint64),
187+
]
188+
)
114189
edge_df = edge_df.drop(columns=["source_id", "target_id"])
115190

116-
edge_props_metadata = {
117-
c: PropMetadata(identifier=c, dtype=edge_df[c].dtype) for c in edge_df.columns
118-
}
191+
# Create edge properties metadata
192+
edge_props_metadata = {}
193+
for c in edge_df.columns:
194+
edge_props_metadata[c] = PropMetadata(
195+
identifier=c, dtype=dtype_to_str(edge_df[c].dtype)
196+
)
119197

120198
geff_metadata = geff.GeffMetadata(
121199
directed=True,
@@ -129,27 +207,32 @@ def to_geff(
129207
edge_props_metadata=edge_props_metadata,
130208
)
131209

132-
node_props = {
133-
c: {"values": node_df[c].to_numpy(), "missing": None}
134-
for c in node_df.columns
135-
if c != "pickle"
136-
}
137-
node_props["mask"] = construct_var_len_props(
138-
[v.mask.astype(np.uint64) for v in node_df["pickle"]]
139-
)
140-
node_props["bbox"] = {
141-
"values": np.stack([v.bbox for v in node_df["pickle"]]),
142-
"missing": None,
143-
}
210+
# Prepare node properties (using separately stored masks and bboxes)
211+
node_props = {}
212+
for c in node_df.columns:
213+
# Convert to appropriate numpy dtype
214+
values = node_df[c].to_numpy()
215+
node_props[c] = {"values": values, "missing": None}
216+
217+
# Handle mask - use the separately stored masks
218+
node_props["mask"] = construct_var_len_props(all_masks)
219+
220+
# Handle bbox - stack into 2D array from separately stored bboxes
221+
bbox_array = np.stack(all_bboxes)
222+
node_props["bbox"] = {"values": bbox_array, "missing": None}
223+
224+
# Prepare edge properties with proper dtypes
225+
edge_props = {}
226+
for c in edge_df.columns:
227+
values = edge_df[c].to_numpy()
228+
edge_props[c] = {"values": values, "missing": None}
144229

145230
write_arrays(
146231
filename,
147232
node_ids=node_df.index.to_numpy(dtype=np.uint64),
148233
node_props=node_props,
149234
edge_ids=edge_ids,
150-
edge_props={
151-
c: {"values": edge_df[c].to_numpy(), "missing": None} for c in edge_df
152-
},
235+
edge_props=edge_props,
153236
metadata=geff_metadata,
154237
)
155238

@@ -159,3 +242,32 @@ def to_geff(
159242
dtype=np.uint64
160243
)
161244
store.create_group("overlaps/props")
245+
246+
247+
def to_geff(
    config: MainConfig,
    filename: Union[str, Path],
    overwrite: bool = False,
) -> None:
    """
    Export tracks to a geff (Graph Exchange File Format) file.

    Thin convenience wrapper that resolves the database location from the
    configuration object and delegates to ``to_geff_from_database``.

    Parameters
    ----------
    config : MainConfig
        The configuration object.
    filename : str or Path
        The name of the file to save the tracks to.
    overwrite : bool, optional
        Whether to overwrite the file if it already exists, by default False.

    Raises
    ------
    FileExistsError
        If the file already exists and overwrite is False.
    """
    db_path = config.data_config.database_path
    to_geff_from_database(db_path, filename, overwrite)

0 commit comments

Comments
 (0)