Refactor trackpy exporter

lxenard · lxenard · commit 00c2c9ef3820 · 2025-04-15T15:06:23.000+02:00
diff --git a/pycellin/io/trackpy/exporter.py b/pycellin/io/trackpy/exporter.py
@@ -24,100 +24,200 @@
 from pycellin.classes.model import Model
 
 
-def export_trackpy_dataframe(model: Model) -> pd.DataFrame:
+def safekeep_original_lineage_IDs(model: Model) -> None:
     """
-    Export a Pycellin model to a trackpy DataFrame.
+    Store on each node the ID of its lineage under "lineage_ID_Pycellin".
 
-    Trackpy does not support division events. They will be removed for
-    the export so each cell cycle will be reprensented by a single
-    trackpy track in the dataframe.
+    We want to safekeep the original IDs on the nodes since we are
+    going to renumber the lineages later on.
 
     Parameters
     ----------
     model : Model
-        The Pycellin model to export.
-
-    Returns
-    -------
-    pd.DataFrame
-        A DataFrame containing trackpy formatted data.
+        The Pycellin model to modify in place (a node attribute is added).
     """
-    model_copy = copy.deepcopy(model)  # Don't want to modify the original model.
-
-    # We want to safekeep the original lineage IDs in the nodes of the model since
-    # we are going to rename and/or renumber them.
-    for lin_ID, lin in model_copy.data.cell_data.items():
+    for lin_ID, lin in model.data.cell_data.items():
         for node in lin.nodes():
             lin.nodes[node]["lineage_ID_Pycellin"] = lin_ID
 
-    # Removal of division events.
-    # We simply remove the edges involved in the divisions.
-    for lin in model_copy.get_cell_lineages():
+
+def remove_division_events(model: Model) -> None:
+    """
+    Remove division events by deleting the outgoing edges of division nodes.
+
+    Parameters
+    ----------
+    model : Model
+        The Pycellin model to modify in place; it is updated once at the end.
+    """
+    for lin in model.get_cell_lineages():
         divs = lin.get_divisions()
         div_edges = [edge for div in divs for edge in lin.out_edges(div)]
         for edge in div_edges:
-            model_copy.remove_link(*edge, lin.graph["lineage_ID"])
-    model_copy.update()
+            model.remove_link(*edge, lin.graph["lineage_ID"])
+    model.update()
+
 
-    # Trackpy might not like negative lineage IDs so we change them to positive ones.
+def renumber_negative_lineage_IDs(model: Model) -> None:
+    """
+    Renumber the lineages whose ID is negative.
+
+    Trackpy might not support negative lineage IDs so it is safer to
+    replace each one with `model.get_next_available_lineage_ID()`.
+
+    Parameters
+    ----------
+    model : Model
+        The Pycellin model to modify in place (lineage graphs and cell data keys).
+    """
     one_node_lin_IDs = [
         lin.graph["lineage_ID"]
-        for lin in model_copy.get_cell_lineages()
+        for lin in model.get_cell_lineages()
         if lin.graph["lineage_ID"] < 0
     ]
     for lin_ID in one_node_lin_IDs:
-        lin = model_copy.get_cell_lineage_from_ID(lin_ID)
+        lin = model.get_cell_lineage_from_ID(lin_ID)
         assert lin is not None
-        new_lin_ID = model_copy.get_next_available_lineage_ID()
+        new_lin_ID = model.get_next_available_lineage_ID()
         # Update the lineage ID in the graph.
         lin.graph["lineage_ID"] = new_lin_ID
         # Update the lineage ID in the cell data.
-        model_copy.data.cell_data.pop(lin_ID)
-        model_copy.data.cell_data[new_lin_ID] = lin
+        model.data.cell_data.pop(lin_ID)
+        model.data.cell_data[new_lin_ID] = lin
 
-    # Creation of the trackpy DataFrame.
-    df = model_copy.to_cell_dataframe()
-    # We have to rename some columns to be compatible with trackpy.
-    if "particle" in df.columns:
-        # If we already have this column, it means the data is coming from
-        # trackpy, but it might not be up to date. Safer to remove it and
-        # rename it from "lineage_ID".
-        df.drop(columns=["particle"], inplace=True)
-    df.rename(columns={"lineage_ID": "particle"}, inplace=True)
-    df.rename(columns={"cell_x": "x"}, inplace=True)
-    df.rename(columns={"cell_y": "y"}, inplace=True)
-    if "cell_z" in df.columns:
-        df.rename(columns={"cell_z": "z"}, inplace=True)
-    if "ROI_coords" in df.columns:
-        # We need to remove the ROI_coords column.
-        df.drop(columns=["ROI_coords"], inplace=True)
-    # Reorder the columns to match trackpy format.
-    if "z" in df.columns:
-        dim_columns = ["z", "y", "x"]
-    else:
-        dim_columns = ["y", "x"]
+
+def rename_columns_if_exist(df: pd.DataFrame, columns_map: dict) -> None:
+    """
+    Rename, in place, the columns of the DataFrame found in the mapping.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to modify in place.
+    columns_map : dict
+        Old column names mapped to new ones; absent old names are skipped.
+    """
+    # A single rename applies all pairs at once (no cascading when a new
+    # name is also an old name); missing labels are ignored by default.
+    df.rename(columns=columns_map, inplace=True)
+
+
+def drop_columns_if_exist(df: pd.DataFrame, columns: list) -> None:
+    """
+    Drop, in place, every listed column that is present in the DataFrame.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to modify in place.
+    columns : list
+        The names of the columns to drop; absent names are skipped.
+    """
+    # One drop call for the whole list: errors="ignore" makes pandas skip
+    # the labels that are not in the DataFrame instead of raising KeyError.
+    df.drop(columns=columns, errors="ignore", inplace=True)
+
+
+def format_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Format the DataFrame to be compatible with trackpy.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to format; its columns are dropped and renamed in place.
+
+    Returns
+    -------
+    pd.DataFrame
+        A new DataFrame with reordered columns, sorted by particle then frame.
+    """
+    # Drop unnecessary columns.
+    # If we already have the "particle" column, it means the data is coming from
+    # trackpy, but it might not be up to date. Safer to remove it then recreate
+    # it from "lineage_ID" (done by the rename step below).
+    drop_columns_if_exist(df, ["ROI_coords", "particle"])
+
+    # Rename columns to match trackpy format.
+    rename_columns_if_exist(
+        df,
+        {
+            "cell_x": "x",
+            "cell_y": "y",
+            "cell_z": "z",
+            "lineage_ID": "particle",
+        },
+    )
+
+    # Reorder columns: dimensions first, then the others, then frame and particle.
+    dim_columns = ["z", "y", "x"] if "z" in df.columns else ["y", "x"]
     df = df[
         dim_columns
-        + [col for col in df.columns if col not in ["z", "y", "x", "frame", "particle"]]
+        + [col for col in df.columns if col not in dim_columns + ["frame", "particle"]]
         + ["frame", "particle"]
     ]
+    # NOTE(review): in-place sort of a selection may warn (SettingWithCopy); confirm.
     # Sort the rows.
     df.sort_values(by=["particle", "frame"], inplace=True)
 
     return df
 
 
+def export_trackpy_dataframe(model: Model) -> pd.DataFrame:
+    """
+    Convert a Pycellin model into a trackpy-formatted DataFrame.
+
+    Trackpy does not support division events, so they are removed for
+    the export: each cell cycle is then represented by a single
+    trackpy track in the dataframe.
+
+    Parameters
+    ----------
+    model : Model
+        The Pycellin model to export.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing trackpy formatted data.
+    """
+    # Work on a deep copy: we don't want to modify the original model.
+    export_model = copy.deepcopy(model)
+    for prepare in (
+        safekeep_original_lineage_IDs,
+        remove_division_events,  # Trackpy does not support division events.
+        renumber_negative_lineage_IDs,
+    ):
+        prepare(export_model)
+
+    # Creation of the trackpy DataFrame.
+    return format_dataframe(export_model.to_cell_dataframe())
+
+
 if __name__ == "__main__":
+
+    # # Test with a sample TrackMate XML file.
+    # from pycellin import load_TrackMate_XML
+
+    # xml = "sample_data/Ecoli_growth_on_agar_pad.xml"
+
+    # model = load_TrackMate_XML(xml)
+    # for lin in model.get_cell_lineages():
+    #     print(lin)
+
+    # df = export_trackpy_dataframe(model)
+    # print(df.head())
+
+    # Test with a sample trackpy DataFrame.
+    from pycellin import load_trackpy_dataframe
+
     folder = "/mnt/data/Code/trackpy-examples-master/sample_data/"
     tracks = "FakeTracks_trackpy.pkl"
-    xml = "sample_data/Ecoli_growth_on_agar_pad.xml"
 
     df = pd.read_pickle(folder + tracks)
     print(df.head())
+    print(df.shape)
 
-    from pycellin import load_trackpy_dataframe, load_TrackMate_XML
-
-    # model = load_TrackMate_XML(xml)
     model = load_trackpy_dataframe(df)
     for lin in model.get_cell_lineages():
         print(lin)
@@ -128,3 +228,4 @@ def export_trackpy_dataframe(model: Model) -> pd.DataFrame:
 
     df = export_trackpy_dataframe(model)
     print(df.head())
+    print(df.shape)