dattalab
diff --git a/‎docs/keypoint_moseq_colab.ipynb‎
Lines changed: 685 additions & 669 deletions b/‎docs/keypoint_moseq_colab.ipynb‎
Lines changed: 685 additions & 669 deletions
diff --git a/‎docs/source/FAQs.rst‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/FAQs.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/source/advanced.rst‎
Lines changed: 35 additions & 0 deletions b/‎docs/source/advanced.rst‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎docs/source/modeling.ipynb‎
Lines changed: 24 additions & 2198 deletions b/‎docs/source/modeling.ipynb‎
Lines changed: 24 additions & 2198 deletions
diff --git a/‎keypoint_moseq/analysis.py‎
Lines changed: 73 additions & 27 deletions b/‎keypoint_moseq/analysis.py‎
Lines changed: 73 additions & 27 deletions
@@ -393,6 +393,12 @@ Users occasionally find that the trajectory plot and grid movie for a given syll
 Density sampling is a way of selecting syllable instances that are most representative relative to the full dataset. Specifically, for each syllable, a syllable-specific density function is computed in trajectory space and compared to the overall density across all syllables. An exemplar instance that maximizes the ratio between these densities is chosen for each syllable, and its nearest neighbors are randomly sampled. When the distribution of trajectories for a syllable is multimodal (i.e., it represents a mixture of distinct behaviors), the exemplar syllable may not capture the full range of behaviors, or it may jump from one mode to another when an existing model is applied to new data. In these cases, it may be better to sample syllable instances uniformly by turning off density sampling as shown above.
 
 
+Two different syllables look very similar. Is there a way to consider them as one syllable?
+-------------------------------------------------------------------------------------------
+
+Yes, see the :ref:`Merging similar syllables <merging-syllables>` section in the Advanced Usage guide for instructions on how to combine syllables that represent the same behavior.
+
+
 Troubleshooting
 ===============
 
 
@@ -347,5 +347,40 @@ After this, the pipeline can be run as usual, except for steps that involve read
     # Overlaying keypoints
     kpms.overlay_keypoints_on_video(..., video_frame_indexes=video_frame_indexes)
 
+.. _merging-syllables:
+
+Merging similar syllables
+-------------------------
 
+In some cases it may be convenient to combine syllables that represent similar behaviors. Keypoint-moseq provides convenience functions for merging syllables into user-defined groups. These groups could be based on inspection of trajectory plots, grid movies, or syllable dendrograms.
 
+.. code-block:: python
+
+    # Define the syllables to merge as a list of lists. All syllables within
+    # a given inner list will be merged into a single syllable.
+    # In this case, we're merging syllables 1 and 3 into a single syllable, 
+    # and merging syllables 4 and 5 into a single syllable.
+    syllables_to_merge = [
+        [1, 3],
+        [4, 5]
+    ]
+
+    # Load the results you wish to merge (change path as needed)
+    import os
+    results_path = os.path.join(project_dir, model_name, 'results.h5')
+    results = kpms.load_hdf5(results_path)
+
+    # Generate a mapping that specifies how syllables will be relabeled.
+    syllable_mapping = kpms.generate_syllable_mapping(results, syllables_to_merge)
+    new_results = kpms.apply_syllable_mapping(results, syllable_mapping)
+
+    # Save the new results to disk (using a modified path)
+    new_results_path = os.path.join(project_dir, model_name, 'results_merged.h5')
+    kpms.save_hdf5(new_results_path, new_results)
+
+    # Optionally generate new trajectory plots and grid movies
+    # In each case, specify the output directory to avoid overwriting
+    output_dir = os.path.join(project_dir, model_name, 'grid_movies_merged')
+    kpms.generate_grid_movies(new_results, output_dir=output_dir, coordinates=coordinates, **config())
+
+    output_dir = os.path.join(project_dir, model_name, 'trajectory_plots_merged')
+    kpms.generate_trajectory_plots(coordinates, new_results, output_dir=output_dir, **config())
@@ -46,7 +46,9 @@ def get_syllable_names(project_dir, model_name, syllable_ixs):
 
         for ix in syllable_ixs:
             if len(syll_info_df[syll_info_df.syllable == ix].label.values[0]) > 0:
-                labels[ix] = f"{ix} ({syll_info_df[syll_info_df.syllable == ix].label.values[0]})"
+                labels[ix] = (
+                    f"{ix} ({syll_info_df[syll_info_df.syllable == ix].label.values[0]})"
+                )
     names = [labels[ix] for ix in syllable_ixs]
     return names
 
@@ -214,14 +216,17 @@ def compute_moseq_df(project_dir, model_name, *, fps=30, smooth_heading=True):
             np.concatenate(
                 (
                     [0],
-                    np.sqrt(np.square(np.diff(v["centroid"], axis=0)).sum(axis=1)) * fps,
+                    np.sqrt(np.square(np.diff(v["centroid"], axis=0)).sum(axis=1))
+                    * fps,
                 )
             )
         )
 
         if index_data is not None:
             # find the group for each recording from index data
-            s_group.append([index_data[index_data["name"] == k]["group"].values[0]] * n_frame)
+            s_group.append(
+                [index_data[index_data["name"] == k]["group"].values[0]] * n_frame
+            )
         else:
             # no index data
             s_group.append(["default"] * n_frame)
@@ -236,8 +241,12 @@ def compute_moseq_df(project_dir, model_name, *, fps=30, smooth_heading=True):
         heading.append(recording_heading)
 
         # compute angular velocity (radian per second)
-        gaussian_smoothed_heading = filter_angle(recording_heading, size=3, method="gaussian")
-        angular_velocity.append(np.concatenate(([0], np.diff(gaussian_smoothed_heading) * fps)))
+        gaussian_smoothed_heading = filter_angle(
+            recording_heading, size=3, method="gaussian"
+        )
+        angular_velocity.append(
+            np.concatenate(([0], np.diff(gaussian_smoothed_heading) * fps))
+        )
 
         # add syllable data
         syllables.append(v["syllable"])
@@ -367,7 +376,9 @@ def compute_stats_df(
 def generate_syll_info(project_dir, model_name, syll_info_path):
     # parse model results
     model_results = load_results(project_dir, model_name)
-    unique_sylls = np.unique(np.concatenate([file["syllable"] for file in model_results.values()]))
+    unique_sylls = np.unique(
+        np.concatenate([file["syllable"] for file in model_results.values()])
+    )
     # construct the syllable dictionary
     # in the non interactive version there won't be any group info
     syll_info_df = pd.DataFrame(
@@ -428,8 +439,12 @@ def label_syllables(project_dir, model_name, moseq_df):
     # load syll_info
     syll_info_df = pd.read_csv(syll_info_path, index_col=False).fillna("")
     # split into with movie and without movie
-    syll_info_df_with_movie = syll_info_df[syll_info_df.movie_path.str.contains(".mp4")].copy()
-    syll_info_df_without_movie = syll_info_df[~syll_info_df.movie_path.str.contains(".mp4")].copy()
+    syll_info_df_with_movie = syll_info_df[
+        syll_info_df.movie_path.str.contains(".mp4")
+    ].copy()
+    syll_info_df_without_movie = syll_info_df[
+        ~syll_info_df.movie_path.str.contains(".mp4")
+    ].copy()
 
     # create select widget only include the ones with a movie
     select = pn.widgets.Select(
@@ -520,7 +535,9 @@ def b(event, save=True):
     button.on_click(b)
 
     # bind everything together
-    return pn.Row(pn.Column(select, ivideo), pn.Column(summary_table, pn.Column(button)))
+    return pn.Row(
+        pn.Column(select, ivideo), pn.Column(summary_table, pn.Column(button))
+    )
 
 
 def get_tie_correction(x, N_m):
@@ -605,7 +622,10 @@ def run_manual_KW_test(
     # get square of sums for each group
     ssbn = np.zeros((n_perm, N_s))
     for i in range(num_groups):
-        ssbn += perm_ranks[:, cum_group_idx[i] : cum_group_idx[i + 1]].sum(1) ** 2 / n_per_group[i]
+        ssbn += (
+            perm_ranks[:, cum_group_idx[i] : cum_group_idx[i + 1]].sum(1) ** 2
+            / n_per_group[i]
+        )
 
     # h-statistic
     h_all = 12.0 / (N_m * (N_m + 1)) * ssbn - 3 * (N_m + 1)
@@ -616,7 +636,9 @@ def run_manual_KW_test(
     p_i = np.random.randint(n_perm)
     s_i = np.random.randint(N_s)
     kr = stats.kruskal(
-        *np.array_split(merged_usages_all[perm[p_i, :], s_i], np.cumsum(n_per_group[:-1]))
+        *np.array_split(
+            merged_usages_all[perm[p_i, :], s_i], np.cumsum(n_per_group[:-1])
+        )
     )
     assert (kr.statistic == h_all[p_i, s_i]) & (
         kr.pvalue == p_vals[p_i, s_i]
@@ -671,7 +693,8 @@ def dunns_z_test_permute_within_group_pairs(
 
         ranks_perm = real_ranks[(is_i | is_j)][rnd.rand(n_perm, n_mice).argsort(-1)]
         diff = np.abs(
-            ranks_perm[:, : is_i.sum(), :].mean(1) - ranks_perm[:, is_i.sum() :, :].mean(1)
+            ranks_perm[:, : is_i.sum(), :].mean(1)
+            - ranks_perm[:, is_i.sum() :, :].mean(1)
         )
         B = 1.0 / vc.loc[i_n] + 1.0 / vc.loc[j_n]
 
@@ -732,7 +755,9 @@ def compute_pvalues_for_group_pairs(
 
     p_vals_allperm = {}
     for pair in combinations(group_names, 2):
-        p_vals_allperm[pair] = ((null_zs[pair] > real_zs_within_group[pair]).sum(0) + 1) / n_perm
+        p_vals_allperm[pair] = (
+            (null_zs[pair] > real_zs_within_group[pair]).sum(0) + 1
+        ) / n_perm
 
     # summarize into df
     df_pval = pd.DataFrame(p_vals_allperm)
@@ -782,7 +807,9 @@ def run_kruskal(
     rnd = np.random.RandomState(seed=seed)
     # get grouped mean data
     grouped_data = (
-        stats_df.pivot_table(index=["group", "name"], columns="syllable", values=statistic)
+        stats_df.pivot_table(
+            index=["group", "name"], columns="syllable", values=statistic
+        )
         .replace(np.nan, 0)
         .reset_index()
     )
@@ -813,7 +840,9 @@ def run_kruskal(
     # find the real k_real
     df_k_real = pd.DataFrame(
         [
-            stats.kruskal(*np.array_split(syllable_data[:, s_i], np.cumsum(n_per_group[:-1])))
+            stats.kruskal(
+                *np.array_split(syllable_data[:, s_i], np.cumsum(n_per_group[:-1]))
+            )
             for s_i in range(N_s)
         ]
     )
@@ -851,7 +880,9 @@ def run_kruskal(
     df_z = pd.DataFrame(real_zs_within_group)
     df_z.index = df_z.index.set_names("syllable")
     dunn_results_df = df_z.reset_index().melt(id_vars=[("syllable", "")])
-    dunn_results_df.rename(columns={"variable_0": "group1", "variable_1": "group2"}, inplace=True)
+    dunn_results_df.rename(
+        columns={"variable_0": "group1", "variable_1": "group2"}, inplace=True
+    )
 
     # Get intersecting significant syllables between
     intersect_sig_syllables = {}
@@ -864,7 +895,9 @@ def run_kruskal(
 
 
 # frequency plot stuff
-def sort_syllables_by_stat_difference(stats_df, ctrl_group, exp_group, stat="frequency"):
+def sort_syllables_by_stat_difference(
+    stats_df, ctrl_group, exp_group, stat="frequency"
+):
     """Sort syllables by the difference in the stat between the control and
     experimental group.
 
@@ -997,7 +1030,9 @@ def _validate_and_order_syll_stats_params(
             raise ValueError(
                 f"Attempting to sort by {stat} differences, but {ctrl_group} or {exp_group} not in {groups}."
             )
-        ordering = sort_syllables_by_stat_difference(complete_df, ctrl_group, exp_group, stat=stat)
+        ordering = sort_syllables_by_stat_difference(
+            complete_df, ctrl_group, exp_group, stat=stat
+        )
     if colors is None:
         colors = []
     if len(colors) == 0 or len(colors) != len(groups):
@@ -1146,7 +1181,9 @@ def plot_syll_stats_with_sem(
                     markings.append(np.where(ordering == s)[0])
                 if len(markings) > 0:
                     markings = np.concatenate(markings)
-                    plt.scatter(markings, [init_y] * len(markings), color="r", marker="*")
+                    plt.scatter(
+                        markings, [init_y] * len(markings), color="r", marker="*"
+                    )
                     plt.text(
                         plt.xlim()[1],
                         init_y,
@@ -1309,7 +1346,9 @@ def get_transition_matrix(
             # Get syllable transitions
             transitions = get_transitions(v)[0]
 
-            trans_mat = n_gram_transition_matrix(transitions, n=2, max_label=max_syllable)
+            trans_mat = n_gram_transition_matrix(
+                transitions, n=2, max_label=max_syllable
+            )
             init_matrix.append(trans_mat)
 
         init_matrix = np.sum(init_matrix, axis=0) + smoothing
@@ -1322,7 +1361,8 @@ def get_transition_matrix(
             transitions = get_transitions(v)[0]
 
             trans_mat = (
-                n_gram_transition_matrix(transitions, n=2, max_label=max_syllable) + smoothing
+                n_gram_transition_matrix(transitions, n=2, max_label=max_syllable)
+                + smoothing
             )
 
             # Normalize matrix
@@ -1368,9 +1408,9 @@ def get_group_trans_mats(labels, label_group, group, syll_include, normalize="bi
         # Get recordings to include in trans_mat
         # subset only syllable included
         trans_mats.append(
-            get_transition_matrix(use_labels, normalize=normalize, combine=True)[syll_include, :][
-                :, syll_include
-            ]
+            get_transition_matrix(use_labels, normalize=normalize, combine=True)[
+                syll_include, :
+            ][:, syll_include]
         )
 
         # Getting frequency information for node scaling
@@ -1447,7 +1487,9 @@ def visualize_transition_bigram(
     save_analysis_figure(fig, plot_name, project_dir, model_name, save_dir)
 
 
-def generate_transition_matrices(project_dir, model_name, normalize="bigram", min_frequency=0.005):
+def generate_transition_matrices(
+    project_dir, model_name, normalize="bigram", min_frequency=0.005
+):
     """Generate the transition matrices for each recording.
 
     Parameters
@@ -1543,7 +1585,9 @@ def plot_transition_graph_group(
         nodelist = G.nodes()
         # normalize the usage values
         sum_usages = sum(usages[i])
-        normalized_usages = np.array([u / sum_usages for u in usages[i]]) * node_scaling + 1000
+        normalized_usages = (
+            np.array([u / sum_usages for u in usages[i]]) * node_scaling + 1000
+        )
         nx.draw_networkx_nodes(
             G,
             pos,
@@ -1631,7 +1675,9 @@ def plot_transition_graph_difference(
         # left tm minus right tm
         tm_diff = trans_mats[left_ind] - trans_mats[right_ind]
         # left usage minus right usage
-        usages_diff = np.array(list(usages[left_ind])) - np.array(list(usages[right_ind]))
+        usages_diff = np.array(list(usages[left_ind])) - np.array(
+            list(usages[right_ind])
+        )
         normlized_usg_abs_diff = (
             np.abs(usages_diff) / np.abs(usages_diff).sum()
         ) * node_scaling + 500