Skip to content

Commit 58c3d94

Browse files
authored
Fix apostrophe bug in spikeinterface module and improve memory efficiency (#1666)
1 parent b60ea98 commit 58c3d94

File tree

3 files changed

+41
-10
lines changed

3 files changed

+41
-10
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Fixed bug in `write_imaging_to_nwbfile` where `nwbfile` was incorrectly passed to `add_imaging_to_nwbfile` instead of the created/loaded nwbfile. [PR #1649](https://github.com/catalystneuro/neuroconv/pull/1649)
1111
* Fixed bug in `write_segmentation_to_nwbfile` where invalid `plane_num` parameter was passed to `add_segmentation_to_nwbfile`. [PR #1649](https://github.com/catalystneuro/neuroconv/pull/1649)
1212
* Fixed `get_json_schema_from_method_signature` to skip `*args` (VAR_POSITIONAL) parameters, which was causing schema validation errors when methods used the `*args` pattern for deprecating positional arguments. [PR #1647](https://github.com/catalystneuro/neuroconv/pull/1647)
13+
* Fixed a bug when adding a units table to an `nwbfile` where `unit_name` values containing apostrophes could fail matching due to pandas `to_dataframe().query(...)` parsing. Replaced query-based matching with a direct `unit_name` column mapping, which also improves memory efficiency as a side effect. Added regression coverage for quoted unit names. [PR #1666](https://github.com/catalystneuro/neuroconv/pull/1666)
1314

1415
## Features
1516
* Added `roi_ids_to_add` parameter to `BaseSegmentationExtractorInterface.add_to_nwbfile()` to select a subset of ROIs during conversion, reducing file size by excluding rejected or unwanted ROIs. Also added `roi_ids` property to inspect available ROI IDs. Requires roiextractors >= 0.8.0. [PR #1658](https://github.com/catalystneuro/neuroconv/pull/1658)

src/neuroconv/tools/spikeinterface/spikeinterface.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,10 +1045,10 @@ def add_electrodes_to_nwbfile(
10451045
channel_map = _build_channel_id_to_electrodes_table_map(recording=recording, nwbfile=nwbfile)
10461046

10471047
# Get indices where this recording's data goes (all should be found now)
1048-
all_indices = np.arange(electrode_table_size)
10491048
channel_ids = recording.get_channel_ids()
10501049
indices_for_new_data = [channel_map[channel_id] for channel_id in channel_ids]
1051-
indices_for_null_values = [index for index in all_indices if index not in indices_for_new_data]
1050+
new_indices_set = set(indices_for_new_data)
1051+
indices_for_null_values = [index for index in range(electrode_table_size) if index not in new_indices_set]
10521052
extending_column = len(indices_for_null_values) > 0
10531053

10541054
# Add properties as columns (exclude channel_name and electrode_name as they were handled above)
@@ -2173,14 +2173,17 @@ def _add_units_table_to_nwbfile(
21732173
cols_args["data"] = extended_data
21742174
units_table.add_column("unit_name", **cols_args)
21752175

2176-
# Build a channel name to electrode table index map
2177-
table_df = units_table.to_dataframe().reset_index()
2178-
unit_name_to_electrode_index = {
2179-
unit_name: table_df.query(f"unit_name=='{unit_name}'").index[0] for unit_name in unit_name_array
2180-
}
2176+
# Build a unit_name to units table row index map directly from the table column.
2177+
# This avoids materializing a pandas DataFrame and sidesteps query parsing/casting pitfalls.
2178+
unit_names_in_table = units_table["unit_name"][:]
2179+
unit_name_to_electrode_index = {}
2180+
for index, unit_name in enumerate(unit_names_in_table):
2181+
if unit_name not in unit_name_to_electrode_index:
2182+
unit_name_to_electrode_index[unit_name] = index
21812183

21822184
indices_for_new_data = [unit_name_to_electrode_index[unit_name] for unit_name in unit_name_array]
2183-
indices_for_null_values = table_df.index.difference(indices_for_new_data).values
2185+
new_indices_set = set(indices_for_new_data)
2186+
indices_for_null_values = [index for index in range(unit_table_size) if index not in new_indices_set]
21842187
extending_column = len(indices_for_null_values) > 0
21852188

21862189
# Add properties as columns
@@ -2871,8 +2874,11 @@ def _get_electrode_group_indices(recording, nwbfile):
28712874
if group_names is None:
28722875
electrode_group_indices = None
28732876
else:
2874-
group_names = [str(group_name) for group_name in group_names]
2875-
electrode_group_indices = nwbfile.electrodes.to_dataframe().query(f"group_name in {group_names}").index.values
2877+
group_names_set = {str(group_name) for group_name in group_names}
2878+
table_group_names = nwbfile.electrodes["group_name"][:]
2879+
electrode_group_indices = np.array(
2880+
[index for index, group_name in enumerate(table_group_names) if str(group_name) in group_names_set]
2881+
)
28762882
return electrode_group_indices
28772883

28782884

tests/test_modalities/test_ecephys/test_tools_spikeinterface.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1673,6 +1673,30 @@ def test_non_overwriting_unit_names_sorting_property(self):
16731673
unit_names_in_units_table = list(self.nwbfile.units["unit_name"].data)
16741674
self.assertListEqual(unit_names_in_units_table, expected_unit_names_in_units_table)
16751675

1676+
def test_property_matching_by_unit_name_with_quotes(self):
1677+
"""Ensure matching by unit_name works when names contain apostrophes.
1678+
1679+
This test was added in PR #1666, which removed pandas DataFrame/query matching
1680+
(`to_dataframe().query(...)`) from units-table extension logic. It protects
1681+
against regressions back to query-string-based matching (e.g., `pandas.query`),
1682+
which can fail or mis-parse when unit_name contains quotes.
1683+
"""
1684+
quoted_unit_names = ["unit'a", "unit'b", "unit'c", "unit'd"]
1685+
self.sorting_1.set_property(key="unit_name", values=quoted_unit_names)
1686+
self.sorting_1.set_property(key="property", values=["value_a", "value_b", "value_c", "value_d"])
1687+
1688+
add_sorting_to_nwbfile(sorting=self.sorting_1, nwbfile=self.nwbfile)
1689+
1690+
self.sorting_2.set_property(key="unit_name", values=["unit'c", "unit'd", "unit'e", "unit'f"])
1691+
self.sorting_2.set_property(key="property", values=["value_c2", "value_d2", "value_e", "value_f"])
1692+
1693+
add_sorting_to_nwbfile(sorting=self.sorting_2, nwbfile=self.nwbfile)
1694+
1695+
expected_unit_names = ["unit'a", "unit'b", "unit'c", "unit'd", "unit'e", "unit'f"]
1696+
expected_property_values = ["value_a", "value_b", "value_c", "value_d", "value_e", "value_f"]
1697+
self.assertListEqual(list(self.nwbfile.units["unit_name"].data), expected_unit_names)
1698+
self.assertListEqual(list(self.nwbfile.units["property"].data), expected_property_values)
1699+
16761700
def test_integer_unit_names_overwrite(self):
16771701
"""Ensure unit names merge correctly after appending when unit names are integers."""
16781702
unit_ids = self.base_sorting.get_unit_ids()

0 commit comments

Comments
 (0)