[ENH] make loading large studysets faster (#1007)

jdkent · web-flow · commit 27678e8cd840 · 2026-04-21T01:18:58.000-05:00
* make loading large studysets faster

* respond to review
diff --git a/nimare/_studyset_store.py b/nimare/_studyset_store.py
@@ -136,6 +136,14 @@ def _cached_default_masker(target):
 
 def _apply_annotation_payloads(source_dict, annotation_payloads):
     """Apply top-level annotation notes into analysis-level annotation dictionaries."""
+    if annotation_payloads is None:
+        return source_dict
+
+    annotation_payloads = _coerce_annotation_payloads(annotation_payloads)
+    if not annotation_payloads:
+        source_dict["annotations"] = []
+        return source_dict
+
     analysis_map = {}
     for study in source_dict.get("studies", []):
         for analysis in study.get("analyses", []):
@@ -247,12 +255,27 @@ def _build_tables_from_source(source_dict):
     studies_rows = []
     analyses_rows = []
     ids = []
-    coordinate_rows = []
     image_rows = []
     metadata_rows = []
     annotation_rows = []
     text_rows = []
 
+    # Coordinate rows: collected as parallel column-arrays for fast DataFrame construction.
+    # POINT_RELATIONSHIP_COLUMNS are collected as lists and added only if any value is not None.
+    # The resulting DataFrame is canonicalized by a stable sort on 'id', making
+    # coordinate ordering explicit while preserving original order for rows with identical ids.
+    coord_ids_acc: list = []
+    coord_study_ids_acc: list = []
+    coord_contrast_ids_acc: list = []
+    coord_xs: list = []
+    coord_ys: list = []
+    coord_zs: list = []
+    coord_spaces: list = []
+    prc_lists: dict = {col: [] for col in POINT_RELATIONSHIP_COLUMNS}
+    prc_seen: dict = {col: False for col in POINT_RELATIONSHIP_COLUMNS}
+    # Truly sparse extras (point values, coordinate_metadata): (row_index, {col: val})
+    coord_sparse_extras: list = []
+
     for study in source_dict.get("studies", []):
         study_id = str(study["id"])
         studies_rows.append(
@@ -286,29 +309,30 @@ def _build_tables_from_source(source_dict):
                 "journal": study.get("publication", ""),
                 "name": f"{study_name}-{analysis_name}",
             }
-            study_metadata = study.get("metadata", {}) or {}
-            analysis_metadata = analysis.get("metadata", {}) or {}
+            study_metadata = study.get("metadata") or {}
+            analysis_metadata = analysis.get("metadata") or {}
             coordinate_metadata, coordinate_metadata_keys = _extract_coordinate_row_metadata(
                 analysis_metadata,
                 len(analysis.get("points", []) or []),
             )
-            combined_metadata = copy.deepcopy(study_metadata)
-            combined_metadata.update(copy.deepcopy(analysis_metadata))
-            combined_metadata.pop("sample_sizes", None)
-            combined_metadata.pop("sample_size", None)
-            for key in coordinate_metadata_keys:
-                combined_metadata.pop(key, None)
-            sample_sizes = _extract_coerced_sample_sizes(
-                [
-                    ("sample_sizes", analysis_metadata.get("sample_sizes")),
-                    ("sample_size", analysis_metadata.get("sample_size")),
-                    ("sample_sizes", study_metadata.get("sample_sizes")),
-                    ("sample_size", study_metadata.get("sample_size")),
-                ]
-            )
-            if sample_sizes:
-                combined_metadata["sample_sizes"] = sample_sizes
-            metadata_row.update(combined_metadata)
+            if study_metadata or analysis_metadata:
+                combined_metadata = copy.deepcopy(study_metadata)
+                combined_metadata.update(copy.deepcopy(analysis_metadata))
+                combined_metadata.pop("sample_sizes", None)
+                combined_metadata.pop("sample_size", None)
+                for key in coordinate_metadata_keys:
+                    combined_metadata.pop(key, None)
+                sample_sizes = _extract_coerced_sample_sizes(
+                    [
+                        ("sample_sizes", analysis_metadata.get("sample_sizes")),
+                        ("sample_size", analysis_metadata.get("sample_size")),
+                        ("sample_sizes", study_metadata.get("sample_sizes")),
+                        ("sample_size", study_metadata.get("sample_size")),
+                    ]
+                )
+                if sample_sizes:
+                    combined_metadata["sample_sizes"] = sample_sizes
+                metadata_row.update(combined_metadata)
             metadata_rows.append(metadata_row)
 
             annotation_row = dict(base_row)
@@ -320,7 +344,9 @@ def _build_tables_from_source(source_dict):
             annotation_rows.append(annotation_row)
 
             text_row = dict(base_row)
-            text_row.update(copy.deepcopy(analysis.get("texts", {}) or {}))
+            texts = analysis.get("texts") or {}
+            if texts:
+                text_row.update(copy.deepcopy(texts))
             text_rows.append(text_row)
 
             image_row = dict(base_row)
@@ -340,37 +366,68 @@ def _build_tables_from_source(source_dict):
 
             for i_point, point in enumerate(analysis.get("points", []) or []):
                 coords = point.get("coordinates", [None, None, None])
-                coordinate_row = {
-                    **base_row,
-                    "x": float(coords[0]),
-                    "y": float(coords[1]),
-                    "z": float(coords[2]),
-                    "space": point.get("space"),
-                }
-                for column in POINT_RELATIONSHIP_COLUMNS:
-                    value = point.get(column)
-                    if value is not None:
-                        coordinate_row[column] = value
-
+                coord_ids_acc.append(full_id)
+                coord_study_ids_acc.append(study_id)
+                coord_contrast_ids_acc.append(contrast_id)
+                coord_xs.append(coords[0])
+                coord_ys.append(coords[1])
+                coord_zs.append(coords[2])
+                coord_spaces.append(point.get("space"))
+
+                for col in POINT_RELATIONSHIP_COLUMNS:
+                    val = point.get(col)
+                    prc_lists[col].append(val)
+                    if val is not None:
+                        prc_seen[col] = True
+
+                extra: dict = {}
                 for point_value in point.get("values", []) or []:
                     if not isinstance(point_value, dict):
                         continue
                     column = _point_value_kind_to_coordinate_column(point_value.get("kind"))
                     value = point_value.get("value")
                     if column is not None and value is not None:
-                        coordinate_row[column] = value
-
+                        extra[column] = value
                 for column, values in coordinate_metadata.items():
-                    coordinate_row[column] = values[i_point]
-
-                coordinate_rows.append(coordinate_row)
+                    extra[column] = values[i_point]
+                if extra:
+                    coord_sparse_extras.append((len(coord_ids_acc) - 1, extra))
+
+    n_coord = len(coord_ids_acc)
+    if n_coord:
+        coord_frame: dict = {
+            "id": coord_ids_acc,
+            "study_id": coord_study_ids_acc,
+            "contrast_id": coord_contrast_ids_acc,
+            "x": np.asarray(coord_xs, dtype=float),
+            "y": np.asarray(coord_ys, dtype=float),
+            "z": np.asarray(coord_zs, dtype=float),
+            "space": coord_spaces,
+        }
+        for col in POINT_RELATIONSHIP_COLUMNS:
+            if prc_seen[col]:
+                coord_frame[col] = prc_lists[col]
+        if coord_sparse_extras:
+            extra_cols: dict = {}
+            for row_idx, extra in coord_sparse_extras:
+                for col, val in extra.items():
+                    if col not in extra_cols:
+                        extra_cols[col] = [None] * n_coord
+                    extra_cols[col][row_idx] = val
+            coord_frame.update(extra_cols)
+        id_arr = np.asarray(coord_ids_acc, dtype=str)
+        coord_frame["id"] = id_arr
+        sort_order = np.argsort(id_arr, kind="stable")
+        coord_df = pd.DataFrame(coord_frame).iloc[sort_order].reset_index(drop=True)
+    else:
+        coord_df = pd.DataFrame(columns=_ID_COLS + ["x", "y", "z", "space"])
 
     ids = np.sort(np.asarray(ids, dtype=str))
     return {
         "studies": _rows_to_df(studies_rows, ["study_id", "name", "authors", "publication"]),
         "analyses": _rows_to_df(analyses_rows, _ID_COLS + ["name"]),
         "ids": ids,
-        "coordinates": _rows_to_df(coordinate_rows, _ID_COLS + ["x", "y", "z", "space"]),
+        "coordinates": coord_df,
         "images": _rows_to_df(image_rows, _ID_COLS, normalize_none_strings=True),
         "metadata": _rows_to_df(metadata_rows, _ID_COLS, normalize_none_strings=True),
         "annotations": _rows_to_df(annotation_rows, _ID_COLS, normalize_none_strings=True),
@@ -497,11 +554,8 @@ def from_source_dict(
             target,
             harmonize_coordinates=harmonize_coordinates,
         )
-        annotation_payloads = (
-            _coerce_annotation_payloads(annotation_payloads)
-            if annotation_payloads is not None
-            else _coerce_annotation_payloads(source_dict.get("annotations", []))
-        )
+        if annotation_payloads is None:
+            annotation_payloads = source_dict.get("annotations", [])
         source_dict = _apply_annotation_payloads(source_dict, annotation_payloads)
         return cls(
             source_dict["id"],
@@ -683,7 +737,7 @@ def selected_source_dict(self, selected_full_ids=None):
             }
 
         if selected_full_ids is None:
-            return copy.deepcopy(source_dict)
+            return _structural_copy_source_dict(source_dict)
 
         selected_ids = set(self.selected_ids(selected_full_ids).tolist())
         if not selected_ids:
diff --git a/nimare/nimads.py b/nimare/nimads.py
@@ -166,9 +166,11 @@ def __init__(
         if target is _UNSET:
             target = "mni152_2mm"
 
-        # load source as json
+        # load source as json; track ownership so the structural copy can be skipped
+        _owned = False
         if isinstance(source, str):
             source = load_json(source)
+            _owned = True  # freshly parsed — no other reference exists
 
         _validate_studyset_source(source)
 
@@ -184,6 +186,7 @@ def __init__(
             annotation_payloads=annotation_payloads,
             target=target,
             harmonize_coordinates=harmonize_coordinates,
+            _owned=_owned,
         )
         execution_profile = StudysetExecutionProfile(
             target=target,
@@ -929,21 +932,23 @@ def from_sleuth(cls, sleuth_file):
 
     def combine_analyses(self):
         """Combine analyses in Studyset."""
-        studyset = self.copy()
-        for study in studyset.studies:
-            if len(study.analyses) > 1:
-                source_lst = [analysis.to_dict() for analysis in study.analyses]
-                ids = [source["id"] for source in source_lst]
-                names = [source["name"] for source in source_lst]
-                conditions = [source.get("conditions", []) for source in source_lst]
-                images = [source.get("images", []) for source in source_lst]
-                points = [source.get("points", []) for source in source_lst]
-                weights = [source.get("weights", []) for source in source_lst]
-                metadata = [source.get("metadata", {}) for source in source_lst]
-                annotations = [source.get("annotations", {}) for source in source_lst]
-                texts = [source.get("texts", {}) for source in source_lst]
-
-                new_source = {
+        from nimare._studyset_store import StudysetStore
+
+        source_dict = self.to_dict()
+        for study in source_dict.get("studies", []):
+            analyses = study.get("analyses", [])
+            if len(analyses) > 1:
+                ids = [a["id"] for a in analyses]
+                names = [a.get("name", "") for a in analyses]
+                conditions = [a.get("conditions", []) for a in analyses]
+                images = [a.get("images", []) for a in analyses]
+                points = [a.get("points", []) for a in analyses]
+                weights = [a.get("weights", []) for a in analyses]
+                metadata = [a.get("metadata", {}) for a in analyses]
+                annotations = [a.get("annotations", {}) for a in analyses]
+                texts = [a.get("texts", {}) for a in analyses]
+
+                new_analysis = {
                     "id": "_".join(ids),
                     "name": "; ".join(names),
                     "conditions": [cond for c_list in conditions for cond in c_list],
@@ -957,16 +962,23 @@ def combine_analyses(self):
                 }
                 combined_texts = {k: v for text_dict in texts for k, v in text_dict.items()}
                 if combined_annotations:
-                    new_source["annotations"] = combined_annotations
+                    new_analysis["annotations"] = combined_annotations
                 if combined_texts:
-                    new_source["texts"] = combined_texts
-                study.analyses = [Analysis(new_source)]
+                    new_analysis["texts"] = combined_texts
+                study["analyses"] = [new_analysis]
 
-        # Old Analysis objects are gone; Annotation notes hold dead weak references.
-        # Clear top-level annotations so touch() can rebuild cleanly.
-        studyset._annotations = []
-        studyset.touch()
-        return studyset
+        # Drop annotation payloads: they reference pre-merge analysis IDs and
+        # can't be mapped to the new merged IDs, so clear them (matching the
+        # original copy+touch behaviour that set _annotations = []).
+        source_dict.pop("annotations", None)
+        store = StudysetStore.from_source_dict(
+            source_dict,
+            annotation_payloads=[],
+            target=None,
+            harmonize_coordinates=False,
+            _owned=True,
+        )
+        return self.__class__._from_store(store, self._copy_execution_profile())
 
     def to_nimads(self, filename):
         """Write the Studyset to a NIMADS JSON file."""
diff --git a/nimare/results.py b/nimare/results.py
@@ -233,15 +233,23 @@ def save_tables(self, output_dir=".", prefix="", prefix_sep="_", names=None):
             else:
                 LGR.warning(f"Table {tabletype} is None. Not saving.")
 
+    def _set_description(self, desc):
+        self.__description = desc
+        self.bibtex_ = "" if not desc else get_description_references(desc)
+
     def copy(self):
         """Return copy of result object."""
-        new = MetaResult(
-            estimator=self.estimator,
-            corrector=self.corrector,
-            diagnostics=self.diagnostics,
-            mask=self.masker,
-            maps=copy.deepcopy(self.maps),
-            tables=copy.deepcopy(self.tables),
-            description=self.description_,
-        )
+        new = object.__new__(MetaResult)
+        # Deep copy the estimator so that corrected results can update estimator state
+        # without mutating the original MetaResult or estimator.
+        new.estimator = copy.deepcopy(self.estimator)
+        new.corrector = self.corrector
+        new.diagnostics = self.diagnostics
+        new.masker = self.masker
+        new.maps = copy.deepcopy(self.maps)
+        new.tables = copy.deepcopy(self.tables)
+        new.metadata = {}
+        # Uses _set_description rather than the description_ setter; the text is unchanged
+        # after fit. NOTE(review): bibtex_ is still re-derived here, not reused — confirm.
+        new._set_description(self.description_)
         return new