Skip to content

Commit 9d18cac

Browse files
committed
Optimize GNS core performance
1 parent 81690d2 commit 9d18cac

File tree

4 files changed

+127
-119
lines changed

4 files changed

+127
-119
lines changed

gns/learned_simulator.py

Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,13 @@ def __init__(
7474

7575
self._device = device
7676

77+
# Optimized: Register boundary tensor as buffer for automatic device management
78+
# This avoids recreating the tensor on every call to _encoder_preprocessor
79+
self.register_buffer(
80+
'_boundary_tensor',
81+
torch.tensor(boundaries, dtype=torch.float32)
82+
)
83+
7784
def forward(self):
7885
"""Forward hook runs on class instantiation"""
7986
pass
@@ -95,12 +102,11 @@ def _compute_graph_connectivity(
95102
add_self_edges: Boolean flag to include self edge (default: True)
96103
"""
97104
# Specify examples id for particles
98-
batch_ids = torch.cat(
99-
[
100-
torch.LongTensor([i for _ in range(n)])
101-
for i, n in enumerate(nparticles_per_example)
102-
]
103-
).to(self._device)
105+
# Optimized: Use repeat_interleave instead of list comprehension + cat
106+
batch_ids = torch.repeat_interleave(
107+
torch.arange(len(nparticles_per_example), device=self._device, dtype=torch.long),
108+
nparticles_per_example
109+
)
104110

105111
# radius_graph accepts r < radius not r <= radius
106112
# A torch tensor list of source and target nodes with shape (2, nedges)
@@ -161,11 +167,9 @@ def _encoder_preprocessor(
161167
# Normalized clipped distances to lower and upper boundaries.
162168
# boundaries are an array of shape [num_dimensions, 2], where the second
163169
# axis, provides the lower/upper boundaries.
164-
boundaries = (
165-
torch.tensor(self._boundaries, requires_grad=False).float().to(self._device)
166-
)
167-
distance_to_lower_boundary = most_recent_position - boundaries[:, 0][None]
168-
distance_to_upper_boundary = boundaries[:, 1][None] - most_recent_position
170+
# Optimized: Use pre-computed boundary tensor buffer
171+
distance_to_lower_boundary = most_recent_position - self._boundary_tensor[:, 0][None]
172+
distance_to_upper_boundary = self._boundary_tensor[:, 1][None] - most_recent_position
169173
distance_to_boundaries = torch.cat(
170174
[distance_to_lower_boundary, distance_to_upper_boundary], dim=1
171175
)
@@ -193,28 +197,19 @@ def _encoder_preprocessor(
193197
# 31 = 10 (5 velocity sequences*dim) + 4 boundaries + 16 particle embedding + 1 material property
194198

195199
# Collect edge features.
196-
edge_features = []
197-
198-
# Relative displacement and distances normalized to radius
199-
# with shape (nedges, 2)
200-
# normalized_relative_displacements = (
201-
# torch.gather(most_recent_position, 0, senders) -
202-
# torch.gather(most_recent_position, 0, receivers)
203-
# ) / self._connectivity_radius
204-
normalized_relative_displacements = (
205-
most_recent_position[senders, :] - most_recent_position[receivers, :]
206-
) / self._connectivity_radius
207-
208-
# Add relative displacement between two particles as an edge feature
209-
# with shape (nparticles, ndim)
210-
edge_features.append(normalized_relative_displacements)
211-
212-
# Add relative distance between 2 particles with shape (nparticles, 1)
213-
# Edge features has a final shape of (nparticles, ndim + 1)
214-
normalized_relative_distances = torch.norm(
215-
normalized_relative_displacements, dim=-1, keepdim=True
216-
)
217-
edge_features.append(normalized_relative_distances)
200+
# Optimized: Compute displacement and distance together to reduce indexing operations
201+
sender_pos = most_recent_position[senders, :]
202+
receiver_pos = most_recent_position[receivers, :]
203+
relative_displacements = sender_pos - receiver_pos
204+
205+
# Compute the distance from the raw displacements; dividing by the
# (positive scalar) connectivity radius commutes with the norm, so
# normalizing afterwards is equivalent and the norm is taken only once
206+
relative_distances = torch.norm(relative_displacements, dim=-1, keepdim=True)
207+
208+
# Normalize both by connectivity radius
209+
normalized_relative_displacements = relative_displacements / self._connectivity_radius
210+
normalized_relative_distances = relative_distances / self._connectivity_radius
211+
212+
edge_features = [normalized_relative_displacements, normalized_relative_distances]
218213

219214
return (
220215
torch.cat(node_features, dim=-1),

gns/noise_utils.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,27 @@ def get_random_walk_noise_for_position_sequence(
2121
# so to keep `std_last_step` fixed, we apply at each step:
2222
# std_each_step = `std_last_step / np.sqrt(num_input_velocities)`
2323
num_velocities = velocity_sequence.shape[1]
24-
velocity_sequence_noise = torch.randn(list(velocity_sequence.shape)) * (
25-
noise_std_last_step / num_velocities**0.5
26-
)
24+
25+
# Optimized: Create noise directly on same device as input
26+
velocity_sequence_noise = torch.randn(
27+
velocity_sequence.shape,
28+
device=position_sequence.device,
29+
dtype=position_sequence.dtype
30+
) * (noise_std_last_step / num_velocities**0.5)
2731

2832
# Apply the random walk.
2933
velocity_sequence_noise = torch.cumsum(velocity_sequence_noise, dim=1)
3034

3135
# Integrate the noise in the velocity to the positions, assuming
32-
# an Euler intergrator and a dt = 1, and adding no noise to the very first
36+
# an Euler integrator and a dt = 1, and adding no noise to the very first
3337
# position (since that will only be used to calculate the first position
3438
# change).
35-
position_sequence_noise = torch.cat(
36-
[
37-
torch.zeros_like(velocity_sequence_noise[:, 0:1]),
38-
torch.cumsum(velocity_sequence_noise, dim=1),
39-
],
40-
dim=1,
39+
# Optimized: Pre-allocate on correct device
40+
position_sequence_noise = torch.zeros(
41+
(velocity_sequence.shape[0], velocity_sequence.shape[1] + 1, velocity_sequence.shape[2]),
42+
device=position_sequence.device,
43+
dtype=position_sequence.dtype
4144
)
45+
position_sequence_noise[:, 1:] = torch.cumsum(velocity_sequence_noise, dim=1)
4246

4347
return position_sequence_noise

gns/particle_data_loader.py

Lines changed: 52 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -131,41 +131,63 @@ def get_num_features(self):
131131

132132

133133
def collate_fn_sample(batch):
134+
"""Optimized collation function with pre-allocation and minimal copies."""
134135
features, labels = zip(*batch)
135136

136-
position_list = []
137-
particle_type_list = []
138-
material_property_list = []
139-
n_particles_per_example_list = []
140-
141-
for feature in features:
142-
position_list.append(feature[0])
143-
particle_type_list.append(feature[1])
144-
if len(feature) == 4: # If material property is present
145-
material_property_list.append(feature[2])
146-
n_particles_per_example_list.append(feature[3])
137+
# Pre-calculate total particles to avoid reallocation
138+
total_particles = sum(f[0].shape[0] for f in features)
139+
batch_size = len(features)
140+
has_material = len(features[0]) == 4
141+
142+
# Get dimensions from first sample
143+
seq_len = features[0][0].shape[1]
144+
dim = features[0][0].shape[2]
145+
146+
# Pre-allocate tensors with pinned memory for faster GPU transfer
147+
positions = torch.empty((total_particles, seq_len, dim),
148+
dtype=torch.float32, pin_memory=True)
149+
particle_types = torch.empty(total_particles,
150+
dtype=torch.long, pin_memory=True)
151+
n_particles = torch.empty(batch_size,
152+
dtype=torch.long, pin_memory=True)
153+
154+
if has_material:
155+
materials = torch.empty(total_particles,
156+
dtype=torch.float32, pin_memory=True)
157+
158+
# Fill pre-allocated tensors (single copy from numpy)
159+
offset = 0
160+
for i, feature in enumerate(features):
161+
n_part = feature[0].shape[0]
162+
163+
# Direct numpy-to-torch copy
164+
positions[offset:offset+n_part] = torch.from_numpy(feature[0])
165+
particle_types[offset:offset+n_part] = torch.from_numpy(feature[1])
166+
167+
if has_material:
168+
materials[offset:offset+n_part] = torch.from_numpy(feature[2])
169+
n_particles[i] = feature[3]
147170
else:
148-
n_particles_per_example_list.append(feature[2])
149-
150-
collated_features = (
151-
torch.tensor(np.vstack(position_list)).to(torch.float32).contiguous(),
152-
torch.tensor(np.concatenate(particle_type_list)).contiguous(),
153-
torch.tensor(n_particles_per_example_list).contiguous(),
154-
)
155-
156-
if material_property_list:
157-
material_property_tensor = (
158-
torch.tensor(np.concatenate(material_property_list))
159-
.to(torch.float32)
160-
.contiguous()
161-
)
162-
collated_features = (
163-
collated_features[:2] + (material_property_tensor,) + collated_features[2:]
164-
)
171+
n_particles[i] = feature[2]
165172

166-
collated_labels = torch.tensor(np.vstack(labels)).to(torch.float32).contiguous()
173+
offset += n_part
167174

168-
return collated_features, collated_labels
175+
# Build output tuple
176+
if has_material:
177+
collated_features = (positions, particle_types, materials, n_particles)
178+
else:
179+
collated_features = (positions, particle_types, n_particles)
180+
181+
# Labels - same optimization
182+
labels_tensor = torch.empty((total_particles, dim),
183+
dtype=torch.float32, pin_memory=True)
184+
offset = 0
185+
for label in labels:
186+
n_part = label.shape[0]
187+
labels_tensor[offset:offset+n_part] = torch.from_numpy(label)
188+
offset += n_part
189+
190+
return collated_features, labels_tensor
169191

170192

171193
def collate_fn_trajectory(batch):

gns/train.py

Lines changed: 33 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,20 @@ def rollout(
5353
initial_positions = position[:, : cfg.data.input_sequence_length]
5454
ground_truth_positions = position[:, cfg.data.input_sequence_length :]
5555

56-
current_positions = initial_positions
57-
predictions = []
56+
current_positions = initial_positions.clone()
57+
58+
# Pre-allocate predictions tensor to avoid memory fragmentation
59+
n_particles = position.shape[0]
60+
dim = position.shape[-1]
61+
predictions = torch.zeros(
62+
(nsteps, n_particles, dim),
63+
device=device,
64+
dtype=position.dtype
65+
)
66+
67+
# Pre-compute kinematic mask once (static for entire rollout)
68+
kinematic_mask = (particle_types == cfg.data.kinematic_particle_id).bool()
69+
kinematic_mask_expanded = kinematic_mask[:, None].expand(-1, dim)
5870

5971
for step in tqdm(range(nsteps), total=nsteps):
6072
# Get next position with shape (nnodes, dim)
@@ -66,29 +78,17 @@ def rollout(
6678
)
6779

6880
# Update kinematic particles from prescribed trajectory.
69-
kinematic_mask = (
70-
(particle_types == cfg.data.kinematic_particle_id)
71-
.clone()
72-
.detach()
73-
.to(device)
74-
)
7581
next_position_ground_truth = ground_truth_positions[:, step]
76-
kinematic_mask = kinematic_mask.bool()[:, None].expand(
77-
-1, current_positions.shape[-1]
78-
)
7982
next_position = torch.where(
80-
kinematic_mask, next_position_ground_truth, next_position
83+
kinematic_mask_expanded, next_position_ground_truth, next_position
8184
)
82-
predictions.append(next_position)
8385

84-
# Shift `current_positions`, removing the oldest position in the sequence
85-
# and appending the next position at the end.
86-
current_positions = torch.cat(
87-
[current_positions[:, 1:], next_position[:, None, :]], dim=1
88-
)
86+
# Store prediction in pre-allocated tensor
87+
predictions[step] = next_position
8988

90-
# Predictions with shape (time, nnodes, dim)
91-
predictions = torch.stack(predictions)
89+
# Shift `current_positions` in-place
90+
current_positions[:, :-1] = current_positions[:, 1:].clone()
91+
current_positions[:, -1] = next_position
9292
ground_truth_positions = ground_truth_positions.permute(1, 0, 2)
9393

9494
loss = (predictions - ground_truth_positions) ** 2
@@ -577,41 +577,28 @@ def train(rank, cfg, world_size, device, verbose, use_dist):
577577
labels,
578578
) = prepare_data(example, device_id)
579579

580-
n_particles_per_example = n_particles_per_example.to(device_id)
581-
labels = labels.to(device_id)
582-
583-
sampled_noise = (
584-
noise_utils.get_random_walk_noise_for_position_sequence(
585-
position, noise_std_last_step=cfg.data.noise_std
586-
).to(device_id)
587-
)
588-
non_kinematic_mask = (
589-
(particle_type != cfg.data.kinematic_particle_id)
590-
.clone()
591-
.detach()
592-
.to(device_id)
580+
# Optimized: Data already on device_id from prepare_data, no need to transfer again
581+
# Noise is now created directly on correct device (see noise_utils.py optimization)
582+
sampled_noise = noise_utils.get_random_walk_noise_for_position_sequence(
583+
position, noise_std_last_step=cfg.data.noise_std
593584
)
585+
# Optimized: Comparison already creates new tensor, no need for clone/detach
586+
non_kinematic_mask = (particle_type != cfg.data.kinematic_particle_id)
594587
sampled_noise *= non_kinematic_mask.view(-1, 1, 1)
595588

596-
device_or_rank = rank if device == torch.device("cuda") else device
597589
predict_fn = (
598590
simulator.module.predict_accelerations
599591
if use_dist
600592
else simulator.predict_accelerations
601593
)
594+
# Optimized: All tensors already on correct device, no transfers needed
602595
pred_acc, target_acc = predict_fn(
603-
next_positions=labels.to(device_or_rank),
604-
position_sequence_noise=sampled_noise.to(device_or_rank),
605-
position_sequence=position.to(device_or_rank),
606-
nparticles_per_example=n_particles_per_example.to(
607-
device_or_rank
608-
),
609-
particle_types=particle_type.to(device_or_rank),
610-
material_property=(
611-
material_property.to(device_or_rank)
612-
if n_features == 3
613-
else None
614-
),
596+
next_positions=labels,
597+
position_sequence_noise=sampled_noise,
598+
position_sequence=position,
599+
nparticles_per_example=n_particles_per_example,
600+
particle_types=particle_type,
601+
material_property=material_property if n_features == 3 else None,
615602
)
616603

617604
if (

0 commit comments

Comments
 (0)