
Commit 8127013

fixing various typos in diverse texts (#40)
1 parent b98e030 commit 8127013


8 files changed: +27 -25 lines changed


README.md

Lines changed: 7 additions & 5 deletions
@@ -15,7 +15,7 @@ Rabbat*, Nicolas Ballas*
 
 Official Pytorch codebase for V-JEPA 2 and V-JEPA 2-AC.
 
-V-JEPA 2 is a self-supervised approach to training video encoders, using internet-scale video data, that attains state-of-the-art performance on motion understanding and human action anticpation tasks. V-JEPA 2-AC is a latent action-conditioned world model post-trained from V-JEPA 2 (using a small amount of robot trajectory interaction data) that solves robot manipulation tasks without environment-specific data collection or task-specific training or calibration.
+V-JEPA 2 is a self-supervised approach to training video encoders, using internet-scale video data, that attains state-of-the-art performance on motion understanding and human action anticipation tasks. V-JEPA 2-AC is a latent action-conditioned world model post-trained from V-JEPA 2 (using a small amount of robot trajectory interaction data) that solves robot manipulation tasks without environment-specific data collection or task-specific training or calibration.
 
 <p align="center">
 <img src="assets/flowchart.png" width=100%>
@@ -67,7 +67,7 @@ V-JEPA 2 is a self-supervised approach to training video encoders, using interne
 
 ## V-JEPA 2-AC Post-training
 
-**(Top)** After post-training with a small amount of robot data, we can deploy the model on a robot arm in new environments, and tackle foundational tasks like reaching, grasping, and pick-and-place by planning from image goals. **(Bottom)** Performance on robot maniuplation tasks using a Franka arm, with input provided through a monocular RGB camera.
+**(Top)** After post-training with a small amount of robot data, we can deploy the model on a robot arm in new environments, and tackle foundational tasks like reaching, grasping, and pick-and-place by planning from image goals. **(Bottom)** Performance on robot manipulation tasks using a Franka arm, with input provided through a monocular RGB camera.
 
 <img align="left" src="https://github.com/user-attachments/assets/c5d42221-0102-4216-911d-061a4369a805" width=65%>&nbsp;
 <table>
@@ -278,8 +278,10 @@ import torch
 vjepa2_encoder, vjepa2_ac_predictor = torch.hub.load('facebookresearch/vjepa2', 'vjepa2_ac_vit_giant')
 ```
 
-See [energy_landscape_example.ipynb](notebooks/energy_landscape_example.ipynb) for an example notebook computing the energy landscape of the pretrained action-conditioned backbone using a robot trajectory collected from our lab.
-To run this notebook, you'll need to aditionally install [Jupyter](https://jupyter.org/install) and [Scipy](https://scipy.org/install/) in your conda environment.
+
+See [energy_landscape_example.ipynb](notebooks/vjepa_droid/energy_landscape.ipynb) for an example notebook computing the energy landscape of the pretrained action-conditioned backbone using a robot trajectory collected from our lab.
+To run this notebook, you'll need to additionally install [Jupyter](https://jupyter.org/install) and [Scipy](https://scipy.org/install/) in your conda environment.
+
 
 ## Getting Started
 
@@ -316,7 +318,7 @@ Probe-based evaluation consists in training an attentive probe on top of frozen
 
 Evaluations can be run either locally, or distributed via SLURM. (Running locally is useful for debugging and validation).
 These sample commands launch Something-Something v2 video classification; other evals are launched by specifying the corresponding config.
-Use provided training configs under "Evaluation Attentive Probes". These configs allow to train multiple probes in parrallel with various optimization parameters.
+Use provided training configs under "Evaluation Attentive Probes". These configs allow to train multiple probes in parallel with various optimization parameters.
 Change filepaths as needed (e.g. `folder`, `checkpoint`, `dataset_train`, `dataset_val`) to match locations of data and downloaded checkpoints on your local filesystem.
 Change \# nodes and local batch size as needed to not exceed available GPU memory.
 
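One of the README hunks above touches the snippet that loads the action-conditioned backbone through torch.hub. As a minimal sketch of that usage (assuming network access and an environment with the repository's dependencies; the parameter-count check is only an illustrative sanity check, not something the README prescribes):

```python
import torch

# Entry point shown in the README hunk above; downloads the released checkpoint.
encoder, ac_predictor = torch.hub.load("facebookresearch/vjepa2", "vjepa2_ac_vit_giant")
encoder.eval()

# Quick sanity check that weights were fetched: report the encoder's parameter count.
n_params = sum(p.numel() for p in encoder.parameters())
print(f"V-JEPA 2 encoder parameters: {n_params / 1e6:.1f}M")
```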
evals/video_classification_frozen/modelcustom/vit_encoder_multiclip.py

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ def init_module(
 
 class ClipAggregation(nn.Module):
     """
-    Process each clip indepdnently and concatenate all tokens
+    Process each clip independently and concatenate all tokens
     """
 
     def __init__(

evals/video_classification_frozen/modelcustom/vit_encoder_multiclip_multilevel.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def init_module(
 
 class ClipAggregation(nn.Module):
     """
-    Process each clip indepdnently and concatenate all tokens
+    Process each clip independently and concatenate all tokens
     """
 
     def __init__(
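The docstring fixed in both files describes the same idea: each clip is pushed through the frozen encoder on its own, and the resulting token sequences are concatenated before the attentive probe sees them. The sketch below only illustrates that pattern under assumed tensor shapes; it is not the repository's ClipAggregation implementation.

```python
import torch
from torch import nn


class ClipAggregationSketch(nn.Module):
    """Illustration: encode each clip independently, then concatenate all tokens."""

    def __init__(self, encoder: nn.Module):
        super().__init__()
        self.encoder = encoder

    def forward(self, clips: torch.Tensor) -> torch.Tensor:
        # Assumed layout: (batch, num_clips, channels, frames, height, width).
        b, n = clips.shape[:2]
        flat = clips.flatten(0, 1)        # fold clips into the batch dimension
        tokens = self.encoder(flat)       # (batch * num_clips, num_tokens, dim)
        return tokens.view(b, n * tokens.shape[1], tokens.shape[2])
```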

src/datasets/utils/video/transforms.py

Lines changed: 10 additions & 10 deletions
@@ -97,9 +97,9 @@ def random_short_side_scale_jitter(images, min_size, max_size, boxes=None, inver
 
 def crop_boxes(boxes, x_offset, y_offset):
     """
-    Peform crop on the bounding boxes given the offsets.
+    Perform crop on the bounding boxes given the offsets.
     Args:
-        boxes (ndarray or None): bounding boxes to peform crop. The dimension
+        boxes (ndarray or None): bounding boxes to perform crop. The dimension
             is `num boxes` x 4.
         x_offset (int): cropping offset in the x axis.
         y_offset (int): cropping offset in the y axis.
@@ -150,7 +150,7 @@ def horizontal_flip(prob, images, boxes=None):
     """
     Perform horizontal flip on the given images and corresponding boxes.
     Args:
-        prob (float): probility to flip the images.
+        prob (float): probability to flip the images.
         images (tensor): images to perform horizontal flip, the dimension is
             `num frames` x `channel` x `height` x `width`.
         boxes (ndarray or None): optional. Corresponding boxes to images.
@@ -193,7 +193,7 @@ def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
             crop if height is larger than width.
         boxes (ndarray or None): optional. Corresponding boxes to images.
             Dimension is `num boxes` x 4.
-        scale_size (int): optinal. If not None, resize the images to scale_size before
+        scale_size (int): optimal. If not None, resize the images to scale_size before
             performing any crop.
     Returns:
         cropped (tensor): images with dimension of
@@ -296,7 +296,7 @@ def grayscale(images):
 
 def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
     """
-    Perfrom a color jittering on the input images. The channels of images
+    Perform a color jittering on the input images. The channels of images
     should be in order BGR.
     Args:
         images (tensor): images to perform color jitter. Dimension is
@@ -331,7 +331,7 @@ def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
 
 def brightness_jitter(var, images):
     """
-    Perfrom brightness jittering on the input images. The channels of images
+    Perform brightness jittering on the input images. The channels of images
     should be in order BGR.
     Args:
         var (float): jitter ratio for brightness.
@@ -350,7 +350,7 @@ def brightness_jitter(var, images):
 
 def contrast_jitter(var, images):
     """
-    Perfrom contrast jittering on the input images. The channels of images
+    Perform contrast jittering on the input images. The channels of images
     should be in order BGR.
     Args:
         var (float): jitter ratio for contrast.
@@ -370,7 +370,7 @@ def contrast_jitter(var, images):
 
 def saturation_jitter(var, images):
     """
-    Perfrom saturation jittering on the input images. The channels of images
+    Perform saturation jittering on the input images. The channels of images
     should be in order BGR.
     Args:
         var (float): jitter ratio for saturation.
@@ -435,15 +435,15 @@ def lighting_jitter(images, alphastd, eigval, eigvec):
 
 def color_normalization(images, mean, stddev):
     """
-    Perform color nomration on the given images.
+    Perform color normation on the given images.
     Args:
         images (tensor): images to perform color normalization. Dimension is
             `num frames` x `channel` x `height` x `width`.
         mean (list): mean values for normalization.
        stddev (list): standard deviations for normalization.
 
     Returns:
-        out_images (tensor): the noramlized images, the dimension is
+        out_images (tensor): the normalized images, the dimension is
             `num frames` x `channel` x `height` x `width`.
     """
     if len(images.shape) == 3:
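Several of the docstrings fixed above describe box-aware crops; the crop_boxes behavior, for instance, is just a shift of the (x1, y1, x2, y2) coordinates by the crop offsets. A small illustrative sketch of that idea (not the file's actual code):

```python
import numpy as np


def crop_boxes_sketch(boxes: np.ndarray, x_offset: int, y_offset: int) -> np.ndarray:
    """Re-express (x1, y1, x2, y2) boxes in the coordinate frame of the crop."""
    cropped = boxes.copy()
    cropped[:, [0, 2]] -= x_offset  # shift x coordinates
    cropped[:, [1, 3]] -= y_offset  # shift y coordinates
    return cropped


boxes = np.array([[30.0, 40.0, 120.0, 160.0]])
print(crop_boxes_sketch(boxes, x_offset=10, y_offset=10))  # -> [[20. 30. 110. 150.]]
```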

src/datasets/utils/weighted_sampler.py

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ def __next__(self) -> int:
 
         # In order to avoid sampling the same example multiple times between the ranks,
         # we limit each rank to a subset of the total number of samples in the dataset.
-        # For example if our dataet is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], and we have 2 ranks,
+        # For example if our dataset is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], and we have 2 ranks,
         # then rank 0 will ONLY sample from [0, 2, 4, 6, 8], and rank 1 from [1, 3, 5, 7, 9].
         # In each iteration we first produce `in_rank_sample` which is the sample index in the rank,
         # based on the size of the subset which that rank can sample from.
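The comment corrected here is the heart of the sampler's de-duplication logic: with world_size ranks, rank r only ever draws from every world_size-th sample starting at offset r. A tiny sketch of that indexing (a hypothetical helper, not the sampler's code), reproducing the comment's 10-sample, 2-rank example:

```python
def rank_subset(dataset_size: int, rank: int, world_size: int) -> list:
    """Sample indices a given rank may draw from: offset by rank, strided by world_size."""
    return list(range(rank, dataset_size, world_size))


print(rank_subset(10, rank=0, world_size=2))  # [0, 2, 4, 6, 8]
print(rank_subset(10, rank=1, world_size=2))  # [1, 3, 5, 7, 9]
```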

src/datasets/utils/worker_init_fn.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This code originally comes from PyTorch Lighting with some light modificaitons:
+# This code originally comes from PyTorch Lighting with some light modifications:
 # https://github.com/Lightning-AI/pytorch-lightning/blob/a944e7744e57a5a2c13f3c73b9735edf2f71e329/src/lightning/fabric/utilities/seed.py
 
 

src/models/vision_transformer.py

Lines changed: 2 additions & 2 deletions
@@ -218,7 +218,7 @@ def interpolate_pos_encoding(self, x, pos_embed):
 
         if self.is_video:
 
-            # If pos_embed already corret size, just return
+            # If pos_embed already correct size, just return
             _, _, T, H, W = x.shape
             if H == self.img_height and W == self.img_width and T == self.num_frames:
                 return pos_embed
@@ -254,7 +254,7 @@ def interpolate_pos_encoding(self, x, pos_embed):
 
         else:
 
-            # If pos_embed already corret size, just return
+            # If pos_embed already correct size, just return
             _, _, H, W = x.shape
             if H == self.img_height and W == self.img_width:
                 return pos_embed
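Both corrected comments guard the same pattern in interpolate_pos_encoding: if the input already matches the resolution the positional embeddings were built for, return them untouched; otherwise resize them to the new patch grid. The sketch below illustrates that early-return-then-interpolate idea for the image branch, with assumed shapes and helper names; it is not the repository's implementation.

```python
import torch
import torch.nn.functional as F


def resize_pos_embed_sketch(pos_embed: torch.Tensor, grid_hw: tuple, new_hw: tuple) -> torch.Tensor:
    """pos_embed: (1, grid_h * grid_w, dim) learned embeddings for a patch grid."""
    if new_hw == grid_hw:
        # Already the correct size, just return (the case the fixed comments refer to).
        return pos_embed
    dim = pos_embed.shape[-1]
    grid = pos_embed.reshape(1, grid_hw[0], grid_hw[1], dim).permute(0, 3, 1, 2)
    grid = F.interpolate(grid, size=new_hw, mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(1, new_hw[0] * new_hw[1], dim)


pe = torch.randn(1, 16 * 16, 384)
print(resize_pos_embed_sketch(pe, (16, 16), (20, 20)).shape)  # torch.Size([1, 400, 384])
```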

tests/datasets/test_vjepa_transforms.py

Lines changed: 4 additions & 4 deletions
@@ -37,14 +37,14 @@ class TestVideoTransformFunctionalCrop(unittest.TestCase):
     def test_tensor_numpy(self):
         T, C, H, W = 16, 3, 280, 320
         shape = (T, C, H, W)
-        crop_szie = (10, 10, 224, 224)
+        crop_size = (10, 10, 224, 224)
         video_tensor = torch.randint(low=0, high=255, size=shape, dtype=torch.uint8)
         video_numpy = video_tensor.numpy()
 
-        cropped_tensor = functional.crop_clip(video_tensor, *crop_szie)
+        cropped_tensor = functional.crop_clip(video_tensor, *crop_size)
         self.assertIsInstance(cropped_tensor[0], torch.Tensor)
 
-        cropped_np_array = functional.crop_clip(video_numpy, *crop_szie)
+        cropped_np_array = functional.crop_clip(video_numpy, *crop_size)
         self.assertIsInstance(cropped_np_array[0], np.ndarray)
 
         for clip_tensor, clip_np in zip(cropped_tensor, cropped_np_array):
@@ -72,7 +72,7 @@ def test_tensor_numpy(self):
             clip_tensor = clip_tensor.permute(1, 2, 0)
             diff = torch.mean((torch.abs(clip_tensor - torch.Tensor(clip_np).to(torch.int16))) / (clip_tensor + 1))
 
-            # Transformatinos can not exactly match because of their interpolation functions coming from
+            # Transformations can not exactly match because of their interpolation functions coming from
             # two different sources. Here we check for their relative differences.
             # See the discussion here: https://github.com/fairinternal/jepa-internal/pull/65#issuecomment-2101833959
             self.assertLess(diff, 0.05)

0 commit comments
