Commit 4a0bdea

Author: GitLab Mirror Bot (committed)
Bug fixes and stability improvements
1 parent: b867572

File tree: 12 files changed, +94 −159 lines


README.md

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ Details of the platform is described in the [Cosmos paper](https://research.nvid
 - [Video tokenizers](cosmos1/models/tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
 - Video curation pipeline for building your own video dataset. [Coming soon]
 - [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setup.
-- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](cosmos1/models/tokenizer/nemo/README.md)].

 ## Model Family

cosmos1/models/autoregressive/modules/embedding.py

Lines changed: 1 addition & 1 deletion
@@ -467,7 +467,7 @@ def __init__(

     def forward(
         self,
-        training_type: str = None,
+        training_type: str | None = None,
     ) -> torch.Tensor:
         T, H, W = self.latent_shape
         emb = torch.cat(

cosmos1/models/autoregressive/nemo/cosmos.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ def __init__(
         self,
         seq_len: int,
         kv_channels: int,
-        training_type: str = None,
+        training_type: str | None = None,
         rotary_base: int = 10000,
         use_cpu_initialization: bool = False,
         latent_shape=[5, 40, 64],
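
The two hunks above make the same fix in both files: annotating a parameter that defaults to `None` as plain `str` is incorrect, since `None` is not a `str`. A minimal standalone sketch of the corrected pattern (the function and values here are illustrative, not from the repo):

```python
import torch


def rope_frequencies(kv_channels: int, training_type: str | None = None) -> torch.Tensor:
    # `str | None` (PEP 604 union syntax, Python 3.10+) tells type checkers that
    # omitting the argument, or passing None explicitly, is intended.
    base = 10000.0 if training_type is None else 500_000.0
    return 1.0 / (base ** (torch.arange(0, kv_channels, 2).float() / kv_channels))
```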

cosmos1/models/autoregressive/nemo/cosmos_video2world.py

Lines changed: 28 additions & 113 deletions
@@ -19,7 +19,6 @@
 from typing import TYPE_CHECKING, Annotated, Callable, Dict, Optional, Union

 import torch
-import torch.nn.functional as F
 from megatron.core import tensor_parallel
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
@@ -42,20 +41,24 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
 from megatron.core.utils import make_viewless_tensor
-from torch import Tensor, nn
-
-from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig, CosmosConfig4B, CosmosModel, RotaryEmbedding3D
+from torch import nn
+
+from cosmos1.models.autoregressive.nemo.cosmos import (
+    CosmosConfig,
+    CosmosConfig4B,
+    CosmosConfig12B,
+    CosmosModel,
+    RotaryEmbedding3D,
+)
 from cosmos1.models.autoregressive.nemo.inference.inference_controller import CosmosInferenceWrapper
 from cosmos1.utils import log

 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec

-from megatron.core import InferenceParams
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.transformer_block import TransformerBlock
 from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank
-from nemo.collections.llm.gpt.model.llama import Llama3Config
 from nemo.collections.llm.utils import Config
 from nemo.lightning import OptimizerModule, io
 from nemo.lightning.base import teardown
@@ -64,30 +67,16 @@
 class CosmosTransformerBlock(TransformerBlock):
     def forward(
         self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
+        *args,
         packed_seq_params: PackedSeqParams = None,
         extra_positional_embeddings=None,
+        **kwargs,
     ):
         packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
         return super().forward(
-            hidden_states,
-            attention_mask,
-            context,
-            context_mask,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            attention_bias,
-            inference_params,
-            packed_seq_params,
+            *args,
+            packed_seq_params=packed_seq_params,
+            **kwargs,
         )

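The rewritten `forward` swaps a hand-copied list of `TransformerBlock.forward` parameters for a pass-through, so the override no longer breaks when the parent signature changes; only the keywords it actually intercepts are named. A self-contained sketch of the pattern with toy classes (not the Megatron API):

```python
class Block:
    def forward(self, hidden_states, attention_mask, packed_seq_params=None, **kwargs):
        return hidden_states, packed_seq_params


class CustomBlock(Block):
    def forward(self, *args, packed_seq_params=None, extra_positional_embeddings=None, **kwargs):
        # Intercept one keyword, repack it, and forward everything else untouched.
        packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
        return super().forward(*args, packed_seq_params=packed_seq_params, **kwargs)


hidden, params = CustomBlock().forward("h", "mask", extra_positional_embeddings="pos")
assert params == {"abs_pos_embed": "pos"}
```

One caveat the diff inherits: callers that used to pass `packed_seq_params` positionally now land in `*args`, so the keyword-only contract must hold at every call site.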
@@ -361,7 +350,7 @@ def cosmos_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
     if "cu_seqlens" in _batch:
         raise ValueError("Packed sequence cu_seqlens not supported")

-    required_device_keys.update(("context", "abs_pos_embed"))
+    required_device_keys.update(("context", "abs_pos_embed", "action"))
     if parallel_state.is_pipeline_first_stage():
         required_device_keys.update(("tokens", "position_ids"))
     if parallel_state.is_pipeline_last_stage():
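
This hunk adds `action` to the set of batch keys moved to the accelerator, presumably for action-conditioned post-training. A toy sketch of the key-filtering idea (the helper name and batch are illustrative, not the repo's `cosmos_data_step`):

```python
import torch


def move_required_keys(batch: dict, required_keys: set, device: str = "cpu") -> dict:
    # Transfer only the tensors a pipeline stage actually needs; leave the rest on host.
    return {
        key: val.to(device) if key in required_keys and torch.is_tensor(val) else val
        for key, val in batch.items()
    }


batch = {"context": torch.zeros(2), "abs_pos_embed": torch.zeros(2), "action": torch.zeros(2)}
batch = move_required_keys(batch, {"context", "abs_pos_embed", "action"})
```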
@@ -398,38 +387,27 @@ def cosmos_forward_step(model, batch) -> torch.Tensor:


 @dataclass
-class CosmosConfigVideo2World5B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
+class CosmosVideo2WorldConfig:
     vocab_size: int = 64064
     output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
     seq_length: int = 12864
-    num_layers: int = 16
-    hidden_size: int = 4096
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 64
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
     latent_shape = [5, 40, 64]
     pad_to_multiple_of = 64
     forward_step_fn: Callable = cosmos_forward_step
     transformer_layer_spec = get_cosmos_video2world_spec()
     data_step_fn: Callable = cosmos_data_step
     attention_backend: AttnBackend = AttnBackend.flash
+    crossattn_emb_size: int = 1024
+    kv_channels: int = 128
+    training_type: str | None = "text_to_video"

     def configure_model(self, tokenizer) -> "MCoreGPTModel":
         self.transformer_layer_spec = get_cosmos_video2world_spec()
         model = super().configure_model(tokenizer)
         if self.rope_dim == "3D":
             model.rotary_pos_emb = RotaryEmbedding3D(
                 seq_len=self.seq_length,
-                training_type="text_to_video",
+                training_type=self.training_type,
                 pad_to_multiple_of=self.pad_to_multiple_of,
                 kv_channels=self.kv_channels,
                 max_position_embeddings=self.seq_length,
@@ -467,78 +445,13 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":


 @dataclass
-class CosmosConfigVideo2World13B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
-    vocab_size: int = 64064
-    output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
-    seq_length: int = 12864
-    num_layers: int = 40
-    hidden_size: int = 5120
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 128
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
-    original_latent_shape = [3, 40, 64]
-    apply_yarn: bool = True
-    yarn_beta_fast: int = 4
-    yarn_beta_slow: int = 1
-    yarn_scale: int = 2
-    original_seq_len = 8192
-    latent_shape = [5, 40, 64]
-    pad_to_multiple_of = 64
-    forward_step_fn: Callable = cosmos_forward_step
-    transformer_layer_spec = get_cosmos_video2world_spec()
-    data_step_fn: Callable = cosmos_data_step
-    attention_backend: AttnBackend = AttnBackend.flash
+class CosmosConfigVideo2World5B(CosmosVideo2WorldConfig, CosmosConfig4B):
+    make_vocab_size_divisible_by: int = 64

-    def configure_model(self, tokenizer) -> "MCoreGPTModel":
-        self.transformer_layer_spec = get_cosmos_video2world_spec()
-        model = super().configure_model(tokenizer)
-        if self.rope_dim == "3D":
-            model.rotary_pos_emb = RotaryEmbedding3D(
-                seq_len=self.seq_length,
-                training_type="text_to_video",
-                pad_to_multiple_of=self.pad_to_multiple_of,
-                kv_channels=self.kv_channels,
-                max_position_embeddings=self.seq_length,
-                original_max_position_embeddings=self.original_seq_len if hasattr(self, "original_seq_len") else None,
-                rotary_base=self.rotary_base,
-                apply_yarn=True if hasattr(self, "apply_yarn") else False,
-                scale=self.yarn_scale if hasattr(self, "yarn_scale") else None,
-                extrapolation_factor=1,
-                attn_factor=1,
-                beta_fast=self.yarn_beta_fast if hasattr(self, "yarn_beta_fast") else 32,
-                beta_slow=self.yarn_beta_slow if hasattr(self, "yarn_beta_slow") else 1,
-                latent_shape=self.latent_shape,
-                original_latent_shape=self.original_latent_shape if hasattr(self, "original_latent_shape") else None,
-            )
-        model.output_layer = tensor_parallel.ColumnParallelLinear(
-            self.hidden_size,
-            self.output_layer_vocab_size,
-            config=self,
-            init_method=self.init_method,
-            bias=False,
-            skip_bias_add=False,
-            gather_output=False,
-            skip_weight_param_allocation=False,
-            embedding_activation_buffer=None,
-            grad_output_buffer=None,
-        )

-        model.decoder = CosmosTransformerBlock(
-            config=self,
-            spec=self.transformer_layer_spec,
-            pre_process=model.pre_process,
-            post_process=model.post_process,
-        )
-        return model
+@dataclass
+class CosmosConfigVideo2World13B(CosmosVideo2WorldConfig, CosmosConfig12B):
+    make_vocab_size_divisible_by: int = 128


 class CosmosVideo2WorldModel(CosmosModel):
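
Net effect of the two hunks above: the 5B and 13B configs, which previously duplicated a few dozen fields and an entire `configure_model` override, collapse into a shared `CosmosVideo2WorldConfig` mixin layered over the base `CosmosConfig4B`/`CosmosConfig12B`. Because the mixin comes first in the bases list, its fields and methods win under Python's MRO. A minimal sketch of the same pattern with stand-in classes (not the repo's actual fields):

```python
from dataclasses import dataclass


@dataclass
class BaseConfig4B:  # stand-in for CosmosConfig4B
    num_layers: int = 16
    vocab_size: int = 32000

    def configure_model(self) -> str:
        return f"{self.num_layers} layers, vocab {self.vocab_size}"


@dataclass
class Video2WorldMixin:  # stand-in for CosmosVideo2WorldConfig
    vocab_size: int = 64064  # earlier in the MRO, so this default wins

    def configure_model(self) -> str:
        # Cooperative super(): resolves to BaseConfig4B in the subclass below.
        return "video2world: " + super().configure_model()


@dataclass
class Video2World5B(Video2WorldMixin, BaseConfig4B):
    make_vocab_size_divisible_by: int = 64


print(Video2World5B().configure_model())  # -> "video2world: 16 layers, vocab 64064"
```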
@@ -549,7 +462,9 @@ def __init__(
         tokenizer: Optional["TokenizerSpec"] = None,
         model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
     ):
-        super().__init__(config or CosmosConfig4B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform)
+        super().__init__(
+            config or CosmosConfigVideo2World5B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform
+        )
         self.config = config

     def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold) -> torch.Tensor:

cosmos1/models/autoregressive/nemo/inference/README.md

Lines changed: 4 additions & 13 deletions
@@ -106,9 +106,6 @@ Complete the following steps to run inference on the 4B model.
 cd /workspace/Cosmos
 git lfs pull $INPUT_DATA

-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
     --input_image_or_video_path $INPUT_DATA \
     --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -138,14 +135,10 @@ Complete the following steps to run inference on the 5B model.
 cd /workspace/Cosmos
 git lfs pull $INPUT_DATA

-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
     --input_type video \
     --input_image_or_video_path 'cosmos1/models/autoregressive/assets/v1p0/input.mp4' \
     --prompt "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
-    --disable_diffusion_decoder \
     --ar_model_dir nvidia/Cosmos-1.0-Autoregressive-5B-Video2World
 ```

@@ -160,6 +153,8 @@ Complete the following steps to generate a new output video using a post-trained
 1. Set the following environment variables:

    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
@@ -178,9 +173,6 @@ Complete the following steps to generate a new output video using a post-trained
 git lfs pull $INPUT_DATA

 # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
     --input_image_or_video_path $INPUT_DATA \
     --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -194,6 +186,8 @@ Complete the following steps to generate a new output video using a post-trained
 1. Set the following environment variables:

    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
@@ -213,9 +207,6 @@ Complete the following steps to generate a new output video using a post-trained
 git lfs pull $INPUT_DATA

 # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
     --input_image_or_video_path $INPUT_DATA \
     --video_save_name "Cosmos-1.0-Autoregressive-5B-Video2World.mp4" \

cosmos1/models/autoregressive/nemo/post_training/README.md

Lines changed: 4 additions & 0 deletions
@@ -101,6 +101,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 1. Set the following environment variables:

    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
@@ -144,6 +146,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 1. Set the following environment variables:

    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py

Lines changed: 15 additions & 14 deletions
@@ -16,41 +16,30 @@
 import json

 import torch
+from nemo.collections.llm.gpt.data.mock import MockDataModule
 from torch.utils.data import Dataset

 from cosmos1.models.autoregressive.modules.embedding import SinCosPosEmbAxisTE
+from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig

 TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
 DATA_RESOLUTION_SUPPORTED = [640, 1024]
 NUM_CONTEXT_FRAMES = 33
 BOV_TOKEN = 64000
 PAD_ID = 64002
-from nemo.collections.llm.gpt.data.mock import MockDataModule


 class CosmosVideo2WorldDataset(Dataset):
     def __init__(self, data_path, model_config, split="train"):
         self.data_path = data_path
         self.model_config = model_config
         self.split = split
-        self.abs_pos_emb = self._initialize_abs_pos_emb()
+        self.abs_pos_emb = get_abs_pos_embed(model_config, training_type="text_to_video")
         metadata_file = f"{self.data_path}/metadata.json"
         with open(metadata_file, "r") as f:
             metadata = json.load(f)
         self.metadata = metadata

-    def _initialize_abs_pos_emb(self):
-        pos_emb = SinCosPosEmbAxisTE(
-            self.model_config.hidden_size,
-            latent_shape=self.model_config.latent_shape,
-            pad_to_multiple_of=self.model_config.pad_to_multiple_of,
-            device="cpu",
-        )
-        training_type = "text_to_video"
-        abs_pos_emb = pos_emb.forward(training_type=training_type)
-        abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
-        return abs_pos_emb
-
     def __len__(self):
         return self.metadata[f"{self.split}_samples"]

@@ -90,6 +79,18 @@ def collate_fn(self, batch):
         return self._collate_fn(batch)


+def get_abs_pos_embed(model_config: CosmosConfig, training_type: str | None = "text_to_video"):
+    pos_emb = SinCosPosEmbAxisTE(
+        model_config.hidden_size,
+        latent_shape=model_config.latent_shape,
+        pad_to_multiple_of=model_config.pad_to_multiple_of,
+        device="cpu",
+    )
+    abs_pos_emb = pos_emb.forward(training_type=training_type)
+    abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
+    return abs_pos_emb
+
+
 class CosmosVideo2WorldDataModule(MockDataModule):
     def __init__(self, *args, **kwargs):
         data_path = kwargs["data_path"]
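
The private `_initialize_abs_pos_emb` method becomes the module-level `get_abs_pos_embed`, so other call sites can build the same positional embedding without instantiating the dataset. A hedged usage sketch (`DummyConfig` is a stand-in carrying only the three attributes the helper reads; it is not a repo class):

```python
from dataclasses import dataclass, field

from cosmos1.models.autoregressive.nemo.post_training.video2world_dataset import get_abs_pos_embed


@dataclass
class DummyConfig:
    hidden_size: int = 4096
    latent_shape: list = field(default_factory=lambda: [5, 40, 64])
    pad_to_multiple_of: int = 64


# Same tensor the dataset now caches in __init__; training_type defaults to "text_to_video".
abs_pos_emb = get_abs_pos_embed(DummyConfig())
```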
