
Commit 9f1820b

Merge pull request #1905 from AI-Hypercomputer:carlosbus/v6e_small_cluster_recipes_llama_3_1
PiperOrigin-RevId: 781565589
2 parents 3816409 + db60173 commit 9f1820b

File tree

3 files changed: +241 -2 lines changed

    benchmarks/maxtext_trillium_model_configs.py
    maxtext_jax_ai_image.Dockerfile
    requirements_with_jax_stable_stack_0_6_1_pipreqs.txt

benchmarks/maxtext_trillium_model_configs.py

Lines changed: 176 additions & 0 deletions
@@ -867,6 +867,51 @@
     ),
 )
 
+# Config for v6e-64
+llama3_1_8b_8192_bs5 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="llama3_1-8b-8192-bs5",
+        model_type="llama3.1-8b",
+        tuning_params={
+            "per_device_batch_size": 5,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "offload",
+            "out_proj": "offload",
+            "query_proj": "offload",
+            "key_proj": "offload",
+            "value_proj": "offload",
+            "max_target_length": 8192,
+            "attention": "flash",
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "enable_checkpointing": False,
+            "sa_block_q": 2048,
+            "sa_block_kv": 2048,
+            "sa_block_kv_compute": 2048,
+            "sa_block_q_dkv": 2048,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 2048,
+            "sa_block_q_dq": 2048,
+            "sa_block_kv_dq": 2048,
+            "sa_use_fused_bwd_kernel": True,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 5,
+        },
+        xla_flags=(
+            xla_flags_library.DENSE_VMEM_LIMIT_FLAG
+            + xla_flags_library.LAYOUT_FOR_ALL_REDUCE_SCATTER
+            + xla_flags_library.DATA_PARALLEL_OVERLAP
+            + xla_flags_library.CF_FOR_ALL_GATHER
+            + xla_flags_library.ENABLE_SPARSECORE_OFFLOADING_FOR_ALL_REDUCE
+            + xla_flags_library.HOST_OFFLOAD_FLAGS
+        ),
+    ),
+)
+
 
 llama3_1_8b_8192_no_collective_matmul = _add_to_model_dictionary(
     trillium_model_dict,
@@ -956,6 +1001,137 @@
     ),
 )
 
+# Config for v6e-64
+llama3_1_70b_8192_bs2 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="llama3_1-70b-8192-bs2",
+        model_type="llama3.1-70b",
+        tuning_params={
+            "per_device_batch_size": 2,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "offload",
+            "query_proj": "offload",
+            "key_proj": "offload",
+            "value_proj": "offload",
+            "max_target_length": 8192,
+            "attention": "flash",
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "enable_checkpointing": False,
+            "sa_block_q": 2048,
+            "sa_block_kv": 2048,
+            "sa_block_kv_compute": 2048,
+            "sa_block_q_dkv": 2048,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 2048,
+            "sa_block_q_dq": 2048,
+            "sa_block_kv_dq": 2048,
+            "sa_use_fused_bwd_kernel": True,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 5,
+        },
+        xla_flags=(
+            xla_flags_library.DENSE_VMEM_LIMIT_FLAG
+            + xla_flags_library.LAYOUT_FOR_ALL_REDUCE_SCATTER
+            + xla_flags_library.DATA_PARALLEL_OVERLAP
+            + xla_flags_library.CF_FOR_ALL_GATHER
+            + xla_flags_library.HOST_OFFLOAD_FLAGS
+        ),
+    ),
+)
+
+# Config for v6e-32
+llama3_1_70b_8192_bs2_bfloat16_no_collective_matmul = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="llama3_1-70b-8192-bs2-bfloat16-no-collective-matmul",
+        model_type="llama3.1-70b",
+        tuning_params={
+            "per_device_batch_size": 2,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "offload",
+            "query_proj": "offload",
+            "key_proj": "offload",
+            "value_proj": "offload",
+            "max_target_length": 8192,
+            "attention": "flash",
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "enable_checkpointing": False,
+            "sa_block_q": 2048,
+            "sa_block_kv": 2048,
+            "sa_block_kv_compute": 2048,
+            "sa_block_q_dkv": 2048,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 2048,
+            "sa_block_q_dq": 2048,
+            "sa_block_kv_dq": 2048,
+            "sa_use_fused_bwd_kernel": True,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 5,
+            "weight_dtype": "bfloat16",
+        },
+        xla_flags=(
+            xla_flags_library.DENSE_VMEM_LIMIT_FLAG
+            + xla_flags_library.LAYOUT_FOR_ALL_REDUCE_SCATTER
+            + xla_flags_library.DATA_PARALLEL_OVERLAP
+            + xla_flags_library.CF_FOR_ALL_GATHER
+            + xla_flags_library.HOST_OFFLOAD_FLAGS
+            + xla_flags_library.DISABLE_COLLECTIVE_MATMUL
+        ),
+    ),
+)
+
+# Config for v6e-128
+llama3_1_70b_8192_bs4 = _add_to_model_dictionary(
+    trillium_model_dict,
+    MaxTextModel(
+        model_name="llama3_1-70b-8192-bs4",
+        model_type="llama3.1-70b",
+        tuning_params={
+            "per_device_batch_size": 4,
+            "ici_fsdp_parallelism": -1,
+            "remat_policy": "custom",
+            "decoder_layer_input": "offload",
+            "query_proj": "offload",
+            "key_proj": "offload",
+            "value_proj": "offload",
+            "max_target_length": 8192,
+            "attention": "flash",
+            "use_iota_embed": True,
+            "dataset_path": "gs://max-datasets-rogue",
+            "dataset_type": "synthetic",
+            "enable_checkpointing": False,
+            "sa_block_q": 2048,
+            "sa_block_kv": 2048,
+            "sa_block_kv_compute": 2048,
+            "sa_block_q_dkv": 2048,
+            "sa_block_kv_dkv": 2048,
+            "sa_block_kv_dkv_compute": 2048,
+            "sa_block_q_dq": 2048,
+            "sa_block_kv_dq": 2048,
+            "sa_use_fused_bwd_kernel": True,
+            "profiler": "xplane",
+            "skip_first_n_steps_for_profiler": 10,
+            "profiler_steps": 5,
+        },
+        xla_flags=(
+            xla_flags_library.DENSE_VMEM_LIMIT_FLAG
+            + xla_flags_library.LAYOUT_FOR_ALL_REDUCE_SCATTER
+            + xla_flags_library.DATA_PARALLEL_OVERLAP
+            + xla_flags_library.CF_FOR_ALL_GATHER
+            + xla_flags_library.HOST_OFFLOAD_FLAGS
+        ),
+    ),
+)
+
 llama3_1_70b_8192_iter_synthetic = _add_to_model_dictionary(
     trillium_model_dict,
     MaxTextModel(
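
The four new recipes above follow the existing registration pattern in this file: each is a MaxTextModel added to trillium_model_dict via _add_to_model_dictionary, with the per-device batch size, remat/offload choices, and XLA flags tuned for the v6e-32/-64/-128 cluster size named in its comment. As a rough illustration only (not part of this commit), the sketch below shows how one of these registered recipes could be looked up and its tuning_params rendered as key=value overrides; the import path and the assumption that MaxTextModel exposes its constructor arguments as attributes are unverified here.

# Illustrative sketch, not part of this commit. Assumes it runs from the MaxText
# repo root, that trillium_model_dict is a plain dict of registered recipes, and
# that MaxTextModel exposes model_name/model_type/tuning_params as attributes.
from benchmarks import maxtext_trillium_model_configs as configs

def find_recipe(name: str):
    """Return the registered MaxTextModel whose model_name matches `name`."""
    for recipe in configs.trillium_model_dict.values():
        if recipe.model_name == name:
            return recipe
    raise KeyError(name)

recipe = find_recipe("llama3_1-70b-8192-bs2")  # the new v6e-64 70B recipe
# Render tuning_params as the key=value overrides a MaxText launch would pass.
overrides = [f"{key}={value}" for key, value in recipe.tuning_params.items()]
print(recipe.model_type, " ".join(overrides))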

maxtext_jax_ai_image.Dockerfile

Lines changed: 10 additions & 2 deletions
@@ -14,7 +14,7 @@ WORKDIR /deps
 
 # Copy setup files and dependency files separately for better caching
 COPY setup.sh ./
-COPY requirements.txt requirements_with_jax_ai_image.txt ./
+COPY requirements.txt requirements_with_jax_ai_image.txt requirements_with_jax_stable_stack_0_6_1_pipreqs.txt ./
 
 
 # For JAX AI tpu training images 0.4.37 AND 0.4.35
@@ -34,7 +34,15 @@ RUN if [ "$DEVICE" = "tpu" ] && ([ "$JAX_AI_IMAGE_BASEIMAGE" = "us-docker.pkg.de
 RUN apt-get update && apt-get install --yes && apt-get install --yes dnsutils
 # TODO(bvandermoon, parambole): Remove this when it's added to JAX AI Image
 RUN pip install google-cloud-monitoring
-RUN python3 -m pip install -r /deps/requirements_with_jax_ai_image.txt
+
+# Install requirements file that was generated with pipreqs for JSS 0.6.1 using:
+#   pipreqs --savepath requirements_with_jax_stable_stack_0_6_1_pipreqs.txt
+# Otherwise use general requirements_with_jax_ai_image.txt
+RUN if [ "$DEVICE" = "tpu" ] && [ "$JAX_STABLE_STACK_BASEIMAGE" = "us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.6.1-rev1" ]; then \
+        python3 -m pip install -r /deps/requirements_with_jax_stable_stack_0_6_1_pipreqs.txt; \
+    else \
+        python3 -m pip install -r /deps/requirements_with_jax_ai_image.txt; \
+    fi
 
 # Now copy the remaining code (source files that may change frequently)
 COPY . .
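
The new RUN step picks its pin file from the DEVICE and JAX_STABLE_STACK_BASEIMAGE values: the pipreqs-generated pins are installed only for the TPU image built on the JAX Stable Stack 0.6.1 base, and every other build keeps the general requirements_with_jax_ai_image.txt. A minimal sketch of that same mapping as a plain function follows (illustrative only, not part of this commit; only the image tag is taken from the Dockerfile).

# Illustrative sketch, not part of this commit: the requirements-file selection the
# new RUN step performs, written as a plain function so the mapping is easy to test
# outside Docker. The image tag is the one hard-coded in the Dockerfile.
JSS_0_6_1_TAG = "us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.6.1-rev1"

def pick_requirements(device: str, base_image: str) -> str:
    """Return the requirements file the conditional RUN step would pip-install."""
    if device == "tpu" and base_image == JSS_0_6_1_TAG:
        return "requirements_with_jax_stable_stack_0_6_1_pipreqs.txt"
    return "requirements_with_jax_ai_image.txt"

assert pick_requirements("tpu", JSS_0_6_1_TAG).endswith("_pipreqs.txt")
assert pick_requirements("gpu", JSS_0_6_1_TAG) == "requirements_with_jax_ai_image.txt"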

requirements_with_jax_stable_stack_0_6_1_pipreqs.txt

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+absl_py==2.2.2
+aqt==25.2.7
+benchmark_db_writer==1.0.0.dev20250610
+benchmark_db_writer.egg==info
+cloud_accelerator_diagnostics==0.1.1
+cloud_tpu_diagnostics==0.1.5
+datasets==3.6.0
+etils==1.12.2
+evaluate==0.4.4
+flax==0.10.6
+grain==0.2.10
+grpcio==1.72.0rc1
+huggingface_hub==0.33.0
+jax==0.6.0
+jaxlib==0.6.0 # Manually adding to ensure consistency in future
+jaxtyping==0.3.2
+jetstream==0.1.0
+jsonlines==4.0.0
+libtpu==0.0.15 # Manually adding to ensure consistency in future
+matplotlib==3.10.3
+ml_collections==1.1.0
+ml_dtypes==0.5.1
+ml_goodput_measurement==0.0.11
+nltk==3.9.1
+numpy==2.3.1
+omegaconf==2.3.0
+optax==0.2.5
+orbax==0.1.9
+pandas==2.3.0
+pathwaysutils==0.1.1
+Pillow==11.2.1
+protobuf==6.31.1
+psutil==7.0.0
+pytest==8.4.1
+PyYAML==6.0.2
+PyYAML==6.0.2
+Requests==2.32.4
+safetensors==0.5.3
+sentencepiece==0.2.0
+setuptools==80.9.0
+tabulate==0.9.0
+tensorboard_plugin_profile==2.13.0
+tensorboardX==2.6.2.2
+tensorboardX==2.6.4
+tensorflow==2.19.0
+tensorflow_datasets==4.9.9
+tensorflow_text==2.19.0
+tensorstore==0.1.75
+tiktoken==0.9.0
+torch==2.7.1
+tqdm==4.67.1
+transformer_engine==2.4.0
+transformers==4.52.4
+trl==0.19.0
+urllib3==2.5.0
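
These pins were generated with pipreqs (see the Dockerfile comment above). A small sanity check like the sketch below (illustrative only, not part of this commit) can compare them against what is actually installed in a built image; it assumes the pin file sits in the working directory and relies only on the standard library.

# Illustrative sketch, not part of this commit: report any pin that does not match
# the installed version. Assumes the pin file is in the working directory.
from importlib.metadata import PackageNotFoundError, version

with open("requirements_with_jax_stable_stack_0_6_1_pipreqs.txt") as pins:
    for line in pins:
        spec = line.split("#", 1)[0].strip()  # drop trailing comments
        if not spec or "==" not in spec:
            continue
        name, pinned = spec.split("==", 1)
        try:
            installed = version(name)
        except PackageNotFoundError:
            installed = "not installed"
        if installed != pinned:
            print(f"{name}: pinned {pinned}, found {installed}")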
