@@ -863,6 +863,60 @@ def _get_xattn_kv_prefill_mem_cfg(seq_len):
         )  # TODO: try out 3 for short axis and 4 for long axis (TG only) <- should work but untested in model
         self.ccl_dtype = ttnn.bfloat8_b

+        # Model-specific CCL configs and their defaults
+        default_ln_ag = {"num_links": 1, "chunks_per_sync": 10, "num_workers_per_link": 2}
+        default_agmm = {"num_links": 1, "chunks_per_sync": 10, "num_workers_per_link": 2}
+        default_mlp_rs = {
+            "num_links": self.num_reduce_scatter_links,
+            "chunks_per_sync": 10,
+            "num_workers_per_link": 2,
+            "rs_memory_config": ttnn.DRAM_MEMORY_CONFIG,
+        }
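+        # The default reduce-scatter output lives in DRAM; the tuned Galaxy entry below keeps it in L1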
+        default_sampling_force_argmax = {
+            "allow_force_argmax": False,
+            "num_links": 1,
+            "chunks_per_sync": 10,
+            "num_workers_per_link": 2,
+            "topology": ttnn.Topology.Linear,
+        }
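+        # Per-model overrides (ln_ag: layernorm all-gather, agmm: all-gather matmul, mlp_rs: MLP reduce-scatter)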
+        model_specific_ccl_configs = {
+            "Llama-3.1-8B": {
+                "attn_ln_ag": {"num_links": 4, "chunks_per_sync": 10, "num_workers_per_link": 1},
+                "ffn_ln_ag": {"num_links": 4, "chunks_per_sync": 25, "num_workers_per_link": 1},
+                "attn_agmm": {"num_links": 4, "chunks_per_sync": 1, "num_workers_per_link": 1},
+                "mlp_rs": {
+                    "num_links": 4,
+                    "chunks_per_sync": 1,
+                    "num_workers_per_link": 1,
+                    "rs_memory_config": ttnn.L1_MEMORY_CONFIG,
+                },
+                "sampling_force_argmax": {
+                    "allow_force_argmax": True,
+                    "num_links": 4,
+                    "chunks_per_sync": 10,
+                    "num_workers_per_link": 2,
+                    "topology": ttnn.Topology.Ring,
+                },
+            }
+        }
+        # Model-specific CCL configs are tuned for Galaxy (TG) with 4 links.
+        # Only apply them on Galaxy; otherwise use the defaults.
+        executed_on_galaxy = ttnn.cluster.get_cluster_type() == ttnn.cluster.ClusterType.GALAXY
+        if executed_on_galaxy and self.base_model_name in model_specific_ccl_configs:
+            self.model_config["ATTN_LN_AG_CONFIG"] = model_specific_ccl_configs[self.base_model_name]["attn_ln_ag"]
+            self.model_config["FFN_LN_AG_CONFIG"] = model_specific_ccl_configs[self.base_model_name]["ffn_ln_ag"]
+            self.model_config["ATTN_AGMM_CONFIG"] = model_specific_ccl_configs[self.base_model_name]["attn_agmm"]
+            self.model_config["MLP_RS_CONFIG"] = model_specific_ccl_configs[self.base_model_name]["mlp_rs"]
+            self.model_config["SAMPLING_AG_CONFIG"] = model_specific_ccl_configs[self.base_model_name][
+                "sampling_force_argmax"
+            ]
+        else:
+            self.model_config["ATTN_LN_AG_CONFIG"] = default_ln_ag
+            self.model_config["FFN_LN_AG_CONFIG"] = default_ln_ag
+            self.model_config["ATTN_AGMM_CONFIG"] = default_agmm
+            self.model_config["MLP_RS_CONFIG"] = default_mlp_rs
+            self.model_config["SAMPLING_AG_CONFIG"] = default_sampling_force_argmax
+
         logger.info(f"Attention grid: {attn_input_grid}")
         logger.info(f"MLP grid: {mlp_core_grid}")
         logger.info(f"MLP prefill grids @ 32: w1/w3: {mlp1_3_grid(32)}, w2: {mlp2_grid(32)}")
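
Read in isolation, the gating above is just a dict lookup with a fallback. A minimal off-device sketch of that pattern (pick_ccl_configs is a hypothetical helper name, not part of this change):

    # Sketch of the fallback pattern in the hunk above; pick_ccl_configs is
    # a hypothetical helper, not part of the actual change.
    def pick_ccl_configs(base_model_name, on_galaxy, tuned_configs, defaults):
        # Tuned per-model CCL configs only apply on Galaxy; any other
        # cluster, or a model without a tuned entry, gets the defaults.
        if on_galaxy and base_model_name in tuned_configs:
            return tuned_configs[base_model_name]
        return defaults

    # Example: off-Galaxy, even a tuned model name resolves to the defaults.
    defaults = {"attn_ln_ag": {"num_links": 1, "chunks_per_sync": 10, "num_workers_per_link": 2}}
    tuned = {"Llama-3.1-8B": {"attn_ln_ag": {"num_links": 4, "chunks_per_sync": 10, "num_workers_per_link": 1}}}
    assert pick_ccl_configs("Llama-3.1-8B", on_galaxy=False, tuned_configs=tuned, defaults=defaults) == defaults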