
Commit e3e964b

Merge remote-tracking branch 'origin/main' into bnorris/ci-refactor

2 parents: a3a3ce2 + 021f180

15 files changed: +866, -26 lines

.github/containers/CONTAINER_README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ tt-lang examples are available in `$TTMLIR_TOOLCHAIN_DIR/examples`.
 
 Try running an example:
 ```bash
-python $TTMLIR_TOOLCHAIN_DIR/examples/demo_one.py
+python $TTMLIR_TOOLCHAIN_DIR/examples/tutorial/multicore_grid_auto.py
 ```
 
 ## Available Tools
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+# SPDX-License-Identifier: Apache-2.0
+#
+# tt-lang environment activation for installed location
+# This script is used when tt-lang is installed via cmake --install
+
+# Guard against double activation
+if [ "${TTLANG_ENV_ACTIVATED:-0}" = "1" ]; then
+    return 0 2>/dev/null || exit 0
+fi
+
+# Determine the install prefix (parent of env/)
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_PREFIX="$(dirname "$SCRIPT_DIR")"
+
+# Default TTMLIR_TOOLCHAIN_DIR if not set (assume same as install prefix for Docker)
+: ${TTMLIR_TOOLCHAIN_DIR:=$INSTALL_PREFIX}
+export TTMLIR_TOOLCHAIN_DIR
+
+# Activate tt-mlir toolchain venv
+if [ -f "${TTMLIR_TOOLCHAIN_DIR}/venv/bin/activate" ]; then
+    . "${TTMLIR_TOOLCHAIN_DIR}/venv/bin/activate"
+fi
+
+# Set paths for installed tt-lang
+export TT_LANG_HOME="$INSTALL_PREFIX"
+export PATH="${INSTALL_PREFIX}/bin:${TTMLIR_TOOLCHAIN_DIR}/bin:$PATH"
+export PYTHONPATH="${INSTALL_PREFIX}/python_packages:${TTMLIR_TOOLCHAIN_DIR}/python_packages:${TTMLIR_TOOLCHAIN_DIR}/python_packages/ttrt/runtime/ttnn:$PYTHONPATH"
+export LD_LIBRARY_PATH="${TTMLIR_TOOLCHAIN_DIR}/lib:$LD_LIBRARY_PATH"
+
+# Set TT_METAL_RUNTIME_ROOT
+export TT_METAL_RUNTIME_ROOT="${TTMLIR_TOOLCHAIN_DIR}/tt-metal"
+export TT_METAL_HOME="$TT_METAL_RUNTIME_ROOT"
+
+export TTLANG_ENV_ACTIVATED=1
+
+cat << 'EOF'
+
+████████╗████████╗ ██╗ █████╗ ███╗ ██╗ ██████╗
+╚══██╔══╝╚══██╔══╝ ██║ ██╔══██╗ ████╗ ██║ ██╔════╝
+██║ ██║ █████╗ ██║ ███████║ ██╔██╗ ██║ ██║ ███╗
+██║ ██║ ╚════╝ ██║ ██╔══██║ ██║╚██╗██║ ██║ ██║
+██║ ██║ ███████╗██║ ██║ ██║ ╚████║ ╚██████╔╝
+╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ ╚═╝ ╚═══╝ ╚═════╝
+EOF
+echo ""
+echo " Toolchain: ${TTMLIR_TOOLCHAIN_DIR}"
+echo " Examples: ${TTMLIR_TOOLCHAIN_DIR}/examples"
+echo ""
+echo " Run an example on:"
+echo " - Python simulator: ttlang-sim $TTMLIR_TOOLCHAIN_DIR/examples/tutorial/multicore_grid_auto.py"
+echo " - TT hardware: python $TTMLIR_TOOLCHAIN_DIR/examples/tutorial/multicore_grid_auto.py"

.github/containers/test-docker-smoke.sh

Lines changed: 2 additions & 2 deletions
@@ -49,7 +49,7 @@ if [ -e /dev/tenstorrent/0 ]; then
        --device=/dev/tenstorrent/0 \
        -v /dev/hugepages:/dev/hugepages \
        -v /dev/hugepages-1G:/dev/hugepages-1G \
-        tt-lang-user-ubuntu-22-04:latest python /opt/ttmlir-toolchain/examples/demo_one.py
+        tt-lang-dist-ubuntu-22-04:latest python /opt/ttmlir-toolchain/examples/tutorial/multicore_grid_auto.py
 else
     echo "Test 3: SKIPPED (no hardware)"
 fi
@@ -64,7 +64,7 @@ echo ""
 # Test 5: Examples in /root
 echo "Test 5: Examples in /root"
 run_test "Examples in /root" "Examples missing" \
-    sudo docker run --rm tt-lang-user-ubuntu-22-04:latest ls /root/examples/demo_one.py
+    sudo docker run --rm tt-lang-dist-ubuntu-22-04:latest ls /root/examples/tutorial/multicore_grid_auto.py
 echo ""
 
 echo "=== Smoke Test Complete ==="

.github/workflows/call-build-ttmlir-toolchain.yml

Lines changed: 12 additions & 6 deletions
@@ -56,24 +56,31 @@ jobs:
   migrate-cache:
     name: Migrate toolchain cache to LLVM-only
     runs-on: ubuntu-latest
-    if: true # TODO: Set to false or remove after running once
+    if: true # TODO: Delete old cache first, then run once, then set to false
+
+    env:
+      TTMLIR_TOOLCHAIN_DIR: ${{ github.workspace }}/ttmlir-toolchain
+
     steps:
-      - name: Checkout
+      - name: Checkout current branch
         uses: actions/checkout@v4
 
       - name: Determine tt-mlir commit
        id: commit
        run: |
          COMMIT=$(cat third-party/tt-mlir.commit | tr -d '[:space:]')
          echo "commit=$COMMIT" >> $GITHUB_OUTPUT
+          echo "Using commit: $COMMIT"
 
       - name: Restore full toolchain cache (from main branch format)
        id: restore-cache
        uses: actions/cache/restore@v4
        with:
-          path: ttmlir-toolchain
+          # Main branch saves with absolute path, so restore with same
+          path: ${{ env.TTMLIR_TOOLCHAIN_DIR }}
          key: Linux-ttlang-ttmlir-toolchain-${{ steps.commit.outputs.commit }}-v2
          restore-keys: |
+            Linux-ttlang-ttmlir-toolchain-${{ steps.commit.outputs.commit }}
            Linux-ttlang-ttmlir-toolchain-
 
       - name: Check cache was restored
@@ -85,9 +92,6 @@ jobs:
          fi
          echo "Cache restored from: ${{ steps.restore-cache.outputs.cache-matched-key }}"
 
-      - name: Cleanup toolchain (create stubs)
-        run: .github/containers/cleanup-toolchain.sh ttmlir-toolchain
-
       - name: Save as LLVM-only cache
        uses: actions/cache/save@v4
        with:
@@ -97,6 +101,8 @@ jobs:
   # First job: Check if caches exist (runs on standard runner)
   check-cache:
     name: Check toolchain cache
+    needs: migrate-cache
+    if: always() # Run even if migrate-cache is skipped
     runs-on: ubuntu-latest
     timeout-minutes: 10
 
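Note: read together, these hunks have migrate-cache restore the toolchain cache that main saved under its absolute workspace path, then re-save it under the LLVM-only key, with the cleanup-stubs step dropped; check-cache now declares needs: migrate-cache, and its if: always() guard keeps the cache check running even once the one-time migration job is disabled or skipped.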

README.md

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ export TT_METAL_HOME=/workspace/tt-mlir/third_party/tt-metal/src/tt-metal
 export TT_METAL_DEVICE_PROFILER=1
 export TT_METAL_PROFILER_MID_RUN_DUMP=1
 export TTLANG_AUTO_PROFILE=1
-python examples/demo_one.py
+python examples/tutorial/multicore_grid_auto.py
 ```
 
 See [docs/auto-profiler-examples/](https://github.com/tenstorrent/tt-lang/tree/main/docs/auto-profiler-examples) for sample profile outputs showing the per-line cycle breakdown format.

examples/broadcast.py

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@
 import torch
 
 
-def from_torch(t):
+def from_torch(tensor: ttnn.Tensor):
     return ttnn.from_torch(
-        t,
+        tensor,
         dtype=ttnn.bfloat16,
         layout=ttnn.TILE_LAYOUT,
         device=device,

examples/general_broadcast.py

Lines changed: 2 additions & 2 deletions
@@ -5,9 +5,9 @@
 import torch
 
 
-def from_torch(t):
+def from_torch(tensor: ttnn.Tensor):
     return ttnn.from_torch(
-        t,
+        tensor,
         dtype=ttnn.bfloat16,
         layout=ttnn.TILE_LAYOUT,
         device=device,

examples/tutorial/multicore.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+import ttnn
+import torch
+
+
+def from_torch(tensor: ttnn.Tensor):
+    return ttnn.from_torch(
+        tensor,
+        dtype=ttnn.bfloat16,
+        layout=ttnn.TILE_LAYOUT,
+        device=device,
+        memory_config=ttnn.DRAM_MEMORY_CONFIG,
+    )
+
+
+import ttl
+
+TILE_SIZE = 32
+GRANULARITY = 4
+
+
+@ttl.kernel(grid=(4, 4))
+def __demo_kernel(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor, y: ttnn.Tensor):
+    row_tiles_per_block = GRANULARITY
+    col_tiles_per_block = GRANULARITY
+
+    grid_cols, grid_rows = ttl.grid_size(dims=2)
+
+    rows_per_core = a.shape[0] // TILE_SIZE // row_tiles_per_block // grid_rows
+    cols_per_core = a.shape[1] // TILE_SIZE // col_tiles_per_block // grid_rows
+
+    a_cb = ttl.make_circular_buffer_like(
+        a, shape=(row_tiles_per_block, col_tiles_per_block), buffer_factor=2
+    )
+    b_cb = ttl.make_circular_buffer_like(
+        b, shape=(row_tiles_per_block, col_tiles_per_block), buffer_factor=2
+    )
+    c_cb = ttl.make_circular_buffer_like(
+        c, shape=(row_tiles_per_block, col_tiles_per_block), buffer_factor=2
+    )
+    y_cb = ttl.make_circular_buffer_like(
+        y, shape=(row_tiles_per_block, col_tiles_per_block), buffer_factor=2
+    )
+
+    @ttl.compute()
+    def demo_compute():
+        for _ in range(rows_per_core):
+            for _ in range(cols_per_core):
+                with (
+                    a_cb.wait() as a_blk,
+                    b_cb.wait() as b_blk,
+                    c_cb.wait() as c_blk,
+                    y_cb.reserve() as y_blk,
+                ):
+                    y_blk.store(a_blk * b_blk + c_blk)
+
+    @ttl.datamovement()
+    def demo_read():
+        core_col, core_row = ttl.core(dims=2)
+
+        for local_row in range(rows_per_core):
+            row = core_row * rows_per_core + local_row
+            start_row_tile = row * row_tiles_per_block
+            end_row_tile = (row + 1) * row_tiles_per_block
+
+            for local_col in range(cols_per_core):
+                col = core_col * cols_per_core + local_col
+                start_col_tile = col * col_tiles_per_block
+                end_col_tile = (col + 1) * col_tiles_per_block
+
+                with (
+                    a_cb.reserve() as a_blk,
+                    b_cb.reserve() as b_blk,
+                    c_cb.reserve() as c_blk,
+                ):
+                    tx_a = ttl.copy(
+                        a[
+                            start_row_tile:end_row_tile,
+                            start_col_tile:end_col_tile,
+                        ],
+                        a_blk,
+                    )
+                    tx_b = ttl.copy(
+                        b[
+                            start_row_tile:end_row_tile,
+                            start_col_tile:end_col_tile,
+                        ],
+                        b_blk,
+                    )
+                    tx_c = ttl.copy(
+                        c[
+                            start_row_tile:end_row_tile,
+                            start_col_tile:end_col_tile,
+                        ],
+                        c_blk,
+                    )
+
+                    tx_a.wait()
+                    tx_b.wait()
+                    tx_c.wait()
+
+    @ttl.datamovement()
+    def demo_write():
+        core_col, core_row = ttl.core(dims=2)
+
+        for local_row in range(rows_per_core):
+            row = core_row * rows_per_core + local_row
+            start_row_tile = row * row_tiles_per_block
+            end_row_tile = (row + 1) * row_tiles_per_block
+
+            for local_col in range(cols_per_core):
+                col = core_col * cols_per_core + local_col
+                start_col_tile = col * col_tiles_per_block
+                end_col_tile = (col + 1) * col_tiles_per_block
+
+                with y_cb.wait() as y_blk:
+                    tx = ttl.copy(
+                        y_blk,
+                        y[
+                            start_row_tile:end_row_tile,
+                            start_col_tile:end_col_tile,
+                        ],
+                    )
+                    tx.wait()
+
+
+def demo_kernel(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
+    y = from_torch(torch.zeros((a.shape[0], a.shape[1]), dtype=torch.bfloat16))
+    __demo_kernel(a, b, c, y)
+    return y
+
+
+torch.manual_seed(42)
+
+device = ttnn.open_device(device_id=0)
+
+try:
+    shape = (2048, 2048)
+
+    a = torch.rand(shape, dtype=torch.bfloat16)
+    b = torch.rand(shape, dtype=torch.bfloat16)
+    c = torch.rand(shape, dtype=torch.bfloat16)
+    d = torch.rand(shape, dtype=torch.bfloat16)
+
+    expected_y = (a * b + c) * d
+
+    a = from_torch(a)
+    b = from_torch(b)
+    c = from_torch(c)
+    d = from_torch(d)
+
+    y = ttnn.multiply(demo_kernel(a, b, c), d)
+
+    y = ttnn.to_torch(y)
+    print(y)
+    print(expected_y)
+
+    assert torch.allclose(y, expected_y, rtol=1e-2, atol=1e-2), "Tensors do not match"
+
+finally:
+    ttnn.close_device(device)
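Note on the tiling arithmetic above: with shape = (2048, 2048) and TILE_SIZE = 32, each tensor spans 64 x 64 tiles; GRANULARITY = 4 groups these into 16 x 16 blocks, and the 4 x 4 grid leaves rows_per_core = cols_per_core = 16 / 4 = 4, so every core reads, computes, and writes 16 blocks of 4 x 4 tiles. (As committed, cols_per_core divides by grid_rows rather than grid_cols; on the square 4 x 4 grid used here the two coincide.)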
Lines changed: 4 additions & 4 deletions
@@ -5,9 +5,9 @@
 import torch
 
 
-def from_torch(t):
+def from_torch(tensor: ttnn.Tensor):
     return ttnn.from_torch(
-        t,
+        tensor,
         dtype=ttnn.bfloat16,
         layout=ttnn.TILE_LAYOUT,
         device=device,
@@ -22,7 +22,7 @@ def from_torch(t):
 
 
 @ttl.kernel(grid="auto")
-def __demo_kernel(a, b, c, y):
+def __demo_kernel(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor, y: ttnn.Tensor):
     row_tiles_per_block = GRANULARITY
     col_tiles_per_block = GRANULARITY
 
@@ -139,7 +139,7 @@ def demo_write():
                    tx.wait()
 
 
-def demo_kernel(a, b, c):
+def demo_kernel(a: ttnn.Tensor, b: ttnn.Tensor, c: ttnn.Tensor):
     y = from_torch(torch.zeros((a.shape[0], a.shape[1]), dtype=torch.bfloat16))
     __demo_kernel(a, b, c, y)
     return y
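Note: the diff header for this file omits its name; given the @ttl.kernel(grid="auto") decorator it is presumably examples/tutorial/multicore_grid_auto.py, the example referenced throughout this commit. It appears to differ from multicore.py above chiefly in leaving grid selection to tt-lang rather than fixing a 4 x 4 grid; the hunks here add the same type annotations to its signatures.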
