Re-sync with internal repository #18

Merged
merged 1 commit on Oct 25, 2024
32 changes: 32 additions & 0 deletions .ci/tritonbench/install-triton-main.sh
@@ -0,0 +1,32 @@
#!/bin/bash

if [ -z "${BASE_CONDA_ENV}" ]; then
echo "ERROR: BASE_CONDA_ENV is not set"
exit 1
fi

if [ -z "${CONDA_ENV}" ]; then
echo "ERROR: CONDA_ENV is not set"
exit 1
fi

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
fi

CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
conda activate "${BASE_CONDA_ENV}"
# Remove the conda env if it already exists
conda remove --name "${CONDA_ENV}" -y --all || true
conda create --name "${CONDA_ENV}" -y --clone "${BASE_CONDA_ENV}"
conda activate "${CONDA_ENV}"

. "${SETUP_SCRIPT}"

# Build and install Triton from source
cd /workspace
git clone https://github.com/triton-lang/triton.git
cd /workspace/triton
pip install ninja cmake wheel pybind11  # build-time dependencies
pip install -e python
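
The script above requires BASE_CONDA_ENV, CONDA_ENV, and SETUP_SCRIPT to be exported by the caller. A minimal invocation sketch; the two env names below are hypothetical placeholders, and the SETUP_SCRIPT path mirrors the one used by the Docker workflow further down:

export BASE_CONDA_ENV=pytorch          # hypothetical: an existing env to clone from
export CONDA_ENV=triton-main           # hypothetical: name of the env to (re)create
export SETUP_SCRIPT=/workspace/setup_instance.sh
bash .ci/tritonbench/install-triton-main.sh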
14 changes: 14 additions & 0 deletions .ci/tritonbench/install.sh
@@ -0,0 +1,14 @@
#!/bin/bash

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
fi

. "${SETUP_SCRIPT}"

tritonbench_dir=$(dirname "$(readlink -f "$0")")/../..
cd "${tritonbench_dir}"

# Install Tritonbench and all of its custom packages
python install.py --all
14 changes: 14 additions & 0 deletions .ci/tritonbench/test-install.sh
@@ -0,0 +1,14 @@
#!/bin/bash

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
fi

. "${SETUP_SCRIPT}"

tritonbench_dir=$(dirname "$(readlink -f "$0")")/../..
cd "${tritonbench_dir}"

# Install Tritonbench and all of its custom packages, with install-time tests
python install.py --all --test
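
install.sh and test-install.sh share the same contract: SETUP_SCRIPT is the only required variable, and each delegates to install.py. A hedged sketch of invoking them by hand, with the SETUP_SCRIPT path assumed from the Docker workflow below:

export SETUP_SCRIPT=/workspace/setup_instance.sh
bash .ci/tritonbench/install.sh        # install Tritonbench and its packages
bash .ci/tritonbench/test-install.sh   # same install, plus install-time tests (--test)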
28 changes: 28 additions & 0 deletions .ci/tritonbench/test-operators.sh
@@ -0,0 +1,28 @@
#!/bin/bash
set -x

if [ -z "${SETUP_SCRIPT}" ]; then
echo "ERROR: SETUP_SCRIPT is not set"
exit 1
fi

. "${SETUP_SCRIPT}"

# Test Tritonbench operators
# TODO: test every operator, fwd+bwd
python run.py --op launch_latency --mode fwd --num-inputs 1 --test-only
python run.py --op addmm --mode fwd --num-inputs 1 --test-only
python run.py --op gemm --mode fwd --num-inputs 1 --test-only
python run.py --op sum --mode fwd --num-inputs 1 --test-only
python run.py --op softmax --mode fwd --num-inputs 1 --test-only
python run.py --op layer_norm --mode fwd --num-inputs 1 --test-only


# Segfault
# python run.py --op flash_attention --mode fwd --num-inputs 1 --test-only

# CUDA OOM
# python run.py --op jagged_layer_norm --mode fwd --num-inputs 1 --test-only
# python run.py --op jagged_mean --mode fwd --num-inputs 1 --test-only
# python run.py --op jagged_softmax --mode fwd --num-inputs 1 --test-only
# python run.py --op jagged_sum --mode fwd --num-inputs 1 --test-only
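
On the TODO above: backward passes are not covered yet. Assuming the runner accepts backward modes by analogy with --mode fwd (the mode names here are an assumption; verify against python run.py --help), the follow-up smoke tests might look like:

# Assumed mode names, extrapolated from "--mode fwd" above
python run.py --op gemm --mode bwd --num-inputs 1 --test-only
python run.py --op gemm --mode fwd_bwd --num-inputs 1 --test-only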
60 changes: 60 additions & 0 deletions .github/workflows/docker.yaml
@@ -0,0 +1,60 @@
name: TritonBench Nightly Docker Build
on:
  schedule:
    # Build and push the nightly Docker image daily at 3 PM UTC
    - cron: '0 15 * * *'
  pull_request:
    paths:
      - .github/workflows/docker.yaml
      - docker/*.dockerfile
  workflow_dispatch:
    inputs:
      nightly_date:
        description: "PyTorch nightly version"
        required: false
env:
  CONDA_ENV: "tritonbench"
  DOCKER_IMAGE: "ghcr.io/pytorch-labs/tritonbench:latest"
  SETUP_SCRIPT: "/workspace/setup_instance.sh"

jobs:
  build-push-docker:
    if: ${{ github.repository_owner == 'pytorch-labs' }}
    runs-on: 32-core-ubuntu
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          path: tritonbench
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: pytorch-labs
          password: ${{ secrets.TRITONBENCH_ACCESS_TOKEN }}
      - name: Build TritonBench nightly docker
        run: |
          set -x
          export NIGHTLY_DATE="${{ github.event.inputs.nightly_date }}"
          cd tritonbench/docker
          # The branch name is github.head_ref when triggered by pull_request,
          # and github.ref_name when triggered by workflow_dispatch
          branch_name=${{ github.head_ref || github.ref_name }}
          docker build . --build-arg TRITONBENCH_BRANCH="${branch_name}" --build-arg FORCE_DATE="${NIGHTLY_DATE}" \
            -f tritonbench-nightly.dockerfile -t ghcr.io/pytorch-labs/tritonbench:latest
          # Extract the PyTorch version from the Docker image
          PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"')
          export DOCKER_TAG=$(awk '{match($0, /dev[0-9]+/, arr); print arr[0]}' <<< "${PYTORCH_VERSION}")
          docker tag ghcr.io/pytorch-labs/tritonbench:latest ghcr.io/pytorch-labs/tritonbench:${DOCKER_TAG}
      - name: Push docker to remote
        if: github.event_name != 'pull_request'
        run: |
          # Extract the PyTorch version from the Docker image
          PYTORCH_VERSION=$(docker run -e SETUP_SCRIPT="${SETUP_SCRIPT}" ghcr.io/pytorch-labs/tritonbench:latest bash -c '. "${SETUP_SCRIPT}"; python -c "import torch; print(torch.__version__)"')
          export DOCKER_TAG=$(awk '{match($0, /dev[0-9]+/, arr); print arr[0]}' <<< "${PYTORCH_VERSION}")
          docker push ghcr.io/pytorch-labs/tritonbench:${DOCKER_TAG}
          docker push ghcr.io/pytorch-labs/tritonbench:latest
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
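
One portability note on the DOCKER_TAG line: the three-argument match() is a GNU awk extension, so the tag extraction would fail under mawk or BusyBox awk. A quick sketch of what it produces; the version string below is a made-up example of a PyTorch nightly build:

PYTORCH_VERSION="2.6.0.dev20241025+cu124"   # hypothetical nightly version string
awk '{match($0, /dev[0-9]+/, arr); print arr[0]}' <<< "${PYTORCH_VERSION}"
# prints: dev20241025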
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
__pycache__/
.data/
.DS_Store
18 changes: 18 additions & 0 deletions .gitmodules
@@ -0,0 +1,18 @@
[submodule "submodules/FBGEMM"]
path = submodules/FBGEMM
url = https://github.com/pytorch/FBGEMM.git
[submodule "submodules/cutlass-kernels"]
path = submodules/cutlass-kernels
url = https://github.com/ColfaxResearch/cutlass-kernels.git
[submodule "submodules/flash-attention"]
path = submodules/flash-attention
url = https://github.com/Dao-AILab/flash-attention.git
[submodule "submodules/generative-recommenders"]
path = submodules/generative-recommenders
url = https://github.com/facebookresearch/generative-recommenders.git
[submodule "submodules/kernels"]
path = submodules/kernels
url = https://github.com/triton-lang/kernels.git
[submodule "submodules/ThunderKittens"]
path = submodules/ThunderKittens
url = https://github.com/HazyResearch/ThunderKittens.git
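
Since the benchmarks build against these submodules, a fresh checkout needs them initialized. Standard git commands; the repository URL here is inferred from the ghcr.io image name above:

git clone https://github.com/pytorch-labs/tritonbench.git
cd tritonbench
git submodule update --init --recursive   # fetches FBGEMM, flash-attention, etc.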
18 changes: 16 additions & 2 deletions run.py
@@ -1,3 +1,10 @@
"""
Tritonbench benchmark runner.

Note: run `python install.py` first, or otherwise make sure the benchmark you are going
to run has been installed. This script intentionally does not automate or enforce setup steps.
"""

import argparse
import os
import sys
@@ -211,9 +218,16 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorResult:
from .fb.utils import log_benchmark

if "hardware" in args:
log_benchmark(metrics, args.op, args.hardware)
log_benchmark(
metrics=metrics,
bencmark_name=args.op,
device=args.device,
hardware=args.hardware,
)
else:
log_benchmark(metrics, args.op)
log_benchmark(
metrics=metrics, bencmark_name=args.op, device=args.device
)
if args.plot:
try:
opbench.plot()
1 change: 1 addition & 0 deletions submodules/FBGEMM
Submodule FBGEMM added at ef7ec6
1 change: 1 addition & 0 deletions submodules/ThunderKittens
Submodule ThunderKittens added at 5d7107
1 change: 1 addition & 0 deletions submodules/cutlass-kernels
Submodule cutlass-kernels added at 84f080
1 change: 1 addition & 0 deletions submodules/flash-attention
Submodule flash-attention added at bedf87
1 change: 1 addition & 0 deletions submodules/generative-recommenders
1 change: 1 addition & 0 deletions submodules/kernels
Submodule kernels added at 8821ef
1 change: 1 addition & 0 deletions tritonbench/operators/embedding/__init__.py
@@ -0,0 +1 @@
from .operator import Operator
53 changes: 53 additions & 0 deletions tritonbench/operators/embedding/operator.py
@@ -0,0 +1,53 @@
import argparse
from typing import Callable, Generator, List, Optional

import torch
from torch.nn import Embedding

from tritonbench.utils.triton_op import BenchmarkOperator, register_benchmark

try:
    from liger_kernel.transformers.experimental.embedding import LigerEmbedding
except ModuleNotFoundError:
    LigerEmbedding = None

# Reference: https://github.com/linkedin/Liger-Kernel/
# blob/main/benchmark/scripts/benchmark_embedding.py


class Operator(BenchmarkOperator):
    def __init__(
        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
    ):
        super().__init__(tb_args, extra_args)
        # The op instances are created lazily by the benchmark methods below
        self.baseline_op = None
        self.liger_op = None
        self.use_cuda_graphs = False

    def get_input_iter(self) -> Generator:
        for B, T, D in [(32, 512, 768), (8, 2048, 4096)]:
            for V in [2**i for i in range(10, 18)]:
                _input = torch.randint(0, V, (B, T), device=self.device)
                yield V, D, _input

    @register_benchmark(baseline=True)
    def torch_embedding(self, V, D, input) -> Callable:
        self.baseline_op = Embedding(V, D).to(self.device).to(self.dtype)
        return lambda: self.baseline_op(input)

    @register_benchmark()
    def liger_embedding(self, V, D, input) -> Callable:
        self.liger_op = LigerEmbedding(V, D).to(self.device).to(self.dtype)
        return lambda: self.liger_op(input)

    @register_benchmark()
    def inductor_embedding(self, V, D, input) -> Callable:
        self.baseline_op = Embedding(V, D).to(self.device).to(self.dtype)
        compiled = torch.compile(self.baseline_op, dynamic=False)
        return lambda: compiled(input)

    def get_bwd_fn(self, fwd_fn: Callable) -> Callable:
        y = fwd_fn()
        do = torch.randn_like(y)
        # retain_graph so the returned lambda can be invoked more than once
        return lambda: y.backward(do, retain_graph=True)
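
A sketch of smoke-testing the new operator through the runner, following the same pattern as test-operators.sh; note the liger_embedding backend additionally needs the optional liger_kernel package, per the guarded import above:

python run.py --op embedding --mode fwd --num-inputs 1 --test-only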
1 change: 1 addition & 0 deletions tritonbench/operators/fused_linear_jsd/__init__.py
@@ -0,0 +1 @@
from .operator import Operator