
Commit 9793df5

cudnn frontend v1.13.0 (#150)
cudnn frontend v1.13 is the preferred cudnn frontend version for [cudnn version 9.11.0](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html#cudnn-9-11-0) and above. This release:

- Introduces a device descriptor, which allows device-less compilation of a cudnn graph for a target GPU. See the newly added [sample](samples/cpp/misc/deviceless_aot_compilation.cpp) and documentation.
- Introduces `generate_stats` as an alias for `is_inference`; `generate_stats` now controls the stats tensor dump, and `is_inference` is deprecated (a sketch follows this list).
- Improves support checks for left and right diagonal bands in conjunction with the diagonal alignment.
- Improves error handling for large head dimensions (d > 128) in SDPA bprop.
- Adds support for fused LayerNorm with ReLU, with a sample for [LayerNorm with ReLU bitmask dump](samples/cpp/norm/layernorm_bitmask_relu.cpp).
- Publishes improved SDPA training benchmarks for fp8 and fp16/bf16 graph patterns.
- Enables int4 weight-only quantization for matmul. See the [example](samples/cpp/int4_woq_matmul.cpp).
- Allows block scale dequantize (required for low-precision matmul) to take a 2-D scale factor.
- Allows reductions to accept `deterministic` as an attribute.
- Adds pybinds for block scale dequantize.
- Fixes the sliding-window `attn_score_modifier` function so that it can set true negative infinity.
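To make the `generate_stats` change concrete, here is a minimal, hedged Python sketch modeled on the benchmark update in this commit (where `is_inference=is_infer` becomes `generate_stats=not is_infer`). The graph setup, shapes, and strides are illustrative assumptions, not code from this commit:

```python
# Minimal sketch of the v1.13 rename; the graph/tensor setup below is assumed
# for illustration and is not taken from this commit.
import cudnn

b, h, s, d = 2, 8, 1024, 128  # assumed batch, heads, sequence length, head dim

graph = cudnn.pygraph(
    io_data_type=cudnn.data_type.HALF,
    intermediate_data_type=cudnn.data_type.FLOAT,
    compute_data_type=cudnn.data_type.FLOAT,
)
q = graph.tensor(name="q", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])
k = graph.tensor(name="k", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])
v = graph.tensor(name="v", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])

# Before (deprecated): o, stats = graph.sdpa(q=q, k=k, v=v, is_inference=False, ...)
# After: generate_stats controls whether the softmax stats tensor is produced.
# Note the inverted sense: training (is_inference=False) maps to generate_stats=True,
# exactly as the benchmark diff below passes generate_stats=not is_infer.
o, stats = graph.sdpa(
    q=q,
    k=k,
    v=v,
    generate_stats=True,  # dump the stats needed by the backward pass
    use_causal_mask=True,
)
```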
1 parent f937055 commit 9793df5

File tree: 95 files changed (+5314 lines, −1848 lines)


CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.23)
 
-project(cudnn_frontend VERSION 1.12.0)
+project(cudnn_frontend VERSION 1.13.0)
 
 option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF)
 option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)

benchmark/Llama-3.2-1B-Training/training_perf.py

Lines changed: 76 additions & 23 deletions
@@ -14,6 +14,7 @@
 
 nvidia-smi -i 0 -lgc <min_clock>,<max_clock>
 """
+
 import time
 
 import matplotlib.pyplot as plt
@@ -41,38 +42,54 @@
 
 model_name = "meta-llama/Llama-3.2-1B"
 config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer = transformers.AutoTokenizer.from_pretrained(
+    model_name, trust_remote_code=True
+)
 tokenizer.pad_token = tokenizer.eos_token
-model = LlamaForCausalLM(config).to(device).train() # set norm layers to training mode
+model = LlamaForCausalLM(config).to(device).train()  # set norm layers to training mode
 loss_fct = torch.nn.CrossEntropyLoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
 
 # Configuration matrix to test
 batch_seqlen = [(24, 768), (12, 1024), (6, 2048), (3, 4096), (2, 8192), (1, 16384)]
-backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]
+backends = [
+    SDPBackend.CUDNN_ATTENTION,
+    SDPBackend.EFFICIENT_ATTENTION,
+    SDPBackend.FLASH_ATTENTION,
+]
 
 # Run timing experiments
 warmup_iterations = 5  # num of training iterations to run for warmup
 measure_iterations = 100  # num of training iterations to run to measure for timing
 data = []
 for batch_size, seq_len in batch_seqlen:
-    assert seq_len < tokenizer.model_max_length, "seqlen must be less than the model max length"
+    assert (
+        seq_len < tokenizer.model_max_length
+    ), "seqlen must be less than the model max length"
     # create random tensors
     # - input embedding tensor to simulate a batch of input token sequences converted into embeddings
     # - attention mask of all ones for full attention
     # - random target to compute cross entropy loss in training loop
     shape = (batch_size, seq_len, config.hidden_size)
     inputs_embeds = torch.randn(*shape, dtype=dtype, device=device)
     attention_mask = torch.ones(*shape[:2], dtype=torch.int64, device=device)
-    target = torch.randint(2, config.vocab_size-2, shape[:2], dtype=torch.int64, device=device)
+    target = torch.randint(
+        2, config.vocab_size - 2, shape[:2], dtype=torch.int64, device=device
+    )
     for backend in backends:
         backend_name = str(backend).split(".")[-1]
-        print(f"Timing {backend_name} with batch_size={batch_size} and seq_len={seq_len}")
+        print(
+            f"Timing {backend_name} with batch_size={batch_size} and seq_len={seq_len}"
+        )
         with sdpa_kernel(backends=[backend]):
             # warmup iterations: to minimize the effect of system cache
             for _ in range(warmup_iterations):
-                output = model.forward(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
-                loss = loss_fct(output.logits.view(-1, config.vocab_size), target.view(-1))
+                output = model.forward(
+                    inputs_embeds=inputs_embeds, attention_mask=attention_mask
+                )
+                loss = loss_fct(
+                    output.logits.view(-1, config.vocab_size), target.view(-1)
+                )
                 optimizer.zero_grad()
                 loss.backward()
                 optimizer.step()
@@ -81,47 +98,83 @@
             start = time.time()
             # measure iterations: per-iteration time obtained by averaging
             for _ in range(measure_iterations):
-                output = model.forward(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
-                loss = loss_fct(output.logits.view(-1, config.vocab_size), target.view(-1))
+                output = model.forward(
+                    inputs_embeds=inputs_embeds, attention_mask=attention_mask
+                )
+                loss = loss_fct(
+                    output.logits.view(-1, config.vocab_size), target.view(-1)
+                )
                 optimizer.zero_grad()
                 loss.backward()
                 optimizer.step()
-            torch.cuda.synchronize() # wait for all kernels to finish for accurate timing
+            torch.cuda.synchronize()  # wait for all kernels to finish for accurate timing
             duration = time.time() - start
-        data.append((backend_name, batch_size, seq_len, duration/measure_iterations))
+        data.append(
+            (backend_name, batch_size, seq_len, duration / measure_iterations)
+        )
 
 # Process stats
 df = pd.DataFrame(data, columns=["backend", "batch_size", "seq_len", "time"])
 df["label"] = "BS=" + df["batch_size"].astype(str) + " SL=" + df["seq_len"].astype(str)
 # compute the speedup w.r.t. CUDNN_ATTENTION
 df["speedup_label"] = df["backend"] + " vs EFFICIENT_ATTENTION"
 df["speedup"] = df.apply(
-    lambda row: df.loc[(df["backend"] == "EFFICIENT_ATTENTION") & (df["batch_size"] == row["batch_size"]) & (df["seq_len"] == row["seq_len"]), "time"].values[0] / row["time"],
-    axis=1)
+    lambda row: df.loc[
+        (df["backend"] == "EFFICIENT_ATTENTION")
+        & (df["batch_size"] == row["batch_size"])
+        & (df["seq_len"] == row["seq_len"]),
+        "time",
+    ].values[0]
+    / row["time"],
+    axis=1,
+)
 df.to_csv("training_timing.csv", index=False)
 
 # Create plots
 label_order = [f"BS={b} SL={s}" for b, s in batch_seqlen]  # x-axis order
 hue_order = ["CUDNN_ATTENTION", "FLASH_ATTENTION", "EFFICIENT_ATTENTION"]
-g = sns.barplot(data=df, x="label", y="time", hue="backend",
-                palette=["#76B900", "orchid", "royalblue"], order=label_order, hue_order=hue_order)
+g = sns.barplot(
+    data=df,
+    x="label",
+    y="time",
+    hue="backend",
+    palette=["#76B900", "orchid", "royalblue"],
+    order=label_order,
+    hue_order=hue_order,
+)
 g.set_title("\nTraining Iteration Time")
-g.set(xlabel="Batch size and sequence length", ylabel="Mean iteration time (s), lower is better")
+g.set(
+    xlabel="Batch size and sequence length",
+    ylabel="Mean iteration time (s), lower is better",
+)
 g.get_legend().set_title("")
 plt.legend(fontsize=8)
 plt.xticks(rotation=10, size=8)
 plt.tight_layout()
 plt.savefig("iteration_time.png", dpi=300)
 
 plt.clf()
-hue_order = ["CUDNN_ATTENTION vs EFFICIENT_ATTENTION", "FLASH_ATTENTION vs EFFICIENT_ATTENTION"]
-g = sns.barplot(data=df[df["speedup_label"]!="EFFICIENT_ATTENTION vs EFFICIENT_ATTENTION"],
-                x="label", y="speedup", hue="speedup_label",
-                palette=["#76B900", "orchid"], order=label_order, hue_order=hue_order)
+hue_order = [
+    "CUDNN_ATTENTION vs EFFICIENT_ATTENTION",
+    "FLASH_ATTENTION vs EFFICIENT_ATTENTION",
+]
+g = sns.barplot(
+    data=df[df["speedup_label"] != "EFFICIENT_ATTENTION vs EFFICIENT_ATTENTION"],
+    x="label",
+    y="speedup",
+    hue="speedup_label",
+    palette=["#76B900", "orchid"],
+    order=label_order,
+    hue_order=hue_order,
+)
 for container in g.containers:
     g.bar_label(container, fmt="%.2f", fontsize=6)
-g.set_title("Per-iteration Speed-up of\ncuDNN/Flash Attention Backend vs Efficient Attention")
-g.set(xlabel="Batch size and sequence length", ylabel="Speed-up ratio, higher is better")
+g.set_title(
+    "Per-iteration Speed-up of\ncuDNN/Flash Attention Backend vs Efficient Attention"
+)
+g.set(
+    xlabel="Batch size and sequence length", ylabel="Speed-up ratio, higher is better"
+)
 g.get_legend().set_title("")
 plt.legend(fontsize=8)
 plt.xticks(rotation=10, size=8)

benchmark/sdpa_benchmark/benchmark_flash_attention.py

Lines changed: 2 additions & 2 deletions
@@ -355,7 +355,7 @@ def time_fwd(func, *args, **kwargs):
         q=q_fwd,
         k=k_fwd,
         v=v_fwd,
-        is_inference=is_infer,
+        generate_stats=not is_infer,
         attn_scale=attn_scale,
         use_causal_mask=is_causal,
         dropout=dropout_tuple if is_dropout else None,
@@ -562,7 +562,7 @@ def time_fwd(func, *args, **kwargs):
         descale_s=descale_s_fwd,
         scale_s=scale_s_fwd,
         scale_o=scale_o_fwd,
-        is_inference=is_infer,
+        generate_stats=not is_infer,
         attn_scale=attn_scale,
         use_causal_mask=is_causal,
         use_padding_mask=False,

benchmark/sdpa_benchmark_bf16_training/artifacts/sdpa_benchmark_results_NVIDIA_B200.csv

Lines changed: 0 additions & 41 deletions
This file was deleted.
