
Commit 9793df5

cudnn frontend v1.13.0 (#150)
cudnn frontend v1.13 is the preferred cudnn frontend version for [cudnn version 9.11.0](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html#cudnn-9-11-0) and above. This release:

- Introduces a device descriptor, which allows device-less compilation of a cudnn graph for a target GPU. See the newly added [sample](samples/cpp/misc/deviceless_aot_compilation.cpp) and documentation.
- Introduces `generate_stats` as an alias for `is_inference`; `generate_stats` now controls the stats tensor dump, and `is_inference` is deprecated (a sketch follows this list).
- Improves support checks for left and right diagonal bands in conjunction with the diagonal alignment.
- Improves error handling for large head dimensions (d > 128) in SDPA bprop.
- Adds support for fused LayerNorm with ReLU, with a sample for [LayerNorm with ReLU bitmask dump](samples/cpp/norm/layernorm_bitmask_relu.cpp).
- Publishes improved SDPA training benchmarks for fp8 and fp16/bf16 graph patterns.
- Enables int4 weight-only quantization for matmul. See the [example](samples/cpp/int4_woq_matmul.cpp).
- Allows block scale dequantize (required for low-precision matmul) to take a 2-D scale factor.
- Allows reductions to accept `deterministic` as an attribute.
- Adds pybinds for block scale dequantize.
- Fixes the sliding-window `attn_score_modifier` function so that it can set true negative infinity.
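To make the `generate_stats` change concrete, here is a minimal, hedged Python sketch modeled on the benchmark update in this commit (where `is_inference=is_infer` becomes `generate_stats=not is_infer`). The graph setup, shapes, and strides are illustrative assumptions, not code from this commit:

```python
# Minimal sketch of the v1.13 rename; the graph/tensor setup below is assumed
# for illustration and is not taken from this commit.
import cudnn

b, h, s, d = 2, 8, 1024, 128  # assumed batch, heads, sequence length, head dim

graph = cudnn.pygraph(
    io_data_type=cudnn.data_type.HALF,
    intermediate_data_type=cudnn.data_type.FLOAT,
    compute_data_type=cudnn.data_type.FLOAT,
)
q = graph.tensor(name="q", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])
k = graph.tensor(name="k", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])
v = graph.tensor(name="v", dim=[b, h, s, d], stride=[h * s * d, s * d, d, 1])

# Before (deprecated): o, stats = graph.sdpa(q=q, k=k, v=v, is_inference=False, ...)
# After: generate_stats controls whether the softmax stats tensor is produced.
# Note the inverted sense: training (is_inference=False) maps to generate_stats=True,
# exactly as the benchmark diff below passes generate_stats=not is_infer.
o, stats = graph.sdpa(
    q=q,
    k=k,
    v=v,
    generate_stats=True,  # dump the stats needed by the backward pass
    use_causal_mask=True,
)
```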
1 parent f937055 commit 9793df5

File tree: 95 files changed (+5314 lines, −1848 lines)


CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.23)
 
-project(cudnn_frontend VERSION 1.12.0)
+project(cudnn_frontend VERSION 1.13.0)
 
 option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF)
 option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)

benchmark/Llama-3.2-1B-Training/training_perf.py

Lines changed: 76 additions & 23 deletions
@@ -14,6 +14,7 @@
 
 nvidia-smi -i 0 -lgc <min_clock>,<max_clock>
 """
+
 import time
 
 import matplotlib.pyplot as plt
@@ -41,38 +42,54 @@
 
 model_name = "meta-llama/Llama-3.2-1B"
 config = transformers.AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+tokenizer = transformers.AutoTokenizer.from_pretrained(
+    model_name, trust_remote_code=True
+)
 tokenizer.pad_token = tokenizer.eos_token
-model = LlamaForCausalLM(config).to(device).train() # set norm layers to training mode
+model = LlamaForCausalLM(config).to(device).train()  # set norm layers to training mode
 loss_fct = torch.nn.CrossEntropyLoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
 
 # Configuration matrix to test
 batch_seqlen = [(24, 768), (12, 1024), (6, 2048), (3, 4096), (2, 8192), (1, 16384)]
-backends = [SDPBackend.CUDNN_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]
+backends = [
+    SDPBackend.CUDNN_ATTENTION,
+    SDPBackend.EFFICIENT_ATTENTION,
+    SDPBackend.FLASH_ATTENTION,
+]
 
 # Run timing experiments
 warmup_iterations = 5  # num of training iterations to run for warmup
 measure_iterations = 100  # num of training iterations to run to measure for timing
 data = []
 for batch_size, seq_len in batch_seqlen:
-    assert seq_len < tokenizer.model_max_length, "seqlen must be less than the model max length"
+    assert (
+        seq_len < tokenizer.model_max_length
+    ), "seqlen must be less than the model max length"
     # create random tensors
     # - input embedding tensor to simulate a batch of input token sequences converted into embeddings
     # - attention mask of all ones for full attention
     # - random target to compute cross entropy loss in training loop
     shape = (batch_size, seq_len, config.hidden_size)
     inputs_embeds = torch.randn(*shape, dtype=dtype, device=device)
     attention_mask = torch.ones(*shape[:2], dtype=torch.int64, device=device)
-    target = torch.randint(2, config.vocab_size-2, shape[:2], dtype=torch.int64, device=device)
+    target = torch.randint(
+        2, config.vocab_size - 2, shape[:2], dtype=torch.int64, device=device
+    )
     for backend in backends:
         backend_name = str(backend).split(".")[-1]
-        print(f"Timing {backend_name} with batch_size={batch_size} and seq_len={seq_len}")
+        print(
+            f"Timing {backend_name} with batch_size={batch_size} and seq_len={seq_len}"
+        )
         with sdpa_kernel(backends=[backend]):
             # warmup iterations: to minimize the effect of system cache
             for _ in range(warmup_iterations):
-                output = model.forward(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
-                loss = loss_fct(output.logits.view(-1, config.vocab_size), target.view(-1))
+                output = model.forward(
+                    inputs_embeds=inputs_embeds, attention_mask=attention_mask
+                )
+                loss = loss_fct(
+                    output.logits.view(-1, config.vocab_size), target.view(-1)
+                )
                 optimizer.zero_grad()
                 loss.backward()
                 optimizer.step()
@@ -81,47 +98,83 @@
             start = time.time()
             # measure iterations: per-iteration time obtained by averaging
             for _ in range(measure_iterations):
-                output = model.forward(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
-                loss = loss_fct(output.logits.view(-1, config.vocab_size), target.view(-1))
+                output = model.forward(
+                    inputs_embeds=inputs_embeds, attention_mask=attention_mask
+                )
+                loss = loss_fct(
+                    output.logits.view(-1, config.vocab_size), target.view(-1)
+                )
                 optimizer.zero_grad()
                 loss.backward()
                 optimizer.step()
-            torch.cuda.synchronize() # wait for all kernels to finish for accurate timing
+            torch.cuda.synchronize()  # wait for all kernels to finish for accurate timing
             duration = time.time() - start
-        data.append((backend_name, batch_size, seq_len, duration/measure_iterations))
+        data.append(
+            (backend_name, batch_size, seq_len, duration / measure_iterations)
+        )
 
 # Process stats
 df = pd.DataFrame(data, columns=["backend", "batch_size", "seq_len", "time"])
 df["label"] = "BS=" + df["batch_size"].astype(str) + " SL=" + df["seq_len"].astype(str)
 # compute the speedup w.r.t. CUDNN_ATTENTION
 df["speedup_label"] = df["backend"] + " vs EFFICIENT_ATTENTION"
 df["speedup"] = df.apply(
-    lambda row: df.loc[(df["backend"] == "EFFICIENT_ATTENTION") & (df["batch_size"] == row["batch_size"]) & (df["seq_len"] == row["seq_len"]), "time"].values[0] / row["time"],
-    axis=1)
+    lambda row: df.loc[
+        (df["backend"] == "EFFICIENT_ATTENTION")
+        & (df["batch_size"] == row["batch_size"])
+        & (df["seq_len"] == row["seq_len"]),
+        "time",
+    ].values[0]
+    / row["time"],
+    axis=1,
+)
 df.to_csv("training_timing.csv", index=False)
 
 # Create plots
 label_order = [f"BS={b} SL={s}" for b, s in batch_seqlen]  # x-axis order
 hue_order = ["CUDNN_ATTENTION", "FLASH_ATTENTION", "EFFICIENT_ATTENTION"]
-g = sns.barplot(data=df, x="label", y="time", hue="backend",
-                palette=["#76B900", "orchid", "royalblue"], order=label_order, hue_order=hue_order)
+g = sns.barplot(
+    data=df,
+    x="label",
+    y="time",
+    hue="backend",
+    palette=["#76B900", "orchid", "royalblue"],
+    order=label_order,
+    hue_order=hue_order,
+)
 g.set_title("\nTraining Iteration Time")
-g.set(xlabel="Batch size and sequence length", ylabel="Mean iteration time (s), lower is better")
+g.set(
+    xlabel="Batch size and sequence length",
+    ylabel="Mean iteration time (s), lower is better",
+)
 g.get_legend().set_title("")
 plt.legend(fontsize=8)
 plt.xticks(rotation=10, size=8)
 plt.tight_layout()
 plt.savefig("iteration_time.png", dpi=300)
 
 plt.clf()
-hue_order = ["CUDNN_ATTENTION vs EFFICIENT_ATTENTION", "FLASH_ATTENTION vs EFFICIENT_ATTENTION"]
-g = sns.barplot(data=df[df["speedup_label"]!="EFFICIENT_ATTENTION vs EFFICIENT_ATTENTION"],
-                x="label", y="speedup", hue="speedup_label",
-                palette=["#76B900", "orchid"], order=label_order, hue_order=hue_order)
+hue_order = [
+    "CUDNN_ATTENTION vs EFFICIENT_ATTENTION",
+    "FLASH_ATTENTION vs EFFICIENT_ATTENTION",
+]
+g = sns.barplot(
+    data=df[df["speedup_label"] != "EFFICIENT_ATTENTION vs EFFICIENT_ATTENTION"],
+    x="label",
+    y="speedup",
+    hue="speedup_label",
+    palette=["#76B900", "orchid"],
+    order=label_order,
+    hue_order=hue_order,
+)
 for container in g.containers:
     g.bar_label(container, fmt="%.2f", fontsize=6)
-g.set_title("Per-iteration Speed-up of\ncuDNN/Flash Attention Backend vs Efficient Attention")
-g.set(xlabel="Batch size and sequence length", ylabel="Speed-up ratio, higher is better")
+g.set_title(
+    "Per-iteration Speed-up of\ncuDNN/Flash Attention Backend vs Efficient Attention"
+)
+g.set(
+    xlabel="Batch size and sequence length", ylabel="Speed-up ratio, higher is better"
+)
 g.get_legend().set_title("")
 plt.legend(fontsize=8)
 plt.xticks(rotation=10, size=8)

benchmark/sdpa_benchmark/benchmark_flash_attention.py

Lines changed: 2 additions & 2 deletions
@@ -355,7 +355,7 @@ def time_fwd(func, *args, **kwargs):
         q=q_fwd,
         k=k_fwd,
         v=v_fwd,
-        is_inference=is_infer,
+        generate_stats=not is_infer,
         attn_scale=attn_scale,
         use_causal_mask=is_causal,
         dropout=dropout_tuple if is_dropout else None,
@@ -562,7 +562,7 @@ def time_fwd(func, *args, **kwargs):
         descale_s=descale_s_fwd,
         scale_s=scale_s_fwd,
         scale_o=scale_o_fwd,
-        is_inference=is_infer,
+        generate_stats=not is_infer,
         attn_scale=attn_scale,
         use_causal_mask=is_causal,
         use_padding_mask=False,

benchmark/sdpa_benchmark_bf16_training/artifacts/sdpa_benchmark_results_NVIDIA_B200.csv

Lines changed: 0 additions & 41 deletions
This file was deleted.
