
Commit 480d69f

Adds standard kernel format for geak-openevolve run
1 parent: b02b2b9


README.md

Lines changed: 178 additions & 19 deletions
@@ -136,13 +136,13 @@ cd /path/to/geak-openevolve
 pip install -e .
 ```
 
-### Step 4: Install TB-eval-OE (Evaluation Framework)
+### Step 4: Install GEAK-eval-OE (Evaluation Framework)
 
-TB-eval-OE is required for ROCm kernel evaluation. Install it in one command:
+GEAK-eval-OE is required for ROCm kernel evaluation. Install it in one command:
 
 ```bash
-git clone git@github.com:AMD-AGI/GEAK-eval.git TB-eval-OE && \
-cd TB-eval-OE && \
+git clone git@github.com:AMD-AGI/GEAK-eval.git GEAK-eval-OE && \
+cd GEAK-eval-OE && \
 git checkout openevolve && \
 pip install -e . --no-deps && \
 cd ..
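
To confirm the editable install picked up the evaluation framework, a minimal check like the one below can be run from the repository root. It assumes only that the `openevolve` branch exposes the `tb_eval` package whose import path appears in the kernel template later in this diff.

```python
# Minimal install check (assumption: GEAK-eval-OE exposes the tb_eval package
# imported by the kernel template below).
try:
    from tb_eval.perf.ROCm.performance_utils_pytest import PytestBenchmarker  # noqa: F401
    print("GEAK-eval-OE (tb_eval) import OK")
except ImportError as exc:
    print(f"GEAK-eval-OE import failed: {exc}")
```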
@@ -169,7 +169,7 @@ Edit `my_env.sh` with your configuration:
 export OPENAI_API_KEY="your_actual_api_key_here"
 
 # ROCm Evaluator Configuration
-export ROCM_GOLDEN_DATA_PATH="/path/to/TB-eval-OE/tb_eval/data/ROCm/data/performance/golden_results"
+export ROCM_GOLDEN_DATA_PATH="/path/to/GEAK-eval-OE/tb_eval/data/ROCm/data/performance/golden_results"
 
 # Optional: GPU Architecture
 export GPU_ARCHS="gfx950"  # Adjust for your GPU
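
Before launching a run (with `my_env.sh` presumably sourced first), a small, hypothetical pre-flight check like the following, which is not part of the framework, can confirm the variables are visible to Python and that the golden-results directory exists:

```python
# Hypothetical pre-flight check for the variables defined in my_env.sh.
import os

for var in ("OPENAI_API_KEY", "ROCM_GOLDEN_DATA_PATH", "GPU_ARCHS"):
    print(f"{var} = {os.environ.get(var, '<unset>')}")

golden = os.environ.get("ROCM_GOLDEN_DATA_PATH", "")
if golden and not os.path.isdir(golden):
    print(f"Warning: golden results path does not exist: {golden}")
```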
@@ -257,27 +257,185 @@ def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
 # ============================================================
 # SECTION 3: Benchmarking & Testing Code (pytest)
 # ============================================================
+import numpy as np
+import random
+import torch
+import os
+from numpy.random import RandomState
 import pytest
-import torch
 from torch.testing import assert_close
+from tb_eval.perf.ROCm.performance_utils_pytest import PytestBenchmarker, do_bench_config, save_all_benchmark_results
+from typing import Dict
+
+import triton
+import triton.language as tl
+
+dtype_mapping = {
+    'float16': torch.float16,
+    'float32': torch.float32,
+}
+
+result_gold = {}
+
+######################################## HELPERS for Eval ########################################
+# Helper function to define GB/s for add_kernel
+def calculate_add_gbps(params: Dict, ms: float) -> float:
+    # params will contain 'SIZE', 'dtype_str'
+    size = params['SIZE']
+    dtype = dtype_mapping[params['dtype_str']]
+    # For add: read x, read y, write output
+    # If x, y, output are torch.Tensor objects passed to this calculator:
+    #     total_bytes = (x.numel() * x.element_size() +
+    #                    y.numel() * y.element_size() +
+    #                    output.numel() * output.element_size())
+    # If only params are available:
+    bytes_per_element = torch.tensor([], dtype=dtype).element_size()
+    total_bytes = 3 * size * bytes_per_element  # 2 reads, 1 write
+    gbps = total_bytes / (ms / 1000) / 1e9
+    return gbps
+
+# Helper function to define TFLOPS for add_kernel
+def calculate_add_tflops(params: Dict, ms: float) -> float:
+    size = params['SIZE']
+    # For add: N operations (N additions)
+    flops = size
+    tflops = flops / (ms / 1000) / 1e12
+    return tflops
+
+def set_seed(seed: int = 42) -> None:
+    """
+    Set the random seed for reproducibility across multiple libraries and configure PyTorch for deterministic behavior.
+
+    Args:
+        seed (int): The seed value to set. Default is 42.
+    """
+    # Set seed for Python's built-in random module
+    random.seed(seed)
+    # Set seed for NumPy
+    np.random.seed(seed)
+    # Set seed for PyTorch on CPU
+    torch.manual_seed(seed)
+    # Set seed for PyTorch on all GPUs (if available)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    # Ensure deterministic behavior in PyTorch
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    # Set environment variable for hash-based operations
+    os.environ['PYTHONHASHSEED'] = str(seed)
+
+######################################## HELPERS for Eval ########################################
+
+
+
+

264333
@pytest.mark.parametrize('SIZE,BLOCK_SIZE,dtype_str',
265-
[(98432, 1024, 'float32'), (1048576, 2048, 'float16')])
334+
[(98432, 1024, dtype_str) for dtype_str in ['float16', 'float32']])
266335
def test_add(SIZE, BLOCK_SIZE, dtype_str, request):
267-
dtype = torch.float32 if dtype_str == 'float32' else torch.float16
268-
269-
# Create test data
336+
set_seed()
337+
338+
dtype = dtype_mapping[dtype_str]
339+
output = torch.empty(SIZE, device='cuda', dtype=dtype)
270340
x = torch.randn(SIZE, device='cuda', dtype=dtype)
271341
y = torch.randn(SIZE, device='cuda', dtype=dtype)
272-
output = torch.empty(SIZE, device='cuda', dtype=dtype)
273-
274-
# Launch kernel
275-
grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']), )
342+
343+
def grid(meta):
344+
return (triton.cdiv(SIZE, meta['BLOCK_SIZE']), )
345+
276346
add_kernel[grid](x, y, output, SIZE, BLOCK_SIZE=BLOCK_SIZE)
277-
278-
# Verify correctness
347+
279348
output_torch = x + y
280-
assert_close(output, output_torch, rtol=1e-2, atol=1e-3)
349+
torch.set_printoptions(profile='full')
350+
351+
result_gold['_CALL_SUCCESS_'] = torch.tensor([[1.0]])
352+
353+
################### save tri_out in result_gold ###################
354+
test_case_name = request.node.name
355+
sanitized_key_name = test_case_name.replace("::", "_").replace("[", "_").replace("]", "").replace("-", "_")
356+
result_gold[sanitized_key_name] = output.clone().detach().cpu()
357+
###################################################################
358+
359+
assert_close(output, output_torch, rtol=1e-2, atol=1e-3, check_dtype=False)
360+
361+
362+
OP_NAME_FOR_BENCHMARK = "add_kernel_perf"
363+
364+
@pytest.mark.parametrize('SIZE,BLOCK_SIZE_ARG,dtype_str', # BLOCK_SIZE_ARG is the pytest param name
365+
[(98432, 1024, dtype_str) for dtype_str in ['float16', 'float32']] +
366+
[(1048576, 2048, dtype_str) for dtype_str in ['float16', 'float32']]
367+
)
368+
def test_performance(SIZE, BLOCK_SIZE_ARG, dtype_str, request): # Function accepts BLOCK_SIZE_ARG
369+
set_seed()
370+
dtype = dtype_mapping[dtype_str]
371+
x = torch.randn(SIZE, device='cuda', dtype=dtype)
372+
y = torch.randn(SIZE, device='cuda', dtype=dtype)
373+
output = torch.empty(SIZE, device='cuda', dtype=dtype)
374+
375+
# Kernel launch grid
376+
# The 'meta' dict passed to the grid lambda by Triton contains the constexpr arguments
377+
# that were passed to the kernel launch.
378+
# When we call `add_kernel[grid](..., BLOCK_SIZE=BLOCK_SIZE_ARG)`,
379+
# the `meta` dict will have a key 'BLOCK_SIZE' (the name of the constexpr in the kernel signature)
380+
# and its value will be the runtime `BLOCK_SIZE_ARG`.
381+
grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']),) # ***** CORRECTED HERE *****
382+
383+
kernel_args = [x, y, output, SIZE]
384+
385+
# The op_lambda passes BLOCK_SIZE_ARG (runtime value) as the kernel's `BLOCK_SIZE` (constexpr name)
386+
op_lambda = lambda: add_kernel[grid](*kernel_args, BLOCK_SIZE=BLOCK_SIZE_ARG)
387+
388+
bench_config = do_bench_config(warm_up=25, repetition=100) # Smaller for faster debug
389+
benchmarker = PytestBenchmarker(op_callable=op_lambda,
390+
op_name=OP_NAME_FOR_BENCHMARK,
391+
config=bench_config)
392+
393+
# The dictionary passed to calculators should use consistent keys
394+
current_params_for_calculators = {"SIZE": SIZE, "BLOCK_SIZE_RUNTIME": BLOCK_SIZE_ARG, "dtype_str": dtype_str}
395+
# Note: I used "BLOCK_SIZE_RUNTIME" here to be explicit that it's the value from parametrize,
396+
# not necessarily the same as the constexpr name if they differed.
397+
# If your calculators expect 'BLOCK_SIZE', then use that:
398+
# current_params_for_calculators = {"SIZE": SIZE, "BLOCK_SIZE": BLOCK_SIZE_ARG, "dtype_str": dtype_str}
399+
400+
401+
benchmarker.run_benchmark(current_params_dict=current_params_for_calculators,
402+
gbps_calculator=calculate_add_gbps,
403+
tflops_calculator=calculate_add_tflops)
+
+######################################## HELPERS for Eval ########################################
+# --- Pytest hook to save the dictionary at the end of the session ---
+def test_save_results():
+    """
+    Called after whole test run finished, right before returning the exit status to the system.
+    """
+    print('Inside session finish...')
+    if "_CALL_SUCCESS_" not in result_gold:
+        result_gold['_CALL_SUCCESS_'] = torch.tensor([[0.0]])
+    OUTPUT_FILENAME = __file__.replace('.','_') + '.pt'
+    print(f"\nSaving all y_triton results to {OUTPUT_FILENAME}...")
+    # Ensure the directory for the output file exists if it's in a subdirectory
+    output_dir = os.path.dirname(OUTPUT_FILENAME)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir, exist_ok=True)
+    torch.save(result_gold, OUTPUT_FILENAME)
+    print(f"Successfully saved {len(result_gold)} y_triton tensors to {OUTPUT_FILENAME}.")
+
+
+def test_save_performance_results():
+    """
+    Called after the test_performance function finishes.
+    This is a separate hook to ensure performance results are saved.
+    """
+    print('\nPytest session finishing... Saving benchmark results...')
+
+    output_directory = os.path.join(os.path.dirname(__file__), "perf")  # Save in a "perf" subdirectory next to the test file
+    os.makedirs(output_directory, exist_ok=True)
+
+    save_all_benchmark_results(output_directory)
+    print(f"All benchmark results attempted to save to: {output_directory}")
+
+
+######################################## HELPERS for Eval ########################################
 ```
 
 ### Section Details
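
For orientation, the key and file naming used by the template can be traced by hand. The snippet below is illustrative only; the test id and file path are made-up stand-ins for what pytest and `__file__` would provide at run time.

```python
# Illustration of the sanitization used in test_add and test_save_results above.
test_case_name = "test_add[98432-1024-float16]"            # example pytest node name
sanitized_key_name = (test_case_name.replace("::", "_")
                      .replace("[", "_").replace("]", "").replace("-", "_"))
print(sanitized_key_name)                                  # test_add_98432_1024_float16

file_path = "/workspace/add_kernel.py"                     # stand-in for __file__
print(file_path.replace('.', '_') + '.pt')                 # /workspace/add_kernel_py.pt
```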
@@ -301,7 +459,8 @@ def test_add(SIZE, BLOCK_SIZE, dtype_str, request):
 - **Correctness tests**: Validate kernel output against reference (e.g., PyTorch)
 - **Test parameters**: Different input sizes, block sizes, data types
 - **Assertions**: Use `assert_close` or similar for numerical validation
-- Can include performance benchmarking tests with timing measurements
+- Copy the `test_save_results` and `test_save_performance_results` functions exactly as-is.
+- Write the `test_performance` function with the exact test cases you want to benchmark your kernel on, implementing the `calculate_gbps` and `calculate_tflops` helpers according to your kernel (see the sketch below).
 
 ### Example Files
 
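The sketch below shows what per-kernel metric calculators might look like for a different kernel, here a hypothetical square matmul. The function names, the `N` key, and the byte/FLOP counts are illustrative; only the `(params, ms)` signature mirrors the `calculate_add_gbps` / `calculate_add_tflops` helpers in the template above.

```python
# Sketch only: metric calculators for a hypothetical N x N matmul kernel,
# following the (params, ms) signature used by the add_kernel template.
from typing import Dict

import torch

dtype_mapping = {'float16': torch.float16, 'float32': torch.float32}

def calculate_matmul_gbps(params: Dict, ms: float) -> float:
    n = params['N']                                   # C = A @ B, all N x N
    elem = torch.tensor([], dtype=dtype_mapping[params['dtype_str']]).element_size()
    total_bytes = 3 * n * n * elem                    # read A, read B, write C
    return total_bytes / (ms / 1000) / 1e9

def calculate_matmul_tflops(params: Dict, ms: float) -> float:
    n = params['N']
    flops = 2 * n ** 3                                # N^3 multiply-adds
    return flops / (ms / 1000) / 1e12
```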
@@ -409,7 +568,7 @@ geak-openevolve/
 │   │   └── */             # ROCmBench and other kernels
 │   ├── samples/           # Sample kernels and test utilities
 │   └── prompts/           # LLM prompt templates
-├── TB-eval-OE/             # Evaluation framework (install separately)
+├── GEAK-eval-OE/           # Evaluation framework (install separately)
 ├── docs/                   # Documentation and diagrams
 ├── setup.py
 └── README.md
