@@ -136,13 +136,13 @@ cd /path/to/geak-openevolve
 pip install -e .
 ```
 
-### Step 4: Install TB-eval-OE (Evaluation Framework)
+### Step 4: Install GEAK-eval-OE (Evaluation Framework)
 
-TB-eval-OE is required for ROCm kernel evaluation. Install it in one command:
+GEAK-eval-OE is required for ROCm kernel evaluation. Install it in one command:
 
 ```bash
-git clone git@github.com:AMD-AGI/GEAK-eval.git TB-eval-OE && \
-cd TB-eval-OE && \
+git clone git@github.com:AMD-AGI/GEAK-eval.git GEAK-eval-OE && \
+cd GEAK-eval-OE && \
 git checkout openevolve && \
 pip install -e . --no-deps && \
 cd ..
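
If the clone and install succeed, the evaluation utilities that the test template later in this guide imports should be available. A quick, optional sanity check (assuming the package is exposed under the `tb_eval` module name used by the example test code below):

```python
# Optional sanity check for the GEAK-eval-OE install.
# These are the same imports the example benchmark/test template below relies on;
# if they fail, revisit the clone/checkout/install step above.
from tb_eval.perf.ROCm.performance_utils_pytest import (
    PytestBenchmarker,
    do_bench_config,
    save_all_benchmark_results,
)

print("GEAK-eval-OE (tb_eval) imports OK")
```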
@@ -169,7 +169,7 @@ Edit `my_env.sh` with your configuration:
 export OPENAI_API_KEY="your_actual_api_key_here"
 
 # ROCm Evaluator Configuration
-export ROCM_GOLDEN_DATA_PATH="/path/to/TB-eval-OE/tb_eval/data/ROCm/data/performance/golden_results"
+export ROCM_GOLDEN_DATA_PATH="/path/to/GEAK-eval-OE/tb_eval/data/ROCm/data/performance/golden_results"
 
 # Optional: GPU Architecture
 export GPU_ARCHS="gfx950"  # Adjust for your GPU
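
After editing `my_env.sh` and loading it into your shell, it is worth confirming that the variables actually reached the environment before starting a run. A small optional check (not part of the repository, purely for convenience):

```python
# Optional: verify that the variables from my_env.sh are visible.
# GPU_ARCHS is optional, so it is only reported, not flagged as missing.
import os

for var in ("OPENAI_API_KEY", "ROCM_GOLDEN_DATA_PATH"):
    print(f"{var}: {'set' if os.environ.get(var) else 'MISSING'}")
print(f"GPU_ARCHS: {os.environ.get('GPU_ARCHS', '(not set, optional)')}")
```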
@@ -257,27 +257,185 @@ def add_kernel(x_ptr, y_ptr, output_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
 # ============================================================
 # SECTION 3: Benchmarking & Testing Code (pytest)
 # ============================================================
+import numpy as np
+import random
+import torch
+import os
+from numpy.random import RandomState
 import pytest
-import torch
 from torch.testing import assert_close
+from tb_eval.perf.ROCm.performance_utils_pytest import PytestBenchmarker, do_bench_config, save_all_benchmark_results
+from typing import Dict
+
+import triton
+import triton.language as tl
+
+dtype_mapping = {
+    'float16': torch.float16,
+    'float32': torch.float32,
+}
+
+result_gold = {}
+
+# ####################################### HELPERS for Eval ########################################
+# Helper function to compute GB/s for add_kernel
+def calculate_add_gbps(params: Dict, ms: float) -> float:
+    # params will contain 'SIZE', 'dtype_str'
+    size = params['SIZE']
+    dtype = dtype_mapping[params['dtype_str']]
+    # For add: read x, read y, write output
+    # If x, y, output are torch.Tensor objects passed to this calculator:
+    #     total_bytes = (x.numel() * x.element_size() +
+    #                    y.numel() * y.element_size() +
+    #                    output.numel() * output.element_size())
+    # If only params are available:
+    bytes_per_element = torch.tensor([], dtype=dtype).element_size()
+    total_bytes = 3 * size * bytes_per_element  # 2 reads, 1 write
+    gbps = total_bytes / (ms / 1000) / 1e9
+    return gbps
+
+# Helper function to compute TFLOPS for add_kernel
+def calculate_add_tflops(params: Dict, ms: float) -> float:
+    size = params['SIZE']
+    # For add: N operations (N additions)
+    flops = size
+    tflops = flops / (ms / 1000) / 1e12
+    return tflops
+
+def set_seed(seed: int = 42) -> None:
+    """
+    Set the random seed for reproducibility across multiple libraries and configure PyTorch for deterministic behavior.
+
+    Args:
+        seed (int): The seed value to set. Default is 42.
+    """
+    # Set seed for Python's built-in random module
+    random.seed(seed)
+    # Set seed for NumPy
+    np.random.seed(seed)
+    # Set seed for PyTorch on CPU
+    torch.manual_seed(seed)
+    # Set seed for PyTorch on all GPUs (if available)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    # Ensure deterministic behavior in PyTorch
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    # Set environment variable for hash-based operations
+    os.environ['PYTHONHASHSEED'] = str(seed)
+
+# ####################################### HELPERS for Eval ########################################
+
+
+
 
 @pytest.mark.parametrize('SIZE,BLOCK_SIZE,dtype_str',
-                         [(98432, 1024, 'float32'), (1048576, 2048, 'float16')])
+                         [(98432, 1024, dtype_str) for dtype_str in ['float16', 'float32']])
 def test_add(SIZE, BLOCK_SIZE, dtype_str, request):
-    dtype = torch.float32 if dtype_str == 'float32' else torch.float16
-
-    # Create test data
+    set_seed()
+
+    dtype = dtype_mapping[dtype_str]
+    output = torch.empty(SIZE, device='cuda', dtype=dtype)
     x = torch.randn(SIZE, device='cuda', dtype=dtype)
     y = torch.randn(SIZE, device='cuda', dtype=dtype)
-    output = torch.empty(SIZE, device='cuda', dtype=dtype)
-
-    # Launch kernel
-    grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']), )
+
+    def grid(meta):
+        return (triton.cdiv(SIZE, meta['BLOCK_SIZE']), )
+
     add_kernel[grid](x, y, output, SIZE, BLOCK_SIZE=BLOCK_SIZE)
-
-    # Verify correctness
+
     output_torch = x + y
-    assert_close(output, output_torch, rtol=1e-2, atol=1e-3)
+    torch.set_printoptions(profile='full')
+
+    result_gold['_CALL_SUCCESS_'] = torch.tensor([[1.0]])
+
+    # ################## save tri_out in result_gold ###################
+    test_case_name = request.node.name
+    sanitized_key_name = test_case_name.replace("::", "_").replace("[", "_").replace("]", "").replace("-", "_")
+    result_gold[sanitized_key_name] = output.clone().detach().cpu()
+    # ##################################################################
+
+    assert_close(output, output_torch, rtol=1e-2, atol=1e-3, check_dtype=False)
+
+
+OP_NAME_FOR_BENCHMARK = "add_kernel_perf"
+
+@pytest.mark.parametrize('SIZE,BLOCK_SIZE_ARG,dtype_str',  # BLOCK_SIZE_ARG is the pytest param name
+                         [(98432, 1024, dtype_str) for dtype_str in ['float16', 'float32']] +
+                         [(1048576, 2048, dtype_str) for dtype_str in ['float16', 'float32']]
+                         )
+def test_performance(SIZE, BLOCK_SIZE_ARG, dtype_str, request):  # Function accepts BLOCK_SIZE_ARG
+    set_seed()
+    dtype = dtype_mapping[dtype_str]
+    x = torch.randn(SIZE, device='cuda', dtype=dtype)
+    y = torch.randn(SIZE, device='cuda', dtype=dtype)
+    output = torch.empty(SIZE, device='cuda', dtype=dtype)
+
+    # Kernel launch grid.
+    # The 'meta' dict passed to the grid lambda by Triton contains the constexpr arguments
+    # that were passed to the kernel launch.
+    # When we call `add_kernel[grid](..., BLOCK_SIZE=BLOCK_SIZE_ARG)`,
+    # the `meta` dict will have a key 'BLOCK_SIZE' (the name of the constexpr in the kernel signature)
+    # and its value will be the runtime `BLOCK_SIZE_ARG`.
+    grid = lambda meta: (triton.cdiv(SIZE, meta['BLOCK_SIZE']),)
+
+    kernel_args = [x, y, output, SIZE]
+
+    # The op_lambda passes BLOCK_SIZE_ARG (runtime value) as the kernel's `BLOCK_SIZE` (constexpr name).
+    op_lambda = lambda: add_kernel[grid](*kernel_args, BLOCK_SIZE=BLOCK_SIZE_ARG)
+
+    bench_config = do_bench_config(warm_up=25, repetition=100)  # Smaller for faster debug
+    benchmarker = PytestBenchmarker(op_callable=op_lambda,
+                                    op_name=OP_NAME_FOR_BENCHMARK,
+                                    config=bench_config)
+
+    # The dictionary passed to the calculators should use consistent keys.
+    current_params_for_calculators = {"SIZE": SIZE, "BLOCK_SIZE_RUNTIME": BLOCK_SIZE_ARG, "dtype_str": dtype_str}
+    # Note: "BLOCK_SIZE_RUNTIME" is used here to make explicit that it is the value from parametrize,
+    # not necessarily the same as the constexpr name if they differed.
+    # If your calculators expect 'BLOCK_SIZE', then use that key instead:
+    # current_params_for_calculators = {"SIZE": SIZE, "BLOCK_SIZE": BLOCK_SIZE_ARG, "dtype_str": dtype_str}
+
+
+    benchmarker.run_benchmark(current_params_dict=current_params_for_calculators,
+                              gbps_calculator=calculate_add_gbps,
+                              tflops_calculator=calculate_add_tflops)
+
+# ####################################### HELPERS for Eval ########################################
+# --- Saves the result_gold dictionary at the end of the test run ---
+def test_save_results():
+    """
+    Called after the whole test run has finished, right before returning the exit status to the system.
+    """
+    print('Inside session finish...')
+    if "_CALL_SUCCESS_" not in result_gold:
+        result_gold['_CALL_SUCCESS_'] = torch.tensor([[0.0]])
+    OUTPUT_FILENAME = __file__.replace('.', '_') + '.pt'
+    print(f"\nSaving all y_triton results to {OUTPUT_FILENAME}...")
+    # Ensure the directory for the output file exists if it's in a subdirectory
+    output_dir = os.path.dirname(OUTPUT_FILENAME)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir, exist_ok=True)
+    torch.save(result_gold, OUTPUT_FILENAME)
+    print(f"Successfully saved {len(result_gold)} y_triton tensors to {OUTPUT_FILENAME}.")
+
+
+def test_save_performance_results():
+    """
+    Called after the test_performance function finishes.
+    This is a separate test to ensure performance results are saved.
+    """
+    print('\nPytest session finishing... Saving benchmark results...')
+
+    output_directory = os.path.join(os.path.dirname(__file__), "perf")  # Save in a "perf" subdirectory next to the test file
+    os.makedirs(output_directory, exist_ok=True)
+
+    save_all_benchmark_results(output_directory)
+    print(f"Attempted to save all benchmark results to: {output_directory}")
+
+
+# ####################################### HELPERS for Eval ########################################
 ```
 
 ### Section Details
@@ -301,7 +459,8 @@ def test_add(SIZE, BLOCK_SIZE, dtype_str, request):
 - **Correctness tests**: Validate kernel output against a reference (e.g., PyTorch)
 - **Test parameters**: Different input sizes, block sizes, data types
 - **Assertions**: Use `assert_close` or similar for numerical validation
-- Can include performance benchmarking tests with timing measurements
+- Copy the `test_save_results` and `test_save_performance_results` functions exactly as shown above.
+- Write the `test_performance` function with the exact test cases you want to benchmark your kernel on, and implement the GB/s and TFLOPS calculator functions (here `calculate_add_gbps` and `calculate_add_tflops`) according to the kernel; see the sketch below this list.
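
The GB/s and TFLOPS calculators must match the kernel being benchmarked, i.e., its actual memory traffic and operation count. As an illustration only (a hypothetical matmul kernel, not part of this repository), the helpers from the template above could be adapted like this:

```python
# Hypothetical example: calculator helpers adapted to a matmul kernel (C = A @ B).
# The parameter keys ('M', 'N', 'K', 'dtype_str') are illustrative; use whatever keys
# your test_performance function puts into current_params_for_calculators.
from typing import Dict
import torch

dtype_mapping = {'float16': torch.float16, 'float32': torch.float32}

def calculate_matmul_gbps(params: Dict, ms: float) -> float:
    M, N, K = params['M'], params['N'], params['K']
    elem_size = torch.tensor([], dtype=dtype_mapping[params['dtype_str']]).element_size()
    # Read A (M*K) and B (K*N), write C (M*N); cache reuse is ignored.
    total_bytes = (M * K + K * N + M * N) * elem_size
    return total_bytes / (ms / 1000) / 1e9

def calculate_matmul_tflops(params: Dict, ms: float) -> float:
    M, N, K = params['M'], params['N'], params['K']
    flops = 2 * M * N * K  # one multiply and one add per K-step of each output element
    return flops / (ms / 1000) / 1e12
```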
 
 ### Example Files
 
@@ -409,7 +568,7 @@ geak-openevolve/
 │   │   └── */              # ROCmBench and other kernels
 │   ├── samples/            # Sample kernels and test utilities
 │   └── prompts/            # LLM prompt templates
-├── TB-eval-OE/              # Evaluation framework (install separately)
+├── GEAK-eval-OE/            # Evaluation framework (install separately)
 ├── docs/                    # Documentation and diagrams
 ├── setup.py
 └── README.md