Skip to content

Commit 6b639f7

Browse files
committed
Move directory initialization into the parallel execution phase
1 parent 3f0b5b4 commit 6b639f7

File tree

4 files changed

+62
-49
lines changed

4 files changed

+62
-49
lines changed

autotune/core/benchmark.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from autotune.cache.results import ProfileResults
99
from autotune.core.job import ProfileJobs
10-
from autotune.core.metrics import tensor_to_matmul_mac_count
1110
from autotune.core.parallel import set_neuron_core, split_jobs_into_groups
1211
from autotune.core.run_nki import run_on_neuron_core
1312

@@ -24,27 +23,10 @@ def __init__(self, jobs: ProfileJobs, cache_root_dir: str, warmup: int = 10, ite
2423
self.iters = iters
2524

2625
def __call__(self):
27-
self.results = self._init_results()
28-
self.results.dump_summary()
26+
self.results = ProfileResults(sort_key="min_ms", lower_is_better=True)
2927
self._run_on_neuron_cores()
3028
self.results.dump_summary()
3129

32-
def _init_results(self) -> ProfileResults:
33-
results = ProfileResults(sort_key="min_ms", lower_is_better=True)
34-
for job in self.jobs:
35-
job.cache_root_dir = self.cache_root_dir
36-
matmul_mac_count = tensor_to_matmul_mac_count(job.input_tensor_shapes)
37-
results.add_result(
38-
index=job.index,
39-
kernel=job.kernel,
40-
kernel_kwargs=job.kernel_kwargs,
41-
compiler_flags=job.compiler_flags,
42-
cache_dir=job.cache_dir,
43-
matmul_mac_count=matmul_mac_count,
44-
)
45-
job.init_job_dir()
46-
return results
47-
4830
def _run_on_neuron_cores(self):
4931
"""Main function to launch Neuron core worker subprocesses."""
5032
num_neuron_cores = min(32, self.jobs.num_jobs)
@@ -59,22 +41,23 @@ def _run_on_neuron_cores(self):
5941
"warmup": self.warmup,
6042
"iters": self.iters,
6143
"jobs": self.jobs.subset(rank_job_ids),
62-
"results": [self.results[job_id] for job_id in rank_job_ids],
44+
"cache_root_dir": self.cache_root_dir,
45+
"sort_key": self.results.sort_key,
46+
"lower_is_better": self.results.lower_is_better,
6347
}
6448
future = executor.submit(run_on_neuron_core, **kwargs)
65-
futures[future] = neuron_core_id
49+
futures[future] = (neuron_core_id, rank_job_ids)
6650

6751
with tqdm(
6852
total=self.jobs.num_jobs,
6953
desc=f"Running {self.jobs.num_jobs} kernels on {num_neuron_cores} Neuron cores",
7054
unit="kernels",
7155
) as pbar:
7256
for future in as_completed(futures):
73-
neuron_core_id = futures[future]
57+
neuron_core_id, rank_job_ids = futures[future]
7458
updated_results = future.result()
7559
for updated_result in updated_results:
76-
job_id = updated_result.index
77-
self.results[job_id] = updated_result
60+
self.results.results.append(updated_result)
7861
pbar.update(len(updated_results))
7962
for executor in executors:
8063
executor.shutdown(wait=True)

autotune/core/gemm.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ def __call__(self, lhs: tensor, rhs: tensor) -> Any:
7272
Returns:
7373
Any: _description_
7474
"""
75-
print(self.gemm_config)
76-
print(self.loop_ranges)
7775
if self.transposed_lhs:
7876
self.lhs_hbm = HBMTensor(lhs, axes=("K", "M"))
7977
else:

autotune/core/run_nki.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from autotune.cache.results import ProfileResult, capture_error_message
66
from autotune.core.compile import compile_kernel, create_spike_kernel, run_spike_kernel
77
from autotune.core.job import ProfileJobs
8-
from autotune.core.metrics import extract_metrics
8+
from autotune.core.metrics import extract_metrics, tensor_to_matmul_mac_count
99

1010

1111
def compile_all_kernels(jobs: ProfileJobs, results: List[ProfileResult]) -> None:
@@ -94,14 +94,46 @@ def run_neuron_benchmarks(warmup: int, iters: int, jobs: ProfileJobs, results: L
9494
result.add_error(error_msg)
9595

9696

97-
def run_on_neuron_core(warmup: int, iters: int, jobs: ProfileJobs, results: List[ProfileResult]) -> List[ProfileResult]:
97+
def run_on_neuron_core(
98+
warmup: int, iters: int, jobs: ProfileJobs, cache_root_dir: str, sort_key: str, lower_is_better: bool
99+
) -> List[ProfileResult]:
98100
"""
99101
Run kernels with separated CPU compilation and Neuron execution phases.
100102
101-
This function first compiles all kernels on CPU (without SpikeExecutor),
102-
then runs benchmarks on Neuron cores (with SpikeExecutor).
103+
This function initializes ProfileResult objects for each job, then
104+
compiles all kernels on CPU (without SpikeExecutor), and finally
105+
runs benchmarks on Neuron cores (with SpikeExecutor).
106+
107+
Args:
108+
warmup: Number of warmup iterations
109+
iters: Number of benchmark iterations
110+
jobs: ProfileJobs containing all jobs to run
111+
cache_root_dir: Root directory for cache storage
112+
sort_key: The metric name to use for sorting results
113+
lower_is_better: Whether lower values of the sort key are better
114+
115+
Returns:
116+
List of ProfileResult objects with benchmark results
103117
"""
104118

119+
# Initialize ProfileResult objects for each job
120+
results = []
121+
for job in jobs:
122+
job.cache_root_dir = cache_root_dir
123+
matmul_mac_count = tensor_to_matmul_mac_count(job.input_tensor_shapes)
124+
result = ProfileResult(
125+
index=job.index,
126+
main_metric=sort_key,
127+
lower_is_better=lower_is_better,
128+
kernel=job.kernel,
129+
kernel_kwargs=job.kernel_kwargs,
130+
compiler_flags=job.compiler_flags,
131+
cache_dir=job.cache_dir,
132+
matmul_mac_count=matmul_mac_count,
133+
)
134+
job.init_job_dir()
135+
results.append(result)
136+
105137
# Pre-initialize all input tensors once for all jobs with the same shapes
106138
jobs.initialize_input_tensors()
107139

examples/gemm.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import argparse
5-
import random
65

76
import numpy as np
87
from neuronpy.core.language import bfloat16
98

9+
from autotune.cache.visualize import plot_metric
1010
from autotune.core.benchmark import Benchmark
1111
from autotune.core.gemm_config import generate_gemm_configs
1212
from autotune.core.job import ProfileJobs
@@ -32,15 +32,15 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
3232
meta_kernel = ("/home/ec2-user/workplace/nki-autotune/autotune/core/gemm.py", "lhs_rhs_meta_gemm")
3333

3434
# for M, N, K in [(4096, 4096, 4096), (8192, 8192, 8192), (16384, 16384, 16384), (24576, 24576, 24576)]:
35-
for M, N, K in [(128, 512, 1279)]:
35+
for M, N, K in [(3757, 1647, 2539)]:
3636
if transposed_lhs:
3737
lhs_shape = (K, M)
3838
else:
3939
lhs_shape = (M, K)
4040
rhs_shape = (K, N)
4141
# Generate all possible configurations using the new function
4242
configs = generate_gemm_configs(M=M, N=N, K=K)
43-
configs = random.sample(configs, 1)
43+
# configs = random.sample(configs, 1)
4444
for config in configs:
4545
all_jobs.add_job(
4646
kernel=meta_kernel,
@@ -50,14 +50,14 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
5050
compiler_flags="--target=trn1 --auto-cast=none --internal-tensorizer-opt-level=nki",
5151
postprocessing=postprocessing,
5252
)
53-
# all_jobs.add_job(
54-
# kernel=baseline_kernel,
55-
# input_tensor_shapes=[lhs_shape, rhs_shape],
56-
# data_type=data_type,
57-
# kernel_kwargs={},
58-
# compiler_flags="--target=trn1 --auto-cast=none --model-type=transformer --tensorizer-options='--print-nki'",
59-
# postprocessing=postprocessing,
60-
# )
53+
all_jobs.add_job(
54+
kernel=baseline_kernel,
55+
input_tensor_shapes=[lhs_shape, rhs_shape],
56+
data_type=data_type,
57+
kernel_kwargs={},
58+
compiler_flags="--target=trn1 --auto-cast=none --model-type=transformer --tensorizer-options='--print-nki'",
59+
postprocessing=postprocessing,
60+
)
6161

6262

6363
if __name__ == "__main__":
@@ -80,11 +80,11 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
8080
tuner = Benchmark(jobs=all_jobs, cache_root_dir=args.cache_dir)
8181
tuner()
8282

83-
# if args.mode == "lhsT_rhs" or args.mode == "both":
84-
# kernel_names = ["lhsT_rhs_gemm_np", "lhsT_rhs_meta_gemm"]
85-
# plot_metric(args.cache_dir, "min_ms", kernel_names)
86-
# plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
87-
# if args.mode == "lhs_rhs" or args.mode == "both":
88-
# kernel_names = ["lhs_rhs_gemm_np", "lhs_rhs_meta_gemm"]
89-
# plot_metric(args.cache_dir, "min_ms", kernel_names)
90-
# plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
83+
if args.mode == "lhsT_rhs" or args.mode == "both":
84+
kernel_names = ["lhsT_rhs_gemm_np", "lhsT_rhs_meta_gemm"]
85+
plot_metric(args.cache_dir, "min_ms", kernel_names)
86+
plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
87+
if args.mode == "lhs_rhs" or args.mode == "both":
88+
kernel_names = ["lhs_rhs_gemm_np", "lhs_rhs_meta_gemm"]
89+
plot_metric(args.cache_dir, "min_ms", kernel_names)
90+
plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)

0 commit comments

Comments
 (0)