Skip to content

Commit 6b639f7

Browse files
committed
Move directory initialization into the parallel execution phase
1 parent 3f0b5b4 commit 6b639f7

File tree

4 files changed

+62
-49
lines changed

4 files changed

+62
-49
lines changed

autotune/core/benchmark.py

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
from autotune.cache.results import ProfileResults
99
from autotune.core.job import ProfileJobs
10-
from autotune.core.metrics import tensor_to_matmul_mac_count
1110
from autotune.core.parallel import set_neuron_core, split_jobs_into_groups
1211
from autotune.core.run_nki import run_on_neuron_core
1312

@@ -24,27 +23,10 @@ def __init__(self, jobs: ProfileJobs, cache_root_dir: str, warmup: int = 10, ite
2423
self.iters = iters
2524

2625
def __call__(self):
27-
self.results = self._init_results()
28-
self.results.dump_summary()
26+
self.results = ProfileResults(sort_key="min_ms", lower_is_better=True)
2927
self._run_on_neuron_cores()
3028
self.results.dump_summary()
3129

32-
def _init_results(self) -> ProfileResults:
33-
results = ProfileResults(sort_key="min_ms", lower_is_better=True)
34-
for job in self.jobs:
35-
job.cache_root_dir = self.cache_root_dir
36-
matmul_mac_count = tensor_to_matmul_mac_count(job.input_tensor_shapes)
37-
results.add_result(
38-
index=job.index,
39-
kernel=job.kernel,
40-
kernel_kwargs=job.kernel_kwargs,
41-
compiler_flags=job.compiler_flags,
42-
cache_dir=job.cache_dir,
43-
matmul_mac_count=matmul_mac_count,
44-
)
45-
job.init_job_dir()
46-
return results
47-
4830
def _run_on_neuron_cores(self):
4931
"""Main function to launch Neuron core worker subprocesses."""
5032
num_neuron_cores = min(32, self.jobs.num_jobs)
@@ -59,22 +41,23 @@ def _run_on_neuron_cores(self):
5941
"warmup": self.warmup,
6042
"iters": self.iters,
6143
"jobs": self.jobs.subset(rank_job_ids),
62-
"results": [self.results[job_id] for job_id in rank_job_ids],
44+
"cache_root_dir": self.cache_root_dir,
45+
"sort_key": self.results.sort_key,
46+
"lower_is_better": self.results.lower_is_better,
6347
}
6448
future = executor.submit(run_on_neuron_core, **kwargs)
65-
futures[future] = neuron_core_id
49+
futures[future] = (neuron_core_id, rank_job_ids)
6650

6751
with tqdm(
6852
total=self.jobs.num_jobs,
6953
desc=f"Running {self.jobs.num_jobs} kernels on {num_neuron_cores} Neuron cores",
7054
unit="kernels",
7155
) as pbar:
7256
for future in as_completed(futures):
73-
neuron_core_id = futures[future]
57+
neuron_core_id, rank_job_ids = futures[future]
7458
updated_results = future.result()
7559
for updated_result in updated_results:
76-
job_id = updated_result.index
77-
self.results[job_id] = updated_result
60+
self.results.results.append(updated_result)
7861
pbar.update(len(updated_results))
7962
for executor in executors:
8063
executor.shutdown(wait=True)

autotune/core/gemm.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ def __call__(self, lhs: tensor, rhs: tensor) -> Any:
7272
Returns:
7373
Any: _description_
7474
"""
75-
print(self.gemm_config)
76-
print(self.loop_ranges)
7775
if self.transposed_lhs:
7876
self.lhs_hbm = HBMTensor(lhs, axes=("K", "M"))
7977
else:

autotune/core/run_nki.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from autotune.cache.results import ProfileResult, capture_error_message
66
from autotune.core.compile import compile_kernel, create_spike_kernel, run_spike_kernel
77
from autotune.core.job import ProfileJobs
8-
from autotune.core.metrics import extract_metrics
8+
from autotune.core.metrics import extract_metrics, tensor_to_matmul_mac_count
99

1010

1111
def compile_all_kernels(jobs: ProfileJobs, results: List[ProfileResult]) -> None:
@@ -94,14 +94,46 @@ def run_neuron_benchmarks(warmup: int, iters: int, jobs: ProfileJobs, results: L
9494
result.add_error(error_msg)
9595

9696

97-
def run_on_neuron_core(warmup: int, iters: int, jobs: ProfileJobs, results: List[ProfileResult]) -> List[ProfileResult]:
97+
def run_on_neuron_core(
98+
warmup: int, iters: int, jobs: ProfileJobs, cache_root_dir: str, sort_key: str, lower_is_better: bool
99+
) -> List[ProfileResult]:
98100
"""
99101
Run kernels with separated CPU compilation and Neuron execution phases.
100102
101-
This function first compiles all kernels on CPU (without SpikeExecutor),
102-
then runs benchmarks on Neuron cores (with SpikeExecutor).
103+
This function initializes ProfileResult objects for each job, then
104+
compiles all kernels on CPU (without SpikeExecutor), and finally
105+
runs benchmarks on Neuron cores (with SpikeExecutor).
106+
107+
Args:
108+
warmup: Number of warmup iterations
109+
iters: Number of benchmark iterations
110+
jobs: ProfileJobs containing all jobs to run
111+
cache_root_dir: Root directory for cache storage
112+
sort_key: The metric name to use for sorting results
113+
lower_is_better: Whether lower values of the sort key are better
114+
115+
Returns:
116+
List of ProfileResult objects with benchmark results
103117
"""
104118

119+
# Initialize ProfileResult objects for each job
120+
results = []
121+
for job in jobs:
122+
job.cache_root_dir = cache_root_dir
123+
matmul_mac_count = tensor_to_matmul_mac_count(job.input_tensor_shapes)
124+
result = ProfileResult(
125+
index=job.index,
126+
main_metric=sort_key,
127+
lower_is_better=lower_is_better,
128+
kernel=job.kernel,
129+
kernel_kwargs=job.kernel_kwargs,
130+
compiler_flags=job.compiler_flags,
131+
cache_dir=job.cache_dir,
132+
matmul_mac_count=matmul_mac_count,
133+
)
134+
job.init_job_dir()
135+
results.append(result)
136+
105137
# Pre-initialize all input tensors once for all jobs with the same shapes
106138
jobs.initialize_input_tensors()
107139

examples/gemm.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import argparse
5-
import random
65

76
import numpy as np
87
from neuronpy.core.language import bfloat16
98

9+
from autotune.cache.visualize import plot_metric
1010
from autotune.core.benchmark import Benchmark
1111
from autotune.core.gemm_config import generate_gemm_configs
1212
from autotune.core.job import ProfileJobs
@@ -32,15 +32,15 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
3232
meta_kernel = ("/home/ec2-user/workplace/nki-autotune/autotune/core/gemm.py", "lhs_rhs_meta_gemm")
3333

3434
# for M, N, K in [(4096, 4096, 4096), (8192, 8192, 8192), (16384, 16384, 16384), (24576, 24576, 24576)]:
35-
for M, N, K in [(128, 512, 1279)]:
35+
for M, N, K in [(3757, 1647, 2539)]:
3636
if transposed_lhs:
3737
lhs_shape = (K, M)
3838
else:
3939
lhs_shape = (M, K)
4040
rhs_shape = (K, N)
4141
# Generate all possible configurations using the new function
4242
configs = generate_gemm_configs(M=M, N=N, K=K)
43-
configs = random.sample(configs, 1)
43+
# configs = random.sample(configs, 1)
4444
for config in configs:
4545
all_jobs.add_job(
4646
kernel=meta_kernel,
@@ -50,14 +50,14 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
5050
compiler_flags="--target=trn1 --auto-cast=none --internal-tensorizer-opt-level=nki",
5151
postprocessing=postprocessing,
5252
)
53-
# all_jobs.add_job(
54-
# kernel=baseline_kernel,
55-
# input_tensor_shapes=[lhs_shape, rhs_shape],
56-
# data_type=data_type,
57-
# kernel_kwargs={},
58-
# compiler_flags="--target=trn1 --auto-cast=none --model-type=transformer --tensorizer-options='--print-nki'",
59-
# postprocessing=postprocessing,
60-
# )
53+
all_jobs.add_job(
54+
kernel=baseline_kernel,
55+
input_tensor_shapes=[lhs_shape, rhs_shape],
56+
data_type=data_type,
57+
kernel_kwargs={},
58+
compiler_flags="--target=trn1 --auto-cast=none --model-type=transformer --tensorizer-options='--print-nki'",
59+
postprocessing=postprocessing,
60+
)
6161

6262

6363
if __name__ == "__main__":
@@ -80,11 +80,11 @@ def add_jobs(all_jobs: ProfileJobs, transposed_lhs: bool = False):
8080
tuner = Benchmark(jobs=all_jobs, cache_root_dir=args.cache_dir)
8181
tuner()
8282

83-
# if args.mode == "lhsT_rhs" or args.mode == "both":
84-
# kernel_names = ["lhsT_rhs_gemm_np", "lhsT_rhs_meta_gemm"]
85-
# plot_metric(args.cache_dir, "min_ms", kernel_names)
86-
# plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
87-
# if args.mode == "lhs_rhs" or args.mode == "both":
88-
# kernel_names = ["lhs_rhs_gemm_np", "lhs_rhs_meta_gemm"]
89-
# plot_metric(args.cache_dir, "min_ms", kernel_names)
90-
# plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
83+
if args.mode == "lhsT_rhs" or args.mode == "both":
84+
kernel_names = ["lhsT_rhs_gemm_np", "lhsT_rhs_meta_gemm"]
85+
plot_metric(args.cache_dir, "min_ms", kernel_names)
86+
plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)
87+
if args.mode == "lhs_rhs" or args.mode == "both":
88+
kernel_names = ["lhs_rhs_gemm_np", "lhs_rhs_meta_gemm"]
89+
plot_metric(args.cache_dir, "min_ms", kernel_names)
90+
plot_metric(args.cache_dir, "mfu_estimated_percent", kernel_names)

0 commit comments

Comments
 (0)