
Commit 1a9e6f8

Merge pull request #20 from spcl/numba-validation-fixes
Various fixes
2 parents 0bc108b + 4eba4df commit 1a9e6f8

6 files changed: +42 −25 lines

frameworks.md

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,8 @@ The base `Framework` class (found in [`npbench/infrastructure/framework.py`](npb
 - impl_files: Returns a list of the framework's implementation files for the input benchmark. Each element in the list is a tuple of the implementation filename and a description (e.g. `default` or `nopython-parallel`).
 - implementations: Returns a list of the framework's implementations for the input benchmark. Each element in the list is a tuple of the implementation method and a description (as above).
 - args: Returns a list with the names of the input arguments for running the input implementation of the input benchmark.
-- out_args: Returns a list with the input arguments for running the input implementation of the input benchmark **and** have to be copied(for example, because they may be modified during benchmark execution).
+- mutable_args: Returns a list with the input arguments for running the input implementation of the input benchmark **and** have to be copied(for example, because they may be modified during benchmark execution).
+- inout_args: Returns a list with the input arguments that are also output, i.e., they must be validated.
 - arg_str: Returns the argument-string needed to call the input implementation of the input benchmark.
 - out_arg_str: Returns the argument-string with the input arguments that must be copied.
 - setup_str: Returns the setup-string of the code that should be executed for, e.g., copying data, before executing the benchmark implementation.
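To make the division of labor between these lists concrete, here is a small self-contained sketch of how they interact when a benchmark runs. The benchmark fields, the stand-in kernel, and the argument names are invented for illustration and do not correspond to a real NPBench benchmark.

import numpy as np

# Hypothetical benchmark description; the field names mirror bench.info,
# but the benchmark itself is invented for illustration.
info = {
    "input_args": ["A", "alpha"],   # args(): names passed to the implementation
    "array_args": ["A"],            # mutable_args(): copied before each timed run
    "output_args": ["A"],           # inout_args(): compared against the reference
}

def scale(A, alpha):                # stand-in kernel that modifies A in place
    A *= alpha

orig = {"A": np.ones(4), "alpha": 2.0}

# Setup: copy only the mutable (array) arguments, keep scalars as-is.
work = {k: (v.copy() if k in info["array_args"] else v) for k, v in orig.items()}
scale(*[work[a] for a in info["input_args"]])

# Validation: only the in/out arguments are collected and checked.
print([work[a] for a in info["output_args"]])  # [array([2., 2., 2., 2.])]
print(orig["A"])                               # original data untouched: [1. 1. 1. 1.]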

npbench/benchmarks/polybench/cholesky/cholesky_numba_npr.py renamed to npbench/benchmarks/polybench/cholesky/cholesky_numba_np.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ def kernel(A):
 
     A[0, 0] = np.sqrt(A[0, 0])
     for i in range(1, A.shape[0]):
-        for j in nb.prange(i):
+        for j in range(i):
             A[i, j] -= np.dot(A[i, :j], A[j, :j])
             A[i, j] /= A[j, j]
         A[i, i] -= np.dot(A[i, :i], A[i, :i])
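The rename and the prange-to-range change go together: iteration j of this inner loop reads A[i, :j], which earlier iterations of the same loop have just written, so the loop carries a dependence and must not run in parallel. Below is a standalone sketch of the corrected kernel; the Numba decorator and the self-check are assumptions added for illustration, and the lines past the shown hunk follow the standard Cholesky recurrence rather than being quoted from the file.

import numba as nb
import numpy as np

# Sketch of the corrected kernel; decorator settings are assumed, not quoted.
@nb.jit(nopython=True, fastmath=True)
def kernel(A):
    A[0, 0] = np.sqrt(A[0, 0])
    for i in range(1, A.shape[0]):
        # Loop-carried dependence: A[i, :j] contains values written by
        # earlier j-iterations, so this must stay a sequential range.
        for j in range(i):
            A[i, j] -= np.dot(A[i, :j], A[j, :j])
            A[i, j] /= A[j, j]
        A[i, i] -= np.dot(A[i, :i], A[i, :i])
        A[i, i] = np.sqrt(A[i, i])

# Quick check against NumPy on a small symmetric positive-definite matrix.
N = 8
rng = np.random.default_rng(0)
B = rng.random((N, N))
A = B @ B.T + N * np.eye(N)
L = np.linalg.cholesky(A)
kernel(A)
print(np.allclose(np.tril(A), L))  # True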

npbench/infrastructure/dace_framework.py

Lines changed: 13 additions & 12 deletions
@@ -185,14 +185,14 @@ def parallelize(sdfg):
         try:
 
             def autoopt(sdfg, device, symbols): #, nofuse):
-                # Mark arrays as on the GPU
-                if device == dtypes.DeviceType.GPU:
-                    for k, v in sdfg.arrays.items():
-                        if not v.transient and type(v) == dace.data.Array:
-                            v.storage = dace.dtypes.StorageType.GPU_Global
+                # # Mark arrays as on the GPU
+                # if device == dtypes.DeviceType.GPU:
+                #     for k, v in sdfg.arrays.items():
+                #         if not v.transient and type(v) == dace.data.Array:
+                #             v.storage = dace.dtypes.StorageType.GPU_Global
 
                 # Auto-optimize SDFG
-                opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols)
+                opt.auto_optimize(auto_opt_sdfg, device, symbols=symbols, use_gpu_storage=True)
 
             auto_opt_sdfg = copy.deepcopy(strict_sdfg)
             auto_opt_sdfg._name = 'auto_opt'
@@ -229,9 +229,10 @@ def vectorize(sdfg, vec_len=None):
             dace.Config.set('library', 'blas', 'default_implementation', value='cuBLAS')
 
         def copy_to_gpu(sdfg):
-            for k, v in sdfg.arrays.items():
-                if not v.transient and isinstance(v, dace.data.Array):
-                    v.storage = dace.dtypes.StorageType.GPU_Global
+            opt.apply_gpu_storage(sdfg)
+            # for k, v in sdfg.arrays.items():
+            #     if not v.transient and isinstance(v, dace.data.Array):
+            #         v.storage = dace.dtypes.StorageType.GPU_Global
 
         if self.info["arch"] == "gpu":
             import cupy as cp
@@ -242,9 +243,9 @@ def copy_to_gpu(sdfg):
             fe_time = t
         if sdfg._name != 'auto_opt':
             device = dtypes.DeviceType.GPU if self.info["arch"] == "gpu" else dtypes.DeviceType.CPU
-            if self.info["arch"] == "cpu":
-                # GPUTransform will set GPU schedules by itself
-                opt.set_fast_implementations(sdfg, device)
+            # if self.info["arch"] == "cpu":
+            #     # GPUTransform will set GPU schedules by itself
+            opt.set_fast_implementations(sdfg, device)
         if self.info["arch"] == "gpu":
             if sdfg._name in ['strict', 'parallel', 'fusion']:
                 _, gpu_time1 = util.benchmark("copy_to_gpu(sdfg)",
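The replaced logic leans on DaCe's auto-optimizer to place arrays on the GPU instead of setting StorageType.GPU_Global by hand. The sketch below shows the two entry points the new code uses; it assumes a DaCe version that exposes opt.apply_gpu_storage and the use_gpu_storage flag (which this change relies on), and the tiny dace.program is invented for illustration. The GPU lines are commented out so the sketch also runs on a CPU-only machine.

import numpy as np
import dace
from dace import dtypes
from dace.transformation.auto import auto_optimize as opt

@dace.program
def scal(A: dace.float64[100], alpha: dace.float64):
    A[:] = alpha * A

sdfg = scal.to_sdfg()

# CPU path: plain auto-optimization, as before.
opt.auto_optimize(sdfg, dtypes.DeviceType.CPU)

# GPU path (requires CUDA): let DaCe move non-transient arrays itself.
# gpu_sdfg = scal.to_sdfg()
# opt.apply_gpu_storage(gpu_sdfg)                      # what copy_to_gpu() now calls
# opt.auto_optimize(gpu_sdfg, dtypes.DeviceType.GPU,
#                   use_gpu_storage=True)              # what autoopt() now passes

A = np.ones(100)
sdfg(A=A, alpha=2.0)
print(A[:4])  # [2. 2. 2. 2.]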

npbench/infrastructure/framework.py

Lines changed: 12 additions & 4 deletions
@@ -91,17 +91,25 @@ def args(self, bench: Benchmark, impl: Callable = None):
             for a in bench.info["input_args"]
         ]
 
-    def out_args(self, bench: Benchmark, impl: Callable = None):
+    def mutable_args(self, bench: Benchmark, impl: Callable = None):
         """ Generates the input/output arguments that should be copied during
         the setup.
         :param bench: A benchmark.
         :param impl: A benchmark implementation.
         """
 
         return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["array_args"]]
+
 
-    # def params(self, bench: Benchmark, impl: Callable = None):
-    #     return list(bench.info["input_params"])
+    def inout_args(self, bench: Benchmark, impl: Callable = None):
+        """ Generates the input/output arguments that should be checked during
+        validation.
+        :param bench: A benchmark.
+        :param impl: A benchmark implementation.
+        """
+
+        return ["__npb_{pr}_{a}".format(pr=self.info["prefix"], a=a) for a in bench.info["output_args"]]
+
 
     def arg_str(self, bench: Benchmark, impl: Callable = None):
         """ Generates the argument-string that should be used for calling
@@ -119,7 +127,7 @@ def out_arg_str(self, bench: Benchmark, impl: Callable = None):
         :param impl: A benchmark implementation.
         """
 
-        output_args = self.out_args(bench, impl)
+        output_args = self.mutable_args(bench, impl)
         return ", ".join(output_args)
 
     def setup_str(self, bench: Benchmark, impl: Callable = None):
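For a concrete sense of what the renamed and the added method return, the snippet below replays the two list comprehensions outside the class. The prefix and the argument names are hypothetical; in NPBench they come from the framework and benchmark descriptions.

# Hypothetical framework prefix and benchmark fields, for illustration only.
prefix = "nb"
info = {"array_args": ["path", "tmp"], "output_args": ["path"]}

mutable_args = ["__npb_{pr}_{a}".format(pr=prefix, a=a) for a in info["array_args"]]
inout_args = ["__npb_{pr}_{a}".format(pr=prefix, a=a) for a in info["output_args"]]

print(mutable_args)  # ['__npb_nb_path', '__npb_nb_tmp'] -> copied by the setup code
print(inout_args)    # ['__npb_nb_path'] -> read back and validated after the run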

npbench/infrastructure/test.py

Lines changed: 5 additions & 2 deletions
@@ -43,8 +43,11 @@ def _execute(self, frmwrk: Framework, impl: Callable, impl_name: str, mode: str,
             out = [out]
         else:
             out = []
-        if "out_args" in self.bench.info.keys():
-            out += [ldict[a] for a in self.frmwrk.args(self.bench)]
+        if "output_args" in self.bench.info.keys():
+            num_return_args = len(out)
+            num_output_args = len(self.bench.info["output_args"])
+            out += [ldict[a] for a in frmwrk.inout_args(self.bench)]
+            assert len(out) == num_return_args + num_output_args, "Number of output arguments does not match."
         return out, timelist
 
     def run(self, preset: str, validate: bool, repeat: int, timeout: float = 200.0, ignore_errors: bool = True):
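The old code looked for an "out_args" key and then appended every framework argument rather than only the outputs; the fix checks the output_args field instead, collects only the in/out arguments, and asserts that the count matches. A small stand-alone replay of that logic, with invented names and data:

import numpy as np

# Invented stand-ins: one value returned by the implementation plus one
# in/out array living in the exec() namespace after the run.
ret_values = [np.float64(3.0)]                  # what the call returned
ldict = {"__npb_nb_path": np.zeros((4, 4))}     # execution namespace
output_args = ["path"]                          # bench.info["output_args"]
inout_names = ["__npb_nb_" + a for a in output_args]   # frmwrk.inout_args(bench)

out = list(ret_values)
num_return_args = len(out)
num_output_args = len(output_args)
out += [ldict[a] for a in inout_names]
assert len(out) == num_return_args + num_output_args, \
    "Number of output arguments does not match."
print(len(out))  # 2: the returned scalar plus the in/out array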

npbench/infrastructure/utilities.py

Lines changed: 9 additions & 5 deletions
@@ -134,16 +134,20 @@ def inner(_it, _timer{init}):
 
 def benchmark(stmt, setup="pass", out_text="", repeat=1, context={}, output=None, verbose=True):
 
-    timeit.template = timeit_tmpl.format(init='{init}', setup='{setup}', stmt='{stmt}', output=output)
-
     ldict = {**context}
-    output = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
-    res = output[0][1]
-    raw_time_list = [a for a, _ in output]
+    raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1, globals=ldict)
     raw_time = np.median(raw_time_list)
     ms_time = time_to_ms(raw_time)
     if verbose:
         print("{}: {}ms".format(out_text, ms_time))
+
+    if output is not None:
+        exec(setup, context)
+        exec(stmt, context)
+        res = context[output]
+    else:
+        res = None
+
     return res, raw_time_list
 
 
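The helper no longer patches timeit.template to smuggle the result out of the timed run: timeit.repeat supplies the raw times, and when a named output is requested the statement is executed once more, untimed, in the caller's namespace. A distilled, self-contained version of that pattern (the helper name and the matrix-multiply statement are illustrative):

import timeit
import numpy as np

def benchmark(stmt, setup="pass", repeat=1, context={}, output=None):
    """Distilled version of the updated helper (names are illustrative)."""
    ldict = {**context}
    raw_time_list = timeit.repeat(stmt, setup=setup, repeat=repeat, number=1,
                                  globals=ldict)
    print("median: {:.3f} ms".format(1000 * np.median(raw_time_list)))
    if output is not None:
        exec(setup, context)      # one extra, untimed execution...
        exec(stmt, context)
        res = context[output]     # ...only to fetch the named result
    else:
        res = None
    return res, raw_time_list

A = np.random.rand(256, 256)
res, times = benchmark("out = A @ A", repeat=5, context={"A": A}, output="out")
print(res.shape)  # (256, 256)

Note that the returned result comes from this extra execution, not from one of the timed runs, which keeps the timing loop free of result-capturing overhead.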