Skip to content

Commit a9b2746

Browse files
PyOpenCL target: Add, test overflow of large argument counts into SVM struct
Co-authored-by: Matthias Diener <[email protected]>
1 parent 7bea826 commit a9b2746

File tree

9 files changed

+426
-79
lines changed

9 files changed

+426
-79
lines changed

loopy/codegen/result.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,16 @@
2020
THE SOFTWARE.
2121
"""
2222

23-
from typing import Any, Sequence, Mapping, Tuple, Optional
23+
from typing import Any, Sequence, Mapping, Tuple, Optional, TYPE_CHECKING
2424
from dataclasses import dataclass, replace
2525

2626
import islpy as isl
2727

2828

29+
if TYPE_CHECKING:
30+
from loopy.codegen import CodeGenerationState
31+
32+
2933
def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]:
3034
seen_preamble_tags = set()
3135
dedup_preambles = []
@@ -170,7 +174,8 @@ def all_code(self):
170174
+ "\n\n"
171175
+ str(self.host_program.ast))
172176

173-
def current_program(self, codegen_state):
177+
def current_program(
178+
self, codegen_state: "CodeGenerationState") -> GeneratedProgram:
174179
if codegen_state.is_generating_device_code:
175180
if self.device_programs:
176181
result = self.device_programs[-1]
@@ -329,13 +334,23 @@ def generate_host_or_device_program(codegen_state, schedule_index):
329334

330335
cur_prog = codegen_result.current_program(codegen_state)
331336
body_ast = cur_prog.ast
332-
fdecl_ast = ast_builder.get_function_declaration(
337+
fdef_preambles, fdecl_ast = ast_builder.get_function_declaration(
333338
codegen_state, codegen_result, schedule_index)
334339

335340
fdef_ast = ast_builder.get_function_definition(
336341
codegen_state, codegen_result,
337342
schedule_index, fdecl_ast, body_ast)
338343

344+
if fdef_preambles:
345+
if codegen_state.is_generating_device_code:
346+
codegen_result = codegen_result.copy(
347+
device_preambles=(
348+
codegen_result.device_preambles + tuple(fdef_preambles)))
349+
else:
350+
codegen_result = codegen_result.copy(
351+
host_preambles=(
352+
codegen_result.host_preambles + tuple(fdef_preambles)))
353+
339354
codegen_result = codegen_result.with_new_program(
340355
codegen_state,
341356
cur_prog.copy(

loopy/target/__init__.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ def get_function_definition(
203203
def get_function_declaration(
204204
self, codegen_state: CodeGenerationState,
205205
codegen_result: CodeGenerationResult, schedule_index: int
206-
) -> ASTType:
206+
) -> Tuple[Sequence[Tuple[str, str]], ASTType]:
207+
"""Returns preambles and the AST for the function declaration."""
207208
raise NotImplementedError
208209

209210
def generate_top_of_body(
@@ -289,14 +290,16 @@ def __str__(self):
289290
return ""
290291

291292

292-
class DummyHostASTBuilder(ASTBuilderBase):
293+
class DummyHostASTBuilder(ASTBuilderBase[None]):
293294
def get_function_definition(self, codegen_state, codegen_result,
294295
schedule_index, function_decl, function_body):
295296
return function_body
296297

297-
def get_function_declaration(self, codegen_state, codegen_result,
298-
schedule_index):
299-
return None
298+
def get_function_declaration(
299+
self, codegen_state, codegen_result,
300+
schedule_index,
301+
) -> Tuple[Sequence[Tuple[str, str]], None]:
302+
return [], None
300303

301304
def get_temporary_decls(self, codegen_state, schedule_index):
302305
return []

loopy/target/c/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
THE SOFTWARE.
2424
"""
2525

26-
from typing import cast, Tuple, Optional
26+
from typing import cast, Tuple, Optional, Sequence
2727
import re
2828

2929
import numpy as np # noqa
@@ -817,8 +817,10 @@ def get_function_definition(
817817
else:
818818
return Collection(result+[Line(), fbody])
819819

820-
def get_function_declaration(self, codegen_state: CodeGenerationState,
821-
codegen_result: CodeGenerationResult, schedule_index: int) -> Generable:
820+
def get_function_declaration(
821+
self, codegen_state: CodeGenerationState,
822+
codegen_result: CodeGenerationResult, schedule_index: int
823+
) -> Tuple[Sequence[Tuple[str, str]], Generable]:
822824
kernel = codegen_state.kernel
823825

824826
assert codegen_state.kernel.linearization is not None
@@ -846,7 +848,7 @@ def get_function_declaration(self, codegen_state: CodeGenerationState,
846848
passed_names = [arg.name for arg in kernel.args]
847849
written_names = kernel.get_written_variables()
848850

849-
return FunctionDeclarationWrapper(
851+
return [], FunctionDeclarationWrapper(
850852
FunctionDeclaration(
851853
name,
852854
[self.arg_to_cgen_declarator(

loopy/target/cuda.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@
2323
THE SOFTWARE.
2424
"""
2525

26+
from typing import Tuple, Sequence
27+
2628
import numpy as np
2729
from pymbolic import var
2830
from pytools import memoize_method
29-
from cgen import Declarator, Const
31+
from cgen import Declarator, Const, Generable
3032

3133
from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
3234
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
@@ -35,6 +37,8 @@
3537
from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
3638
from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg, ArrayArg
3739
from loopy.kernel.function_interface import ScalarCallable
40+
from loopy.codegen.result import CodeGenerationResult
41+
from loopy.codegen import CodeGenerationState
3842

3943

4044
# {{{ vector types
@@ -320,9 +324,11 @@ def known_callables(self):
320324

321325
# {{{ top-level codegen
322326

323-
def get_function_declaration(self, codegen_state, codegen_result,
324-
schedule_index):
325-
fdecl = super().get_function_declaration(
327+
def get_function_declaration(
328+
self, codegen_state: CodeGenerationState,
329+
codegen_result: CodeGenerationResult, schedule_index: int
330+
) -> Tuple[Sequence[Tuple[str, str]], Generable]:
331+
preambles, fdecl = super().get_function_declaration(
326332
codegen_state, codegen_result, schedule_index)
327333

328334
from loopy.target.c import FunctionDeclarationWrapper
@@ -352,7 +358,7 @@ def get_function_declaration(self, codegen_state, codegen_result,
352358

353359
fdecl = CudaLaunchBounds(nthreads, fdecl)
354360

355-
return FunctionDeclarationWrapper(fdecl)
361+
return preambles, FunctionDeclarationWrapper(fdecl)
356362

357363
def preamble_generators(self):
358364

loopy/target/ispc.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"""
2525

2626

27-
from typing import cast, Tuple
27+
from typing import cast, Tuple, Sequence
2828

2929
import numpy as np # noqa
3030
import pymbolic.primitives as p
@@ -202,8 +202,10 @@ def get_dtype_registry(self):
202202
class ISPCASTBuilder(CFamilyASTBuilder):
203203
# {{{ top-level codegen
204204

205-
def get_function_declaration(self, codegen_state: CodeGenerationState,
206-
codegen_result: CodeGenerationResult, schedule_index: int) -> Generable:
205+
def get_function_declaration(
206+
self, codegen_state: CodeGenerationState,
207+
codegen_result: CodeGenerationResult, schedule_index: int
208+
) -> Tuple[Sequence[Tuple[str, str]], Generable]:
207209
name = codegen_result.current_program(codegen_state).name
208210
kernel = codegen_state.kernel
209211

@@ -243,7 +245,7 @@ def get_function_declaration(self, codegen_state: CodeGenerationState,
243245
arg_decls))
244246

245247
from loopy.target.c import FunctionDeclarationWrapper
246-
return FunctionDeclarationWrapper(result)
248+
return [], FunctionDeclarationWrapper(result)
247249

248250
def get_kernel_call(self, codegen_state: CodeGenerationState,
249251
subkernel_name: str,

loopy/target/opencl.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@
2323
THE SOFTWARE.
2424
"""
2525

26+
from typing import Tuple, Sequence
27+
2628
import numpy as np
2729
from pymbolic import var
2830
from pytools import memoize_method
29-
from cgen import Declarator
31+
from cgen import Declarator, Generable
3032

3133
from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
3234
from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
@@ -36,6 +38,8 @@
3638
from loopy.kernel.array import VectorArrayDimTag, FixedStrideArrayDimTag, ArrayBase
3739
from loopy.kernel.data import AddressSpace, ImageArg, ConstantArg
3840
from loopy.kernel.function_interface import ScalarCallable
41+
from loopy.codegen import CodeGenerationState
42+
from loopy.codegen.result import CodeGenerationResult
3943

4044

4145
# {{{ dtype registry wrappers
@@ -624,20 +628,26 @@ def preamble_generators(self):
624628

625629
# {{{ top-level codegen
626630

627-
def get_function_declaration(self, codegen_state, codegen_result,
628-
schedule_index):
629-
fdecl = super().get_function_declaration(
631+
def get_function_declaration(
632+
self, codegen_state: CodeGenerationState,
633+
codegen_result: CodeGenerationResult, schedule_index: int
634+
) -> Tuple[Sequence[Tuple[str, str]], Generable]:
635+
preambles, fdecl = super().get_function_declaration(
630636
codegen_state, codegen_result, schedule_index)
631637

632638
from loopy.target.c import FunctionDeclarationWrapper
633639
assert isinstance(fdecl, FunctionDeclarationWrapper)
634640
if not codegen_state.is_entrypoint:
635641
# auxiliary kernels need not mention OpenCL-specific qualifiers
636642
# for a function's signature
637-
return fdecl
643+
return preambles, fdecl
638644

639-
fdecl = fdecl.subdecl
645+
return preambles, FunctionDeclarationWrapper(
646+
self._wrap_kernel_decl(codegen_state, schedule_index, fdecl.subdecl))
640647

648+
def _wrap_kernel_decl(
649+
self, codegen_state: CodeGenerationState, schedule_index: int,
650+
fdecl: Declarator) -> Declarator:
641651
from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
642652
fdecl = CLKernel(fdecl)
643653

@@ -654,7 +664,7 @@ def get_function_declaration(self, codegen_state, codegen_result,
654664

655665
fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
656666

657-
return FunctionDeclarationWrapper(fdecl)
667+
return fdecl
658668

659669
def generate_top_of_body(self, codegen_state):
660670
from loopy.kernel.data import ImageArg

0 commit comments

Comments
 (0)