From ae29f74ad64c7b7229f6c504ffb29ab698324b1c Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Mon, 14 Oct 2024 09:08:22 -0700 Subject: [PATCH 01/16] [SYCL] Add SYCL Module splitting. --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 224 +++++ .../include/llvm/Transforms/Utils/SYCLUtils.h | 137 +++ llvm/lib/Transforms/Utils/CMakeLists.txt | 2 + llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 935 ++++++++++++++++++ llvm/lib/Transforms/Utils/SYCLUtils.cpp | 263 +++++ .../device-code-split/amd-kernel-split.ll | 17 + .../device-code-split/auto-module-split-1.ll | 121 +++ .../device-code-split/auto-module-split-2.ll | 127 +++ .../device-code-split/auto-module-split-3.ll | 112 +++ .../auto-module-split-func-ptr.ll | 50 + .../device-code-split/basic-module-split.ll | 122 +++ .../complex-indirect-call-chain.ll | 88 ++ .../indirectly-callable-auto-split.ll | 45 + .../indirectly-callable-per-kernel-split.ll | 53 + .../one-kernel-per-module.ll | 133 +++ .../device-code-split/per-aspect-split-1.ll | 133 +++ .../device-code-split/per-aspect-split-2.ll | 59 ++ .../device-code-split/per-aspect-split-3.ll | 94 ++ .../per-reqd-sub-group-size-split-1.ll | 133 +++ .../per-reqd-sub-group-size-split-2.ll | 60 ++ .../per-reqd-wg-size-split-1.ll | 133 +++ .../per-reqd-wg-size-split-2.ll | 59 ++ .../split-with-kernel-declarations.ll | 74 ++ llvm/tools/llvm-split/llvm-split.cpp | 72 ++ 24 files changed, 3246 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h create mode 100644 llvm/include/llvm/Transforms/Utils/SYCLUtils.h create mode 100644 llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp create mode 100644 llvm/lib/Transforms/Utils/SYCLUtils.cpp create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll create mode 100644 
llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h new file mode 100644 index 0000000000000..9b9b237f3b94b --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -0,0 +1,224 @@ +//===-------- SYCLModuleSplit.h - module split ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Functionality to split a module into call graphs. A callgraph here is a set +// of entry points with all functions reachable from them via a call. The result +// of the split is new modules containing corresponding callgraph. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SYCL_MODULE_SPLIT_H +#define LLVM_SYCL_MODULE_SPLIT_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Error.h" + +#include +#include +#include +#include + +// TODO(maksimsab): +// * check GenXSPIRVWriterAdaptor comments +// * Maybe fix doxygen comments. + +namespace llvm { + +class Function; +class Module; + +enum IRSplitMode { + SPLIT_PER_TU, // one module per translation unit + SPLIT_PER_KERNEL, // one module per kernel + SPLIT_AUTO, // automatically select split mode + SPLIT_NONE // no splitting +}; + +// \returns IRSplitMode value if \p S is recognized. Otherwise, std::nullopt is +// returned. +std::optional convertStringToSplitMode(StringRef S); + +// A vector that contains all entry point functions in a split module. +using EntryPointSet = SetVector; + +// enum class SyclEsimdSplitStatus { SYCL_ONLY, ESIMD_ONLY, SYCL_AND_ESIMD }; + +// Describes scope covered by each entry in the module-entry points map +// populated by the groupEntryPointsByScope function. +enum EntryPointsGroupScope { + Scope_PerKernel, // one entry per kernel + Scope_PerModule, // one entry per module + Scope_Global // single entry in the map for all kernels +}; + +// Represents a named group of device code entry points - kernels and +// SYCL_EXTERNAL functions. 
+struct EntryPointGroup { + // Properties of an entry point (EP) group + struct Properties { + // Scope represented by EPs in a group + EntryPointsGroupScope Scope = Scope_Global; + }; + + std::string GroupId; + EntryPointSet Functions; + Properties Props; + + EntryPointGroup(StringRef GroupId = "") : GroupId(GroupId) {} + EntryPointGroup(StringRef GroupId, EntryPointSet Functions) + : GroupId(GroupId), Functions(std::move(Functions)) {} + EntryPointGroup(StringRef GroupId, EntryPointSet Functions, + const Properties &Props) + : GroupId(GroupId), Functions(std::move(Functions)), Props(Props) {} + + void saveNames(std::vector &Dest) const; + void rebuildFromNames(const std::vector &Names, const Module &M); + void rebuild(const Module &M); +}; + +using EntryPointGroupVec = std::vector; + +// Annotates an llvm::Module with information necessary to perform and track +// result of device code (llvm::Module instances) splitting: +// - entry points of the module determined e.g. by a module splitter, as well +// as information about entry point origin (e.g. result of a scoped split) +// - its properties, such as whether it has specialization constants uses +// It also provides convenience functions for entry point set transformation +// between llvm::Function objects and string representations. 
+class ModuleDesc { + std::unique_ptr M; + EntryPointGroup EntryPoints; + +public: + ModuleDesc(std::unique_ptr M) : M(std::move(M)) {} + + ModuleDesc(std::unique_ptr M, EntryPointGroup EntryPoints) + : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {} + + ModuleDesc(std::unique_ptr M, const std::vector &Names) + : M(std::move(M)) { + rebuildEntryPoints(Names); + } + + const EntryPointSet &entries() const { return EntryPoints.Functions; } + const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; } + EntryPointSet &entries() { return EntryPoints.Functions; } + Module &getModule() { return *M; } + const Module &getModule() const { return *M; } + std::unique_ptr releaseModulePtr() { return std::move(M); } + + // Sometimes, during module transformations, some Function objects within the + // module are replaced with different Function objects with the same name. + // Entry points need to be updated to include the replacement function. + // save/rebuild pair of functions is provided to automate this process. + void saveEntryPointNames(std::vector &Dest) { + EntryPoints.saveNames(Dest); + } + + void rebuildEntryPoints(const std::vector &Names) { + EntryPoints.rebuildFromNames(Names, getModule()); + } + + void rebuildEntryPoints(const Module &M) { EntryPoints.rebuild(M); } + + void rebuildEntryPoints() { EntryPoints.rebuild(*M); } + + // Cleans up module IR - removes dead globals, debug info etc. + void cleanup(); + + ModuleDesc clone() const; + + std::string makeSymbolTable() const; + + void dump(raw_ostream &OS) const; +}; + +// Module split support interface. +// It gets a module (in a form of module descriptor, to get additional info) and +// a collection of entry points groups. Each group specifies subset entry points +// from input module that should be included in a split module. 
+class ModuleSplitterBase { +protected: + ModuleDesc Input; + EntryPointGroupVec Groups; + +protected: + EntryPointGroup nextGroup() { + assert(hasMoreSplits() && "Reached end of entry point groups list."); + EntryPointGroup Res = std::move(Groups.back()); + Groups.pop_back(); + return Res; + } + + Module &getInputModule() { return Input.getModule(); } + + std::unique_ptr releaseInputModule() { + return Input.releaseModulePtr(); + } + +public: + ModuleSplitterBase(ModuleDesc MD, EntryPointGroupVec GroupVec) + : Input(std::move(MD)), Groups(std::move(GroupVec)) { + assert(!Groups.empty() && "Entry points groups collection is empty!"); + } + + virtual ~ModuleSplitterBase() = default; + + // Gets next subsequence of entry points in an input module and provides split + // submodule containing these entry points and their dependencies. + virtual ModuleDesc nextSplit() = 0; + + // Returns a number of remaining modules, which can be split out using this + // splitter. The value is reduced by 1 each time nextSplit is called. + size_t remainingSplits() const { return Groups.size(); } + + // Check that there are still submodules to split. 
+ bool hasMoreSplits() const { return remainingSplits() > 0; } +}; + +std::unique_ptr +getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, + bool EmitOnlyKernelsAsEntryPoints); + +void dumpEntryPoints(raw_ostream &OS, const EntryPointSet &C, + std::string_view Msg = ""); +void dumpEntryPoints(raw_ostream &OS, const Module &M, + bool OnlyKernelsAreEntryPoints = false, + std::string_view Msg = ""); + +struct SYCLSplitModule { + std::string ModuleFilePath; + std::string Symbols; + + SYCLSplitModule() = default; + SYCLSplitModule(const SYCLSplitModule &) = default; + SYCLSplitModule &operator=(const SYCLSplitModule &) = default; + SYCLSplitModule(SYCLSplitModule &&) = default; + SYCLSplitModule &operator=(SYCLSplitModule &&) = default; + + SYCLSplitModule(std::string_view File, std::string Symbols) + : ModuleFilePath(File), Symbols(std::move(Symbols)) {} +}; + +struct ModuleSplitterSettings { + IRSplitMode Mode; + bool OutputAssembly = false; // Bitcode or LLVM IR. + StringRef OutputPrefix; +}; + +/// Parses the string table. +Expected> +parseSYCLSplitModulesFromFile(StringRef File); + +/// Splits the given module \p M according to the given \p Settings. +Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); + +} // namespace llvm + +#endif // LLVM_SYCL_MODULE_SPLIT_H diff --git a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h new file mode 100644 index 0000000000000..45ddc1734f922 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h @@ -0,0 +1,137 @@ +//===------------ SYCLUtils.h - SYCL utility functions --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Utility functions for SYCL. 
+//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_UTILS_SYCLUTILS_H +#define LLVM_TRANSFORMS_UTILS_SYCLUTILS_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" + +#include +#include +#include + +namespace llvm { + +constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id"; +constexpr char ATTR_SYCL_OPTLEVEL[] = "sycl-optlevel"; + +using CallGraphNodeAction = ::std::function; +using CallGraphFunctionFilter = + std::function; + +// Traverses call graph starting from given function up the call chain applying +// given action to each function met on the way. If \c ErrorOnNonCallUse +// parameter is true, then no function uses are allowed except calls. +// Otherwise, any function where use of the current one happened is added to the +// call graph as if the use was a call. +// The 'functionFilter' parameter is a callback function that can be used to +// control which functions will be added to a call graph. +// +// The callback is invoked whenever a function being traversed is used +// by some instruction which is not a call to this function (e.g. storing +// function pointer to memory) - the first parameter is the using instruction, +// the second - the function being traversed. The parent function of the +// instruction is added to the call graph depending on whether the callback +// returns 'true' (added) or 'false' (not added). +// Functions which are part of the visited set ('Visited' parameter) are not +// traversed. 
+ +void traverseCallgraphUp( + llvm::Function *F, CallGraphNodeAction NodeF, + SmallPtrSetImpl &Visited, bool ErrorOnNonCallUse, + const CallGraphFunctionFilter &functionFilter = + [](const Instruction *, const Function *) { return true; }); + +template +void traverseCallgraphUp( + Function *F, CallGraphNodeActionF ActionF, + SmallPtrSetImpl &Visited, bool ErrorOnNonCallUse, + const CallGraphFunctionFilter &functionFilter = + [](const Instruction *, const Function *) { return true; }) { + traverseCallgraphUp(F, CallGraphNodeAction(ActionF), Visited, + ErrorOnNonCallUse, functionFilter); +} + +template +void traverseCallgraphUp( + Function *F, CallGraphNodeActionF ActionF, bool ErrorOnNonCallUse = true, + const CallGraphFunctionFilter &functionFilter = + [](const Instruction *, const Function *) { return true; }) { + SmallPtrSet Visited; + traverseCallgraphUp(F, CallGraphNodeAction(ActionF), Visited, + ErrorOnNonCallUse, functionFilter); +} + +/// Tells if this value is a bit cast or address space cast. +bool isCast(const Value *V); + +/// Tells if this value is a GEP instructions with all zero indices. +bool isZeroGEP(const Value *V); + +/// Climbs up the use-def chain of given value until a value which is not a +/// bit cast or address space cast is met. +const Value *stripCasts(const Value *V); +Value *stripCasts(Value *V); + +/// Climbs up the use-def chain of given value until a value is met which is +/// neither of: +/// - bit cast +/// - address space cast +/// - GEP instruction with all zero indices +const Value *stripCastsAndZeroGEPs(const Value *V); +Value *stripCastsAndZeroGEPs(Value *V); + +/// Collects uses of given value "looking through" casts. I.e. if a use is a +/// cast (chain), then uses of the result of the cast (chain) are collected. 
+void collectUsesLookThroughCasts(const Value *V, + SmallPtrSetImpl &Uses); + +/// Collects uses of given pointer-typed value "looking through" casts and GEPs +/// with all zero indices - those pointer transformation instructions which +/// don't change pointed-to value. E.g. if a use is a cast (chain), then uses of +/// the result of the cast (chain) are collected. +void collectUsesLookThroughCastsAndZeroGEPs(const Value *V, + SmallPtrSetImpl &Uses); + +void collectUsesLookThroughCasts(const Value *V, + SmallPtrSetImpl &Uses); + +void collectUsesLookThroughCastsAndZeroGEPs(const Value *V, + SmallPtrSetImpl &Uses); + +bool collectPossibleStoredVals( + Value *Addr, SmallPtrSetImpl &Vals, + std::function EscapesIfAddrIsArgOf = + [](const CallInst *) { return true; }); + +inline bool isSYCLExternalFunction(const Function *F) { + return F->hasFnAttribute(ATTR_SYCL_MODULE_ID); +} + +/// Removes the global variable "llvm.used" and returns true on success. +/// "llvm.used" is a global constant array containing references to kernels +/// available in the module and callable from host code. The elements of +/// the array are ConstantExpr bitcast to i8*. +/// The variable must be removed because a) it has done its job by the time +/// this function is called and b) the references to the kernels callable from +/// host must not have users. 
+bool removeSYCLKernelsConstRefArray(Module &M); + +using SYCLStringTable = std::vector>; + +void writeSYCLStringTable(const SYCLStringTable &Table, raw_ostream &OS); + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_UTILS_SYCLUTILS_H diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 36761cf356974..b221c9724b8f1 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -81,6 +81,8 @@ add_llvm_component_library(LLVMTransformUtils SizeOpts.cpp SplitModule.cpp StripNonLineTableDebugInfo.cpp + SYCLModuleSplit.cpp + SYCLUtils.cpp SymbolRewriter.cpp UnifyFunctionExitNodes.cpp UnifyLoopExits.cpp diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp new file mode 100644 index 0000000000000..83b0f55dba5e9 --- /dev/null +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -0,0 +1,935 @@ +//===-------- SYCLModuleSplitter.cpp - split a module into callgraphs -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// See comments in the header. 
+//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/SYCLModuleSplit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Bitcode/BitcodeWriterPass.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PassManagerImpl.h" +#include "llvm/IRPrinter/IRPrintingPasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/GlobalDCE.h" +#include "llvm/Transforms/IPO/Internalize.h" +#include "llvm/Transforms/IPO/StripDeadPrototypes.h" +#include "llvm/Transforms/IPO/StripSymbols.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SYCLUtils.h" + +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "sycl_module_split" + +namespace { +// Identifying name for global scope +constexpr char GLOBAL_SCOPE_NAME[] = ""; +constexpr char SYCL_SCOPE_NAME[] = ""; + +EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, + IRSplitMode Mode, + bool AutoSplitIsGlobalScope) { + switch (Mode) { + case SPLIT_PER_TU: + return Scope_PerModule; + + case SPLIT_PER_KERNEL: + return Scope_PerKernel; + + case SPLIT_AUTO: { + if (AutoSplitIsGlobalScope) + return Scope_Global; + + // At the moment, we assume that per-source split is the best way of + // splitting device code and can always be used except for cases handled + // above. 
+ return Scope_PerModule; + } + + case SPLIT_NONE: + return Scope_Global; + } + + llvm_unreachable("unsupported split mode"); +} + +// Return true if the function is a SPIRV or SYCL builtin, e.g. +// _Z28__spirv_GlobalInvocationId_xv +bool isSpirvSyclBuiltin(StringRef FName) { + if (!FName.consume_front("_Z")) + return false; + // Skip the mangled length digits (llvm::isDigit is safe for any char value) + FName = FName.drop_while([](char C) { return llvm::isDigit(C); }); + + return FName.starts_with("__spirv_") || FName.starts_with("__sycl_"); +} + +// Return true if the function name starts with "__builtin_" +bool isGenericBuiltin(StringRef FName) { + return FName.starts_with("__builtin_"); +} + +bool isKernel(const Function &F) { + return F.getCallingConv() == CallingConv::SPIR_KERNEL || + F.getCallingConv() == CallingConv::AMDGPU_KERNEL; +} + +bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { + // Skip declarations, if any: they should not be included into a vector of + // entry points groups or otherwise we will end up with incorrectly generated + // list of symbols. + if (F.isDeclaration()) + return false; + + // Kernels are always considered to be entry points + if (isKernel(F)) + return true; + + if (!EmitOnlyKernelsAsEntryPoints) { + // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute + // are also considered as entry points (except __spirv_*, __sycl_* and + // __builtin_* functions) + return llvm::isSYCLExternalFunction(&F) && + !isSpirvSyclBuiltin(F.getName()) && !isGenericBuiltin(F.getName()); + } + + // Even if we are emitting only kernels as entry points, virtual functions + // should still be treated as entry points, because they are going to be + // outlined into separate device images and linked in later. + return F.hasFnAttribute("indirectly-callable"); +} + +// Represents "dependency" or "use" graph of global objects (functions and +// global variables) in a module. 
It is used during device code split to +// understand which global variables and functions (other than entry points) +// should be included into a split module. +// +// Nodes of the graph represent LLVM's GlobalObjects, edges "A" -> "B" represent +// the fact that if "A" is included into a module, then "B" should be included +// as well. +// +// Examples of dependencies which are represented in this graph: +// - Function FA calls function FB +// - Function FA uses global variable GA +// - Global variable GA references (is initialized with) function FB +// - Function FA stores address of a function FB somewhere +// +// The following cases are treated as dependencies between global objects: +// 1. Global object A is used by a global object B in any way (store, +// bitcast, phi node, call, etc.): "B" -> "A" edge will be added to the +// graph; +// 2. function A performs an indirect call of a function with signature S and +// there is a function B with signature S marked with "referenced-indirectly" +// attribute. "A" -> "B" edge will be added to the graph; +class DependencyGraph { +public: + using GlobalSet = SmallPtrSet; + + DependencyGraph(const Module &M) { + // Group functions by their signature to handle case (2) described above + DenseMap + FuncTypeToFuncsMap; + for (const auto &F : M.functions()) { + // Kernels can't be called (either directly or indirectly) in SYCL + if (isKernel(F)) + continue; + + // Only functions which are marked with "referenced-indirectly" attribute + // are considered to be indirect callee candidates. 
+ if (!F.hasFnAttribute("referenced-indirectly")) + continue; + + FuncTypeToFuncsMap[F.getFunctionType()].insert(&F); + } + + for (const auto &F : M.functions()) { + // case (1), see comment above the class definition + for (const Value *U : F.users()) + addUserToGraphRecursively(cast(U), &F); + + // case (2), see comment above the class definition + for (const auto &I : instructions(F)) { + const auto *CI = dyn_cast(&I); + if (!CI || !CI->isIndirectCall()) // Direct calls were handled above + continue; + + // TODO: consider limiting set of potential callees to functions marked + // with special attribute (like [[intel::device_indirectly_callable]]) + const FunctionType *Signature = CI->getFunctionType(); + // Note: strictly speaking, virtual functions are allowed to use + // co-variant return types, i.e. we can actually miss a potential callee + // here, because it has different signature (different return type). + // However, this is not a problem for two reasons: + // - opaque pointers will be enabled at some point and will make + // signatures the same in that case + // - all virtual functions are referenced from vtable and therefore will + // anyway be preserved in a module + const auto &PotentialCallees = FuncTypeToFuncsMap[Signature]; + Graph[&F].insert(PotentialCallees.begin(), PotentialCallees.end()); + } + } + + // And every global variable (but their handling is a bit simpler) + for (const auto &GV : M.globals()) + for (const Value *U : GV.users()) + addUserToGraphRecursively(cast(U), &GV); + } + + iterator_range + dependencies(const GlobalValue *Val) const { + auto It = Graph.find(Val); + return (It == Graph.end()) + ? 
make_range(EmptySet.begin(), EmptySet.end()) + : make_range(It->second.begin(), It->second.end()); + } + +private: + void addUserToGraphRecursively(const User *Root, const GlobalValue *V) { + SmallVector WorkList; + WorkList.push_back(Root); + + while (!WorkList.empty()) { + const User *U = WorkList.pop_back_val(); + if (const auto *I = dyn_cast(U)) { + const auto *UFunc = I->getFunction(); + Graph[UFunc].insert(V); + } else if (isa(U)) { + if (const auto *GV = dyn_cast(U)) + Graph[GV].insert(V); + // This could be a global variable or some constant expression (like + // bitcast or gep). We trace users of this constant further to reach + // global objects they are used by and add them to the graph. + for (const auto *UU : U->users()) + WorkList.push_back(UU); + } else + llvm_unreachable("Unhandled type of function user"); + } + } + + DenseMap Graph; + SmallPtrSet EmptySet; +}; + +void collectFunctionsAndGlobalVariablesToExtract( + SetVector &GVs, const Module &M, + const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &Deps, + const std::function &IncludeFunctionPredicate = + nullptr) { + // We start with module entry points + for (const auto *F : ModuleEntryPoints.Functions) + GVs.insert(F); + + // Non-discardable global variables are also included in the initial set + for (const auto &GV : M.globals()) { + if (!GV.isDiscardableIfUnused()) + GVs.insert(&GV); + } + + // GVs has SetVector type. This type inserts a value only if it is not yet + // present there. So, recursion is not expected here. 
+ size_t Idx = 0; + while (Idx < GVs.size()) { + const auto *Obj = GVs[Idx++]; + + for (const GlobalValue *Dep : Deps.dependencies(Obj)) { + if (const auto *Func = dyn_cast(Dep)) { + if (Func->isDeclaration()) + continue; + + // Functions can be additionally filtered + if (!IncludeFunctionPredicate || IncludeFunctionPredicate(Func)) + GVs.insert(Func); + } else { + // Global variables are added unconditionally + GVs.insert(Dep); + } + } + } +} + +// Check "spirv.ExecutionMode" named metadata in the module and remove nodes +// that reference kernels that have dead prototypes or don't reference any +// kernel at all (nullptr). Dead prototypes are removed as well. +void processSubModuleNamedMetadata(Module *M) { + auto ExecutionModeMD = M->getNamedMetadata("spirv.ExecutionMode"); + if (!ExecutionModeMD) + return; + + bool ContainsNodesToRemove = false; + std::vector ValueVec; + for (auto Op : ExecutionModeMD->operands()) { + assert(Op->getNumOperands() > 0); + if (!Op->getOperand(0)) { + ContainsNodesToRemove = true; + continue; + } + + // If the first operand is not nullptr then it has to be a kernel + // function. + Value *Val = cast(Op->getOperand(0))->getValue(); + Function *F = cast(Val); + // If kernel function is just a prototype and unused then we can remove it + // and later remove corresponding spirv.ExecutionMode metadata node. + if (F->isDeclaration() && F->use_empty()) { + F->eraseFromParent(); + ContainsNodesToRemove = true; + continue; + } + + // Remember nodes which we need to keep in the module. + ValueVec.push_back(Op); + } + if (!ContainsNodesToRemove) + return; + + if (ValueVec.empty()) { + // If all nodes need to be removed then just remove named metadata + // completely. 
+ ExecutionModeMD->eraseFromParent(); + } else { + ExecutionModeMD->clearOperands(); + for (auto MD : ValueVec) + ExecutionModeMD->addOperand(MD); + } +} + +ModuleDesc extractSubModule(const ModuleDesc &MD, + const SetVector &GVs, + EntryPointGroup ModuleEntryPoints) { + const Module &M = MD.getModule(); + // Maps values of the input module to their clones in the submodule. + ValueToValueMapTy VMap; + // Clone definitions only for needed globals. Others will be added as + // declarations and removed later. + std::unique_ptr SubM = CloneModule( + M, VMap, [&](const GlobalValue *GV) { return GVs.count(GV); }); + // Replace entry points with cloned ones. + EntryPointSet NewEPs; + const EntryPointSet &EPs = ModuleEntryPoints.Functions; + std::for_each(EPs.begin(), EPs.end(), [&](const Function *F) { + NewEPs.insert(cast(VMap[F])); + }); + ModuleEntryPoints.Functions = std::move(NewEPs); + return ModuleDesc{std::move(SubM), std::move(ModuleEntryPoints)}; +} + +// The function produces a copy of input LLVM IR module M with only those +// functions and globals that can be called from entry points that are specified +// in ModuleEntryPoints vector, in addition to the entry point functions. +ModuleDesc extractCallGraph(const ModuleDesc &MD, + EntryPointGroup ModuleEntryPoints, + const DependencyGraph &CG, + const std::function + &IncludeFunctionPredicate = nullptr) { + SetVector GVs; + collectFunctionsAndGlobalVariablesToExtract( + GVs, MD.getModule(), ModuleEntryPoints, CG, IncludeFunctionPredicate); + + ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); + LLVM_DEBUG(SplitM.dump(dbgs())); + SplitM.cleanup(); + + return SplitM; +} + +class ModuleCopier : public ModuleSplitterBase { +public: + using ModuleSplitterBase::ModuleSplitterBase; // to inherit base constructors + + ModuleDesc nextSplit() override { + ModuleDesc Desc{releaseInputModule(), nextGroup()}; + // Do some basic optimization like unused symbol removal + // even if there was no split. 
+ Desc.cleanup(); + return Desc; + } +}; + +class ModuleSplitter : public ModuleSplitterBase { +public: + ModuleSplitter(ModuleDesc MD, EntryPointGroupVec GroupVec) + : ModuleSplitterBase(std::move(MD), std::move(GroupVec)), + CG(Input.getModule()) {} + + ModuleDesc nextSplit() override { + return extractCallGraph(Input, nextGroup(), CG); + } + +private: + DependencyGraph CG; +}; + +} // namespace + +namespace llvm { + +std::optional convertStringToSplitMode(StringRef S) { + static const StringMap Values = {{"kernel", SPLIT_PER_KERNEL}, + {"source", SPLIT_PER_TU}, + {"auto", SPLIT_AUTO}, + {"none", SPLIT_NONE}}; + + auto It = Values.find(S); + if (It == Values.end()) + return std::nullopt; + + return It->second; +} + +void dumpEntryPoints(raw_ostream &OS, const EntryPointSet &C, + std::string_view Msg) { + constexpr size_t INDENT = 4; + OS.indent(INDENT) << "ENTRY POINTS" + << " " << Msg << " {\n"; + for (const Function *F : C) + OS.indent(INDENT) << " " << F->getName() << "\n"; + + OS.indent(INDENT) << "}\n"; +} + +void dumpEntryPoints(raw_ostream &OS, const Module &M, + bool OnlyKernelsAreEntryPoints, std::string_view Msg) { + constexpr size_t INDENT = 4; + OS.indent(INDENT) << "ENTRY POINTS (Module)" + << " " << Msg << " {\n"; + for (const auto &F : M) + if (isEntryPoint(F, OnlyKernelsAreEntryPoints)) + OS.indent(INDENT) << " " << F.getName() << "\n"; + + OS.indent(INDENT) << "}\n"; +} + +void ModuleDesc::cleanup() { + // Externalize them so they are not dropped by GlobalDCE + for (Function &F : *M) + if (F.hasFnAttribute("indirectly-callable")) + F.setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); + + ModuleAnalysisManager MAM; + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + ModulePassManager MPM; + // Do cleanup. + MPM.addPass(GlobalDCEPass()); // Delete unreachable globals. + MPM.addPass(StripDeadDebugInfoPass()); // Remove dead debug info. + MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls. 
+ MPM.run(*M, MAM); + + // Original module may have named metadata (spirv.ExecutionMode) referencing + // kernels in the module. Some of the Metadata nodes may reference kernels + // which are not included into the extracted submodule, in such case + // CloneModule either leaves such metadata nodes as is but they will reference + // dead prototype of the kernel or operand will be replaced with nullptr. So + // process all nodes in the named metadata and remove nodes which are + // referencing kernels which are not included into submodule. + processSubModuleNamedMetadata(M.get()); +} + +ModuleDesc ModuleDesc::clone() const { + std::unique_ptr NewModule = CloneModule(getModule()); + ModuleDesc NewMD(std::move(NewModule)); + NewMD.EntryPoints.Props = EntryPoints.Props; + return NewMD; +} + +void ModuleDesc::dump(raw_ostream &OS) const { + assert(M && "dump of empty ModuleDesc"); + OS << "split_module::ModuleDesc[" << M->getName() << "] {\n"; + dumpEntryPoints(OS, entries(), EntryPoints.GroupId.c_str()); + OS << "}\n"; +} + +void EntryPointGroup::saveNames(std::vector &Dest) const { + Dest.reserve(Dest.size() + Functions.size()); + std::transform(Functions.begin(), Functions.end(), + std::inserter(Dest, Dest.end()), + [](const Function *F) { return F->getName().str(); }); +} + +void EntryPointGroup::rebuildFromNames(const std::vector &Names, + const Module &M) { + Functions.clear(); + auto It0 = Names.cbegin(); + auto It1 = Names.cend(); + std::for_each(It0, It1, [&](const std::string &Name) { + // Sometimes functions considered entry points (those for which isEntryPoint + // returned true) may be dropped by optimizations, such as AlwaysInliner. + // For example, if a linkonce_odr function is inlined and there are no other + // uses, AlwaysInliner drops it. It is responsibility of the user to make an + // entry point not have internal linkage (such as linkonce_odr) to guarantee + // its availability in the resulting device binary image.
+ if (Function *F = M.getFunction(Name)) + Functions.insert(F); + }); +} + +void EntryPointGroup::rebuild(const Module &M) { + for (const Function &F : M.functions()) + if (F.getCallingConv() == CallingConv::SPIR_KERNEL) + Functions.insert(const_cast(&F)); +} + +std::string ModuleDesc::makeSymbolTable() const { + std::string ST; + for (const Function *F : EntryPoints.Functions) + ST += (Twine(F->getName()) + "\n").str(); + + return ST; +} + +namespace { +// This is a helper class, which allows to group/categorize function based on +// provided rules. It is intended to be used in device code split +// implementation. +// +// "Rule" is a simple routine, which returns a string for an llvm::Function +// passed to it. There could be more than one rule and they are applied in order +// of their registration. Results obtained from those rules are concatenated +// together to produce the final result. +// +// There are some predefined rules for the most popular use-cases, like grouping +// functions together based on an attribute value or presence of a metadata. +// However, there is also a possibility to register a custom callback function +// as a rule, to implement custom/more complex logic. +class FunctionsCategorizer { +public: + FunctionsCategorizer() = default; + + std::string computeCategoryFor(Function *) const; + + // Accepts a callback, which should return a string based on provided + // function, which will be used as an entry points group identifier. + void registerRule(const std::function &Callback) { + Rules.emplace_back(Rule::RKind::K_Callback, Callback); + } + + // Creates a simple rule, which adds a value of a string attribute into a + // resulting identifier. + void registerSimpleStringAttributeRule(StringRef AttrName) { + Rules.emplace_back(Rule::RKind::K_SimpleStringAttribute, AttrName); + } + + // Creates a simple rule, which adds a value of a string metadata into a + // resulting identifier. 
+ void registerSimpleStringMetadataRule(StringRef MetadataName) { + Rules.emplace_back(Rule::RKind::K_SimpleStringMetadata, MetadataName); + } + + // Creates a simple rule, which adds one or another value to a resulting + // identifier based on the presence of an attribute on a function. + void registerSimpleFlagAttributeRule(StringRef AttrName, + StringRef IfPresentStr, + StringRef IfAbsentStr = "") { + Rules.emplace_back(Rule::RKind::K_FlagAttribute, + Rule::FlagRuleData{AttrName, IfPresentStr, IfAbsentStr}); + } + + // Creates a simple rule, which adds one or another value to a resulting + // identifier based on the presence of a metadata on a function. + void registerSimpleFlagMetadataRule(StringRef MetadataName, + StringRef IfPresentStr, + StringRef IfAbsentStr = "") { + Rules.emplace_back( + Rule::RKind::K_FlagMetadata, + Rule::FlagRuleData{MetadataName, IfPresentStr, IfAbsentStr}); + } + + // Creates a rule, which adds a list of dash-separated integers converted + // into strings listed in a metadata to a resulting identifier. + void registerListOfIntegersInMetadataRule(StringRef MetadataName) { + Rules.emplace_back(Rule::RKind::K_IntegersListMetadata, MetadataName); + } + + // Creates a rule, which adds a list of sorted dash-separated integers + // converted into strings listed in a metadata to a resulting identifier. + void registerListOfIntegersInMetadataSortedRule(StringRef MetadataName) { + Rules.emplace_back(Rule::RKind::K_SortedIntegersListMetadata, MetadataName); + } + + // Creates a rule, which adds a list of sorted dash-separated integers + // converted into strings listed in a metadata to a resulting identifier. + // The form of the metadata is expected to be a metadata node, with its + // operands being either an integer or another metadata node with the + // form of {!"", iN }.
+ void registerAspectListRule(StringRef MetadataName) { + registerRule([MetadataName](Function *F) { + SmallString<128> Result; + if (MDNode *UsedAspects = F->getMetadata(MetadataName)) { + SmallVector Values; + for (const MDOperand &MDOp : UsedAspects->operands()) { + if (auto MDN = dyn_cast(MDOp)) { + assert(MDN->getNumOperands() == 2); + Values.push_back(mdconst::extract(MDN->getOperand(1)) + ->getZExtValue()); + } else if (auto C = mdconst::dyn_extract(MDOp)) + Values.push_back(C->getZExtValue()); + } + + llvm::sort(Values); + for (std::uint64_t V : Values) + Result += ("-" + Twine(V)).str(); + } + + return std::string(Result); + }); + } + +private: + struct Rule { + struct FlagRuleData { + StringRef Name, IfPresentStr, IfAbsentStr; + }; + + private: + std::variant> + Storage; + + public: + enum class RKind { + // Custom callback function + K_Callback, + // Copy value of the specified attribute, if present + K_SimpleStringAttribute, + // Copy value of the specified metadata, if present + K_SimpleStringMetadata, + // Use one or another string based on the specified metadata presence + K_FlagMetadata, + // Use one or another string based on the specified attribute presence + K_FlagAttribute, + // Concatenate and use list of integers from the specified metadata + K_IntegersListMetadata, + // Sort, concatenate and use list of integers from the specified metadata + K_SortedIntegersListMetadata + }; + RKind Kind; + + // Returns an index into std::variant<...> Storage defined above, which + // corresponds to the specified rule Kind. 
+ constexpr static std::size_t storage_index(RKind K) { + switch (K) { + case RKind::K_SimpleStringAttribute: + case RKind::K_IntegersListMetadata: + case RKind::K_SimpleStringMetadata: + case RKind::K_SortedIntegersListMetadata: + return 0; + case RKind::K_Callback: + return 2; + case RKind::K_FlagMetadata: + case RKind::K_FlagAttribute: + return 1; + } + // can't use llvm_unreachable in constexpr context + return std::variant_npos; + } + + template auto getStorage() const { + return std::get(Storage); + } + + template + Rule(RKind K, Args... args) : Storage(args...), Kind(K) { + assert(storage_index(K) == Storage.index()); + } + + Rule(Rule &&Other) = default; + }; + + std::vector Rules; +}; + +std::string FunctionsCategorizer::computeCategoryFor(Function *F) const { + SmallString<256> Result; + for (const auto &R : Rules) { + StringRef AttrName; + StringRef MetadataName; + Rule::FlagRuleData Data; + + switch (R.Kind) { + case Rule::RKind::K_Callback: + Result += R.getStorage()(F); + break; + + case Rule::RKind::K_SimpleStringAttribute: + AttrName = R.getStorage(); + if (F->hasFnAttribute(AttrName)) { + Attribute Attr = F->getFnAttribute(AttrName); + Result += Attr.getValueAsString(); + } + break; + + case Rule::RKind::K_SimpleStringMetadata: + MetadataName = R.getStorage(); + if (F->hasMetadata(MetadataName)) { + auto *MDN = F->getMetadata(MetadataName); + for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { + MDString *S = cast(MDN->getOperand(I).get()); + Result += "-" + S->getString().str(); + } + } + break; + + case Rule::RKind::K_FlagMetadata: + Data = R.getStorage(); + if (F->hasMetadata(Data.Name)) + Result += Data.IfPresentStr; + else + Result += Data.IfAbsentStr; + break; + + case Rule::RKind::K_IntegersListMetadata: + MetadataName = R.getStorage(); + if (F->hasMetadata(MetadataName)) { + auto *MDN = F->getMetadata(MetadataName); + for (const MDOperand &MDOp : MDN->operands()) + Result += + "-" + std::to_string( + 
mdconst::extract(MDOp)->getZExtValue()); + } + break; + + case Rule::RKind::K_SortedIntegersListMetadata: + MetadataName = R.getStorage(); + if (F->hasMetadata(MetadataName)) { + MDNode *MDN = F->getMetadata(MetadataName); + + SmallVector Values; + for (const MDOperand &MDOp : MDN->operands()) + Values.push_back(mdconst::extract(MDOp)->getZExtValue()); + + llvm::sort(Values); + + for (std::uint64_t V : Values) + Result += "-" + std::to_string(V); + } + break; + + case Rule::RKind::K_FlagAttribute: + Data = R.getStorage(); + if (F->hasFnAttribute(Data.Name)) + Result += Data.IfPresentStr; + else + Result += Data.IfAbsentStr; + break; + } + + Result += "-"; + } + + return static_cast(Result); +} +} // namespace + +std::unique_ptr +getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, + bool EmitOnlyKernelsAsEntryPoints) { + FunctionsCategorizer Categorizer; + + EntryPointsGroupScope Scope = + selectDeviceCodeGroupScope(MD.getModule(), Mode, IROutputOnly); + + switch (Scope) { + case Scope_Global: + // We simply perform entry points filtering, but group all of them together. + Categorizer.registerRule( + [](Function *) -> std::string { return GLOBAL_SCOPE_NAME; }); + break; + case Scope_PerKernel: + // Per-kernel split is quite simple: every kernel goes into a separate + // module and that's it, no other rules required. + Categorizer.registerRule( + [](Function *F) -> std::string { return F->getName().str(); }); + break; + case Scope_PerModule: + // The most complex case, because we should account for many other features + // like aspects used in a kernel, large-grf mode, reqd-work-group-size, etc. + + // This is core of per-source device code split + Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_MODULE_ID); + + // This attribute marks virtual functions and effectively dictates how they + // should be grouped together.
By design we won't split those groups of + // virtual functions further even if functions from the same group use + // different optional features and therefore this rule is put here. + // Strictly speaking, we don't even care about module-id splitting for + // those, but to avoid that we need to refactor the whole categorizer. + // However, this is good enough as it is for an initial version. + // TODO: for AOT use case we shouldn't be outlining those and instead should + // only select those functions which are compatible with the target device + Categorizer.registerSimpleStringAttributeRule("indirectly-callable"); + + // Optional features + // Note: Add more rules at the end of the list to avoid changing orders of + // output files in existing tests. + Categorizer.registerSimpleStringAttributeRule("sycl-register-alloc-mode"); + Categorizer.registerSimpleStringAttributeRule("sycl-grf-size"); + Categorizer.registerAspectListRule("sycl_used_aspects"); + Categorizer.registerListOfIntegersInMetadataRule("reqd_work_group_size"); + Categorizer.registerListOfIntegersInMetadataRule("work_group_num_dim"); + Categorizer.registerListOfIntegersInMetadataRule( + "intel_reqd_sub_group_size"); + Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_OPTLEVEL); + Categorizer.registerSimpleStringMetadataRule("sycl_joint_matrix"); + Categorizer.registerSimpleStringMetadataRule("sycl_joint_matrix_mad"); + break; + } + + // std::map is used here to ensure stable ordering of entry point groups, + // which is based on their contents, this greatly helps LIT tests + std::map EntryPointsMap; + + // Only process module entry points: + for (auto &F : MD.getModule().functions()) { + if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints)) + continue; + + std::string Key = Categorizer.computeCategoryFor(&F); + EntryPointsMap[std::move(Key)].insert(&F); + } + + EntryPointGroupVec Groups; + if (EntryPointsMap.empty()) { + // No entry points met, record this.
+ Groups.emplace_back(GLOBAL_SCOPE_NAME, EntryPointSet{}); + } else { + Groups.reserve(EntryPointsMap.size()); + // Start with properties of a source module + EntryPointGroup::Properties MDProps = MD.getEntryPointGroup().Props; + for (auto &[Key, EntryPoints] : EntryPointsMap) + Groups.emplace_back(Key, std::move(EntryPoints), MDProps); + } + + bool DoSplit = (Mode != SPLIT_NONE && + (Groups.size() > 1 || !Groups.cbegin()->Functions.empty())); + + if (DoSplit) + return std::make_unique(std::move(MD), std::move(Groups)); + + return std::make_unique(std::move(MD), std::move(Groups)); +} + +static Error saveModuleIRInFile(Module &M, StringRef FilePath, + bool OutputAssembly) { + int FD = -1; + if (std::error_code EC = sys::fs::openFileForWrite(FilePath, FD)) + return errorCodeToError(EC); + + raw_fd_ostream OS(FD, true); + ModulePassManager MPM; + ModuleAnalysisManager MAM; + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + if (OutputAssembly) + MPM.addPass(PrintModulePass(OS)); + else + MPM.addPass(BitcodeWriterPass(OS)); + + MPM.run(M, MAM); + return Error::success(); +} + +static Expected +saveModuleDesc(ModuleDesc &MD, std::string Prefix, bool OutputAssembly) { + SYCLSplitModule SM; + Prefix += OutputAssembly ? ".ll" : ".bc"; + Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly); + if (E) + return E; + + SM.ModuleFilePath = Prefix; + SM.Symbols = MD.makeSymbolTable(); + return SM; +} + +Expected> +parseSYCLSplitModulesFromFile(StringRef File) { + auto EntriesMBOrErr = llvm::MemoryBuffer::getFile(File); + if (!EntriesMBOrErr) + return createFileError(File, EntriesMBOrErr.getError()); + + line_iterator LI(**EntriesMBOrErr); + if (LI.is_at_eof() || *LI != "[Code|Symbols]") + return createStringError(inconvertibleErrorCode(), + "invalid SYCL Table file."); + + // "Code" and "Symbols" at the moment. 
+ static constexpr int NUMBER_COLUMNS = 2; + ++LI; + std::vector Modules; + while (!LI.is_at_eof()) { + StringRef Line = *LI; + if (Line.empty()) + return createStringError("invalid SYCL table row."); + + SmallVector Parts; + Line.split(Parts, "|"); + if (Parts.size() != NUMBER_COLUMNS) + return createStringError("invalid SYCL Table row."); + + auto [IRFilePath, SymbolsFilePath] = std::tie(Parts[0], Parts[1]); + if (SymbolsFilePath.empty()) + return createStringError("invalid SYCL Table row."); + + auto MBOrErr = MemoryBuffer::getFile(SymbolsFilePath); + if (!MBOrErr) + return createFileError(SymbolsFilePath, MBOrErr.getError()); + + auto &MB2 = *MBOrErr; + std::string Symbols = + std::string(MB2->getBufferStart(), MB2->getBufferEnd()); + Modules.emplace_back(IRFilePath, std::move(Symbols)); + ++LI; + } + + return Modules; +} + +Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { + ModuleDesc MD = std::move(M); + auto Splitter = getDeviceCodeSplitter(std::move(MD), Settings.Mode, + /*IROutputOnly=*/false, + /*EmitOnlyKernelsAsEntryPoints=*/false); + + size_t ID = 0; + std::vector OutputImages; + while (Splitter->hasMoreSplits()) { + ModuleDesc MD = Splitter->nextSplit(); + + std::string OutIRFileName = (Settings.OutputPrefix + "_" + Twine(ID)).str(); + auto SplitImageOrErr = + saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); + if (!SplitImageOrErr) + return SplitImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*SplitImageOrErr)); + ++ID; + } + + return OutputImages; +} + +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/SYCLUtils.cpp b/llvm/lib/Transforms/Utils/SYCLUtils.cpp new file mode 100644 index 0000000000000..95ce5522a2600 --- /dev/null +++ b/llvm/lib/Transforms/Utils/SYCLUtils.cpp @@ -0,0 +1,263 @@ +//===------------ SYCLUtils.cpp - SYCL utility functions ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SYCL utility functions. +//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/SYCLUtils.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/GlobalStatus.h" + +namespace llvm { + +void traverseCallgraphUp(llvm::Function *F, CallGraphNodeAction ActionF, + SmallPtrSetImpl &FunctionsVisited, + bool ErrorOnNonCallUse, + const CallGraphFunctionFilter &functionFilter) { + SmallVector Worklist; + + if (FunctionsVisited.count(F) == 0) + Worklist.push_back(F); + + while (!Worklist.empty()) { + Function *CurF = Worklist.pop_back_val(); + FunctionsVisited.insert(CurF); + // Apply the action function. + ActionF(CurF); + + // Update all callers as well. + for (auto It = CurF->use_begin(); It != CurF->use_end(); It++) { + auto FCall = It->getUser(); + auto ErrMsg = + llvm::Twine(__FILE__ " ") + + "Function use other than call detected while traversing call\n" + "graph up to a kernel"; + if (!isa(FCall)) { + // A use other than a call is met... + if (ErrorOnNonCallUse) { + // ... non-call is an error - report + llvm::report_fatal_error(ErrMsg); + } else { + // ... non-call is OK - add using function to the worklist + if (auto *I = dyn_cast(FCall)) { + if (!functionFilter(I, CurF)) { + continue; + } + + auto UseF = I->getFunction(); + + if (FunctionsVisited.count(UseF) == 0) { + Worklist.push_back(UseF); + } + } + } + } else { + auto *CI = cast(FCall); + + if ((CI->getCalledFunction() != CurF)) { + // CurF is used in a call, but not as the callee. 
+ if (ErrorOnNonCallUse) + llvm::report_fatal_error(ErrMsg); + } else { + auto FCaller = CI->getFunction(); + + if (!FunctionsVisited.count(FCaller)) { + Worklist.push_back(FCaller); + } + } + } + } + } +} + +bool isCast(const Value *V) { + int Opc = Operator::getOpcode(V); + return (Opc == Instruction::BitCast) || (Opc == Instruction::AddrSpaceCast); +} + +bool isZeroGEP(const Value *V) { + const auto *GEPI = dyn_cast(V); + return GEPI && GEPI->hasAllZeroIndices(); +} + +Value *stripCasts(Value *V) { + return const_cast(stripCasts(const_cast(V))); +} + +const Value *stripCastsAndZeroGEPs(const Value *V); + +Value *stripCastsAndZeroGEPs(Value *V) { + return const_cast( + stripCastsAndZeroGEPs(const_cast(V))); +} + +const Value *stripCasts(const Value *V) { + if (!V->getType()->isPtrOrPtrVectorTy()) + return V; + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet Visited; + Visited.insert(V); + + do { + if (isCast(V)) { + V = cast(V)->getOperand(0); + } + assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); + } while (Visited.insert(V).second); + return V; +} + +const Value *stripCastsAndZeroGEPs(const Value *V) { + if (!V->getType()->isPtrOrPtrVectorTy()) + return V; + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. 
+ SmallPtrSet Visited; + Visited.insert(V); + + do { + if (isCast(V)) { + V = cast(V)->getOperand(0); + } else if (isZeroGEP(V)) { + V = cast(V)->getOperand(0); + } + assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); + } while (Visited.insert(V).second); + return V; +} + +void collectUsesLookThroughCasts(const Value *V, + SmallPtrSetImpl &Uses) { + for (const Use &U : V->uses()) { + Value *VV = U.getUser(); + + if (isCast(VV)) { + collectUsesLookThroughCasts(VV, Uses); + } else { + Uses.insert(&U); + } + } +} + +void collectUsesLookThroughCastsAndZeroGEPs( + const Value *V, SmallPtrSetImpl &Uses) { + assert(V->getType()->isPtrOrPtrVectorTy() && "pointer type expected"); + + for (const Use &U : V->uses()) { + Value *VV = U.getUser(); + + if (isCast(VV) || isZeroGEP(VV)) { + collectUsesLookThroughCastsAndZeroGEPs(VV, Uses); + } else { + Uses.insert(&U); + } + } +} + +// Tries to find possible values stored into given address. +// Returns true if the set of values could be reliably found, false otherwise. 
+bool collectPossibleStoredVals( + Value *Addr, SmallPtrSetImpl &Vals, + std::function EscapesIfAddrIsArgOf) { + SmallPtrSet Visited; + AllocaInst *LocalVar = dyn_cast_or_null(stripCasts(Addr)); + + if (!LocalVar) { + return false; + } + SmallPtrSet Uses; + collectUsesLookThroughCasts(LocalVar, Uses); + + for (const Use *U : Uses) { + Value *V = U->getUser(); + + if (auto *StI = dyn_cast(V)) { + if (U != &StI->getOperandUse(StoreInst::getPointerOperandIndex())) { + // this is double indirection - not supported + return false; + } + V = stripCasts(StI->getValueOperand()); + + if (auto *LI = dyn_cast(V)) { + // A value loaded from another address is stored at this address - + // recurse into the other address + if (!collectPossibleStoredVals(LI->getPointerOperand(), Vals)) { + return false; + } + } else { + Vals.insert(V); + } + continue; + } + if (const auto *CI = dyn_cast(V)) { + if (EscapesIfAddrIsArgOf(CI)) { + return false; + } + continue; + } + if (isa(V)) { + // LoadInst from this addr is OK, as it does not affect what can be stored + // through the addr + continue; + } + return false; + } + return true; +} + +bool removeSYCLKernelsConstRefArray(Module &M) { + GlobalVariable *GV = M.getGlobalVariable("llvm.used"); + + if (!GV) + return false; + + assert(GV->user_empty() && "Unexpected llvm.used users"); + Constant *Initializer = GV->getInitializer(); + GV->setInitializer(nullptr); + GV->eraseFromParent(); + + // Destroy the initializer and all operands of it. 
+ SmallVector IOperands; + for (auto It = Initializer->op_begin(); It != Initializer->op_end(); It++) + IOperands.push_back(cast(*It)); + assert(llvm::isSafeToDestroyConstant(Initializer) && + "Cannot remove initializer of llvm.used global"); + Initializer->destroyConstant(); + for (auto It = IOperands.begin(); It != IOperands.end(); It++) { + auto Op = (*It)->stripPointerCasts(); + auto *F = dyn_cast(Op); + if (llvm::isSafeToDestroyConstant(*It)) + (*It)->destroyConstant(); + else if (F && F->getCallingConv() == CallingConv::SPIR_KERNEL && + !F->use_empty()) { + // The element in "llvm.used" array has other users. That is Ok for + // specialization constants, but is wrong for kernels. + llvm::report_fatal_error("Unexpected usage of SYCL kernel"); + } + + // Remove unused kernel declarations to avoid LLVM IR check fails. + if (F && F->isDeclaration() && F->use_empty()) + F->eraseFromParent(); + } + + return true; +} + +void writeSYCLStringTable(const SYCLStringTable &Table, raw_ostream &OS) { + assert(Table.size() > 0 && "table should contain at least column titles"); + size_t numberColumns = Table[0].size(); + assert(numberColumns > 0 && "table should be non-empty"); + OS << '[' << join(Table[0].begin(), Table[0].end(), "|") << "]\n"; + for (size_t I = 1, E = Table.size(); I != E; ++I) { + assert(Table[I].size() == numberColumns && "row's size should be equal"); + OS << join(Table[I].begin(), Table[I].end(), "|") << '\n'; + } +} + +} // namespace llvm diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll new file mode 100644 index 0000000000000..6b0305d12400f --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll @@ -0,0 +1,17 @@ +; -- Per-kernel split +; RUN: llvm-split -sycl-split=kernel -S < %s -o %tC +; RUN: FileCheck %s -input-file=%tC_0.ll --check-prefixes CHECK-A0 +; RUN: FileCheck %s -input-file=%tC_1.ll --check-prefixes 
CHECK-A1 + +define dso_local amdgpu_kernel void @Kernel1() { + ret void +} + +define dso_local amdgpu_kernel void @Kernel2() { + ret void +} + +; CHECK-A0: define dso_local amdgpu_kernel void @Kernel2() +; CHECK-A0-NOT: define dso_local amdgpu_kernel void @Kernel1() +; CHECK-A1-NOT: define dso_local amdgpu_kernel void @Kernel2() +; CHECK-A1: define dso_local amdgpu_kernel void @Kernel1() diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll new file mode 100644 index 0000000000000..539adf551ea96 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll @@ -0,0 +1,121 @@ +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; By default auto mode is equal to source mode +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +; CHECK-TU1-NOT: @{{.*}}GV{{.*}} +; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define 
{{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: 
nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll new file mode 100644 index 0000000000000..33dde7b965755 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll @@ -0,0 +1,127 @@ +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; +; This is the same as auto-module-split-1 test with the only difference is that +; @_Z3foov is marked with "referenced-indirectly" attribute. 
+; The purpose of this test is to make sure that we can still perform device code +; split as usual, because that function is not a part of any indirect calls +; +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +; CHECK-TU1-NOT: @{{.*}}GV{{.*}} +; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() #2 { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 
= load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "referenced-indirectly" } + +; Metadata is saved in both modules. 
+; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll new file mode 100644 index 0000000000000..3c40986a31e62 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll @@ -0,0 +1,112 @@ +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; +; In the presence of indirect calls we start matching functions using their +; signatures, i.e. we have an indirect call to i32(i32) function within +; @_Z3foov, which means that all functions with i32(i32) signature should be +; placed in the same module as @_Z3foov. +; +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0-IR \ +; RUN: --implicit-check-not TU0_kernel --implicit-check-not _Z3foov \ +; RUN: --implicit-check-not _Z4foo3v +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1-IR \ +; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v \ +; RUN: --implicit-check-not _Z4foo1v +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM + +; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0 +; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1 +; +; CHECK-TU1-SYM: _ZTSZ4mainE10TU0_kernel +; +; CHECK-TU0-IR: @_ZL2GV = internal addrspace(1) constant +; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0 +; CHECK-TU0-IR: define {{.*}} spir_func i32 @_Z4foo1v +; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1 +; CHECK-TU0-IR: define {{.*}} spir_func void @_Z4foo2v +; +; CHECK-TU1-IR: define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel +; CHECK-TU1-IR: define {{.*}}
spir_func void @_Z3foov +; CHECK-TU1-IR: define {{.*}} spir_func i32 @_Z4foo3v + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %ptr = bitcast i32* %a to i32 (i32)* + %call = call spir_func i32 %ptr(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0() #1 { +entry: + %a = alloca i32, align 4 + %arg = load i32, i32* %a, align 4 + %call = call spir_func i32 @_Z4foo1v(i32 %arg) + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func i32 @_Z4foo1v(i32 %arg) { +entry: + %a = alloca i32, align 4 + store i32 %arg, i32* %a, align 4 + ret i32 %arg +} + +; Function Attrs: nounwind +define dso_local spir_func i32 @_Z4foo3v(i32 %arg) #2 { +entry: + %a = alloca i32, align 4 + store i32 %arg, i32* %a, align 4 + ret i32 %arg +} + +define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret 
void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "referenced-indirectly" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll new file mode 100644 index 0000000000000..dd10a9bec6269 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll @@ -0,0 +1,50 @@ +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 + +; This test checks that we can properly perform device code split by tracking +; all uses of functions (not only direct calls) + +; CHECK-SYM0: kernel2 +; CHECK-SYM1: kernel1 +; +; CHECK-IR0: define dso_local spir_kernel void @kernel2 +; +; CHECK-IR1: @_Z2f1iTable = weak global ptr @_Z2f1i +; CHECK-IR1: define {{.*}} i32 @_Z2f1i +; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64_x86_64-unknown-unknown" + +@_Z2f1iTable = weak global ptr @_Z2f1i, align 8 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define dso_local spir_func i32 @_Z2f1i(i32 %a) #0 { +entry: + ret i32 %a +} + +; Function Attrs: convergent norecurse +define weak_odr dso_local spir_kernel void @kernel1() #1 { +entry: + %0 = call i32 @indirect_call(ptr addrspace(4) addrspacecast ( ptr getelementptr inbounds ( [1 x ptr] , ptr @_Z2f1iTable, i64 0, i64 0) to ptr addrspace(4)), i32 0) + ret
void +} + +; Function Attrs: convergent norecurse +define dso_local spir_kernel void @kernel2() #2 { +entry: + ret void +} + +declare dso_local spir_func i32 @indirect_call(ptr addrspace(4), i32) local_unnamed_addr + +attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn } +attributes #1 = { convergent norecurse "sycl-module-id"="TU1.cpp" } +attributes #2 = { convergent norecurse "sycl-module-id"="TU2.cpp" } + +; CHECK: kernel1 +; CHECK: kernel2 diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll new file mode 100644 index 0000000000000..a916fdfa82b76 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll @@ -0,0 +1,122 @@ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +; ModuleID = 'basic-module-split.ll' +source_filename = "basic-module-split.ll" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +;CHECK-TU1-NOT: @{{.*}}GV{{.*}} +;CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func 
void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} 
spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll new file mode 100644 index 0000000000000..413769947aaaf --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll @@ -0,0 +1,88 @@ +; The idea of the test is to ensure that llvm-split can trace through more +; complex call stacks involving several nested indirect calls + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN:
--implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C + +; CHECK0-DAG: define spir_kernel void @kernel_C +; CHECK0-DAG: define spir_func i32 @bar +; CHECK0-DAG: define spir_func void @BAZ + +; CHECK1-DAG: define spir_kernel void @kernel_B +; CHECK1-DAG: define {{.*}}spir_func i32 @foo +; CHECK1-DAG: define spir_func i32 @bar +; CHECK1-DAG: define spir_func void @BAZ + +; CHECK2-DAG: define spir_kernel void @kernel_A +; CHECK2-DAG: define {{.*}}spir_func void @baz + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func i32 @foo(i32 (i32, void ()*)* %ptr1, void ()* %ptr2) { + %1 = 
call spir_func i32 %ptr1(i32 42, void ()* %ptr2) + ret i32 %1 +} + +define spir_func i32 @bar(i32 %arg, void ()* %ptr) #3 { + call spir_func void %ptr() + ret i32 %arg +} + +define spir_func void @baz() { + ret void +} + +define spir_func void @BAZ() #3 { + ret void +} + +define spir_kernel void @kernel_A() #0 { + call spir_func void @baz() + ret void +} + +define spir_kernel void @kernel_B() #1 { + call spir_func i32 @foo(i32 (i32, void ()*)* null, void ()* null) + ret void +} + +define spir_kernel void @kernel_C() #2 { + call spir_func i32 @bar(i32 42, void ()* null) + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "sycl-module-id"="TU3.cpp" } +attributes #3 = { "referenced-indirectly" } diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll new file mode 100644 index 0000000000000..69ee88d572960 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll @@ -0,0 +1,45 @@ +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; +; This test checks that functions marked with "indirectly-callable" LLVM IR +; attribute are outlined into separate device image(s) in accordance with the +; attribute value. +; +; Current device code split implementation may split those groups further if +; they use different optional kernel features for example, but we don't care +; about that subsequent split and don't test it. 
+; +; RUN: FileCheck %s --input-file=%t_0.ll --check-prefix CHECK-IR0 \ +; RUN: --implicit-check-not kernel --implicit-check-not foo +; RUN: FileCheck %s --input-file=%t_1.ll --check-prefix CHECK-IR1 \ +; RUN: --implicit-check-not kernel --implicit-check-not bar \ +; RUN: --implicit-check-not baz +; RUN: FileCheck %s --input-file=%t_2.ll --check-prefix CHECK-IR2 \ +; RUN: --implicit-check-not foo --implicit-check-not bar \ +; RUN: --implicit-check-not baz + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo() #0 { +entry: + ret void +} + +define spir_func void @bar() #1 { +entry: + ret void +} + +define spir_func void @baz() #1 { +entry: + ret void +} + +define weak_odr dso_local spir_kernel void @kernel() #2 { +entry: + ret void +} + +attributes #0 = { "indirectly-callable"="set-1" "sycl-module-id"="v.cpp" } +attributes #1 = { "indirectly-callable"="set-2" "sycl-module-id"="v.cpp" } +attributes #2 = { "sycl-module-id"="v.cpp" } diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll new file mode 100644 index 0000000000000..a35ebedf387fc --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll @@ -0,0 +1,53 @@ +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; +; This test checks that functions marked with "indirectly-callable" LLVM IR +; attribute are outlined into separate device image(s) in accordance with the +; attribute value. 
+; +; This version of the test is focused on per-kernel device code split +; +; RUN: FileCheck %s --input-file=%t_0.ll --check-prefix CHECK-IR0 \ +; RUN: --implicit-check-not foo --implicit-check-not bar \ +; RUN: --implicit-check-not baz +; RUN: FileCheck %s --input-file=%t_1.ll --check-prefix CHECK-IR1 \ +; RUN: --implicit-check-not kernel --implicit-check-not bar \ +; RUN: --implicit-check-not baz +; RUN: FileCheck %s --input-file=%t_2.ll --check-prefix CHECK-IR2 \ +; RUN: --implicit-check-not kernel --implicit-check-not foo \ +; RUN: --implicit-check-not bar +; RUN: FileCheck %s --input-file=%t_3.ll --check-prefix CHECK-IR3 \ +; RUN: --implicit-check-not kernel --implicit-check-not foo \ +; RUN: --implicit-check-not baz +; +; CHECK-IR0: define weak_odr dso_local spir_kernel void @kernel +; CHECK-IR1: define spir_func void @foo +; CHECK-IR2: define spir_func void @baz +; CHECK-IR3: define spir_func void @bar + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo() #0 { +entry: + ret void +} + +define spir_func void @bar() #1 { +entry: + ret void +} + +define spir_func void @baz() #1 { +entry: + ret void +} + +define weak_odr dso_local spir_kernel void @kernel() #2 { +entry: + ret void +} + +attributes #0 = { "indirectly-callable"="set-1" "sycl-module-id"="v.cpp" } +attributes #1 = { "indirectly-callable"="set-2" "sycl-module-id"="v.cpp" } +attributes #2 = { "sycl-module-id"="v.cpp" } + diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll new file mode 100644 index 0000000000000..f61623d377bcd --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll @@ -0,0 +1,133 @@ +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t.files +; RUN: FileCheck %s -input-file=%t.files_0.ll 
--check-prefixes CHECK-MODULE0,CHECK +; RUN: FileCheck %s -input-file=%t.files_0.sym --check-prefixes CHECK-MODULE0-TXT +; RUN: FileCheck %s -input-file=%t.files_1.ll --check-prefixes CHECK-MODULE1,CHECK +; RUN: FileCheck %s -input-file=%t.files_1.sym --check-prefixes CHECK-MODULE1-TXT +; RUN: FileCheck %s -input-file=%t.files_2.ll --check-prefixes CHECK-MODULE2,CHECK +; RUN: FileCheck %s -input-file=%t.files_2.sym --check-prefixes CHECK-MODULE2-TXT + +; ModuleID = 'one-kernel-per-module.ll' +source_filename = "one-kernel-per-module.ll" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +;CHECK-MODULE2-NOT: @{{.*}}GV{{.*}} +;CHECK-MODULE1-NOT: @{{.*}}GV{{.*}} +;CHECK-MODULE0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-MODULE2: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE2-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE1-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-MODULE2: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-MODULE2: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-MODULE1-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-MODULE0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-MODULE2: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-MODULE2: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-MODULE1-NOT: define {{.*}} spir_func i32 
@{{.*}}bar{{.*}}(i32 %arg) +; CHECK-MODULE0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-MODULE2-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE2-TXT-NOT: {{.*}}TU0_kernel1{{.*}} +; CHECK-MODULE1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-MODULE0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-MODULE1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-MODULE2-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-MODULE1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-MODULE0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-MODULE2-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE2-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-MODULE1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-MODULE0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-MODULE0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-MODULE2-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-MODULE1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; 
CHECK-MODULE0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-MODULE0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll new file mode 100644 index 0000000000000..cbd91724959b8 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no aspects used +; TU0_kernel1 - 1st translation unit, aspect 1 is used +; TU1_kernel2 - 2nd translation unit, no aspects used + +; The test is intended to check that llvm-split correctly separates kernels +; that use aspects from kernels which don't use aspects regardless of device +; code split mode + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0
--implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_0.sym 
--check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !sycl_used_aspects !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: 
nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 1} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll new file mode 100644 index 0000000000000..f3f919fe45534 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll @@ -0,0 +1,59 @@ +; The test is intended to check that SYCL Module splitting correctly groups kernels +; by unique sets of aspects used in them + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel3 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel3 + +; CHECK-M1-SYMS: kernel1 +; CHECK-M1-SYMS: kernel2 + +; CHECK-M2-SYMS: kernel0 + +target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !sycl_used_aspects !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !sycl_used_aspects !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !sycl_used_aspects !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !sycl_used_aspects !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 1} +!2 = !{i32 1, i32 2} +!3 = !{i32 2, i32 1} +!4 = !{i32 2, i32 3, i32 4} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll new file mode 100644 index 0000000000000..2ac32bdefa61a --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll @@ -0,0 +1,94 @@ +; This test is intended to check that per-aspect device code split works as +; expected with SYCL_EXTERNAL functions + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel1 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not foo --implicit-check-not kernel0 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not foo \ +; RUN: --implicit-check-not bar +; +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not bar + +; We expect to see 3 modules generated: +; +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; sycl-post-link aims 
to achieve two goals while doing splitting: +; - each kernel must be self-contained, i.e. all functions called from a +; kernel must reside in the same device image +; - each entry point should be assigned to a correct device image in +; accordance with selected device code split mode +; +; In this test @bar and @foo are SYCL_EXTERNAL functions and they are treated +; as entry points. +; +; @bar uses the same list of aspects as @kernel0 which calls it and therefore +; they can be put into the same device image. @baz goes there too, because of +; the same list of used aspects. +; +; CHECK-M0-SYMS: bar +; CHECK-M0-SYMS: baz +; CHECK-M0-SYMS: kernel0 +; +; List of aspects used by @foo is different from the one attached to @kernel1 +; which calls @foo (for example, @kernel1 uses an extra optional feature besides +; ones used in @foo). As a result, @foo should both be included in the same +; device image as @kernel1 to make it self-contained, but at the same time it +; should also be present in a separate device image, because it is an entry +; point with a unique set of used aspects. 
+; +; CHECK-M1-SYMS: kernel1 +; +; CHECK-M2-SYMS: foo +; +; @kernel1 uses @foo and therefore @foo should be present in the same module as +; @kernel1 as well +; CHECK-M1-IR-DAG: define {{.*}}spir_func void @foo +; CHECK-M1-IR-DAG: define spir_kernel void @kernel1 + + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_func void @foo() #0 !sycl_used_aspects !1 { + ret void +} + +define spir_func void @bar() #1 !sycl_used_aspects !2 { + ret void +} + +define spir_func void @baz() #1 !sycl_used_aspects !2 { + ret void +} + +define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 { +entry: + call void @bar() + ret void +} + +define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 { +entry: + call void @foo() + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!1 = !{i32 1} +!2 = !{i32 2} +!3 = !{i32 3, i32 1} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll new file mode 100644 index 0000000000000..9436e4308ac99 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_sub_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_sub_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_sub_group_size attribute used + +; The test is intended to check that sycl-post-link correctly separates kernels +; that use reqd_sub_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + +; RUN: llvm-split -sycl-split=auto -S %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: 
--implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: llvm-split -sycl-split=kernel -S %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: llvm-split -sycl-split=source -S %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s 
-input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, 
i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} \ No newline at end of file diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll new file mode 100644 index 0000000000000..49976fec60c26 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll @@ -0,0 +1,60 @@ +; The test is intended to check that sycl-post-link correctly groups kernels +; by unique reqd_sub_group_size values used in them + +; RUN: llvm-split -sycl-split=auto -S %s -o %t +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel3 + +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym 
+; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel1 +; CHECK-M0-SYMS: kernel2 + +; CHECK-M1-SYMS: kernel0 + +; CHECK-M2-SYMS: kernel3 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !intel_reqd_sub_group_size !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !intel_reqd_sub_group_size !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !intel_reqd_sub_group_size !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 32} +!2 = !{i32 64} +!3 = !{i32 64} +!4 = !{i32 16} \ No newline at end of file diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll new file mode 100644 index 0000000000000..64acdc04e957c --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_work_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_work_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_work_group_size attribute used + +; The test is intended to check that sycl-post-link correctly separates kernels +; that use reqd_work_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 
--implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym 
--check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !reqd_work_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: 
nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll new file mode 100644 index 0000000000000..569bdeb8ff14c --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll @@ -0,0 +1,59 @@ +; The test is intended to check that sycl-post-link correctly groups kernels +; by unique reqd_work_group_size values used in them + +; RUN: llvm-split -sycl-split=auto -S < %s -o %t +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel3 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel0 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel1 +; CHECK-M0-SYMS: kernel2 + +; CHECK-M1-SYMS: kernel0 + +; CHECK-M2-SYMS: kernel3 + +target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !reqd_work_group_size !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !reqd_work_group_size !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !reqd_work_group_size !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !reqd_work_group_size !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 32} +!2 = !{i32 64, i32 64} +!3 = !{i32 64, i32 64} +!4 = !{i32 16, i32 16, i32 16} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll new file mode 100644 index 0000000000000..2632641a69a5c --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll @@ -0,0 +1,74 @@ +; Purpose of this test is to check that sycl-post-link does not treat +; declarations as entry points. 
+ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-PER-SOURCE-TABLE +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-PER-SOURCE-SYM0 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 +; +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 + +; With per-source split, there should be two device images +; CHECK-PER-SOURCE-TABLE: [Code|Symbols] +; CHECK-PER-SOURCE-TABLE: {{.*}}_0.ll|{{.*}}_0.sym +; CHECK-PER-SOURCE-TABLE-NEXT: {{.*}}_1.ll|{{.*}}_1.sym +; CHECK-PER-SOURCE-TABLE-EMPTY: +; +; CHECK-PER-SOURCE-SYM1-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-SOURCE-SYM1: _ZTSZ4mainE11TU0_kernel0 +; CHECK-PER-SOURCE-SYM1-NEXT: _ZTSZ4mainE11TU0_kernel1 +; CHECK-PER-SOURCE-SYM1-EMPTY: +; +; CHECK-PER-SOURCE-SYM0-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-SOURCE-SYM0: _ZTSZ4mainE10TU1_kernel0 +; CHECK-PER-SOURCE-SYM0-EMPTY: + +; With per-kernel split, there should be three device images +; CHECK-PER-KERNEL-TABLE: [Code|Symbols] +; CHECK-PER-KERNEL-TABLE: {{.*}}_0.ll|{{.*}}_0.sym +; CHECK-PER-KERNEL-TABLE-NEXT: {{.*}}_1.ll|{{.*}}_1.sym +; CHECK-PER-KERNEL-TABLE-NEXT: {{.*}}_2.ll|{{.*}}_2.sym +; CHECK-PER-KERNEL-TABLE-EMPTY: +; +; CHECK-PER-KERNEL-SYM0-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM0: _ZTSZ4mainE10TU1_kernel0 +; CHECK-PER-KERNEL-SYM0-EMPTY: +; +; CHECK-PER-KERNEL-SYM2-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM2: _ZTSZ4mainE11TU0_kernel0 +; CHECK-PER-KERNEL-SYM2-EMPTY: +; +; CHECK-PER-KERNEL-SYM1-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM1: _ZTSZ4mainE11TU0_kernel1 +; CHECK-PER-KERNEL-SYM1-EMPTY: + +target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + ret void +} + +define spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + ret void +} + +define spir_kernel void @_ZTSZ4mainE10TU1_kernel0() #1 { + ret void +} + +declare spir_kernel void @_ZTS4mainE10TU1_kernel1() #1 + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp index c456403e6bc68..deaec74f99b32 100644 --- a/llvm/tools/llvm-split/llvm-split.cpp +++ b/llvm/tools/llvm-split/llvm-split.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" @@ -27,6 +28,8 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/SYCLModuleSplit.h" +#include "llvm/Transforms/Utils/SYCLUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" using namespace llvm; @@ -70,6 +73,65 @@ static cl::opt MCPU("mcpu", cl::desc("Target CPU, ignored if -mtriple is not used"), cl::value_desc("cpu"), cl::cat(SplitCategory)); +cl::opt SYCLSplitMode( + "sycl-split", cl::desc("module split mode"), cl::Optional, + cl::init(SPLIT_NONE), + cl::values( + clEnumValN(SPLIT_PER_TU, "source", + "1 output module per source (translation unit)"), + clEnumValN(SPLIT_PER_KERNEL, "kernel", "1 output module per kernel"), + clEnumValN(SPLIT_AUTO, "auto", "Choose split mode automatically")), + cl::cat(SplitCategory)); + +cl::opt OutputAssembly{"S", 
cl::desc("Write output as LLVM assembly"), + cl::cat(SplitCategory)}; + +void writeStringToFile(std::string_view Content, StringRef Path) { + std::error_code EC; + raw_fd_ostream OS(Path, EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + OS << Content << "\n"; +} + +void dumpSplitModulesAsTable(const std::vector &SplitModules, + StringRef Path) { + std::vector Columns = {"Code", "Symbols"}; + SYCLStringTable Table; + Table.emplace_back(std::move(Columns)); + for (const auto &[I, SM] : enumerate(SplitModules)) { + std::string SymbolsFile = (Twine(Path) + "_" + Twine(I) + ".sym").str(); + writeStringToFile(SM.Symbols, SymbolsFile); + std::vector Row = {SM.ModuleFilePath, SymbolsFile}; + Table.emplace_back(std::move(Row)); + } + + std::error_code EC; + raw_fd_ostream OS((Path + ".table").str(), EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + writeSYCLStringTable(Table, OS); +} + +Error runSYCLSplitModule(std::unique_ptr M) { + ModuleSplitterSettings Settings; + Settings.Mode = SYCLSplitMode; + Settings.OutputAssembly = OutputAssembly; + Settings.OutputPrefix = OutputFilename; + auto SplitModulesOrErr = splitSYCLModule(std::move(M), Settings); + if (!SplitModulesOrErr) + return SplitModulesOrErr.takeError(); + + dumpSplitModulesAsTable(*SplitModulesOrErr, OutputFilename); + return Error::success(); +} + int main(int argc, char **argv) { InitLLVM X(argc, argv); @@ -123,6 +185,16 @@ int main(int argc, char **argv) { Out->keep(); }; + if (SYCLSplitMode != IRSplitMode::SPLIT_NONE) { + auto E = runSYCLSplitModule(std::move(M)); + if (E) { + errs() << E << "\n"; + Err.print(argv[0], errs()); + } + + return 0; + } + if (TM) { if (PreserveLocals) { errs() << "warning: -preserve-locals has no effect when using " From 63d93edd09658b9c4965e16f783ceaf7cd890de1 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Thu, 31 Oct 2024 09:15:09 -0700 Subject: [PATCH 02/16] address some code review 
questions --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 71 ++++--- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 178 +++++++----------- llvm/tools/llvm-split/llvm-split.cpp | 17 +- 3 files changed, 110 insertions(+), 156 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 9b9b237f3b94b..bb5284815b4b4 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -24,7 +24,6 @@ #include // TODO(maksimsab): -// * check GenXSPIRVWriterAdaptor comments // * Maybe fix doxygen comments. namespace llvm { @@ -32,32 +31,30 @@ namespace llvm { class Function; class Module; -enum IRSplitMode { - SPLIT_PER_TU, // one module per translation unit - SPLIT_PER_KERNEL, // one module per kernel - SPLIT_AUTO, // automatically select split mode - SPLIT_NONE // no splitting +enum class IRSplitMode { + IRSM_PER_TU, // one module per translation unit + IRSM_PER_KERNEL, // one module per kernel + IRSM_AUTO, // automatically select split mode + IRSM_NONE // no splitting }; -// \returns IRSplitMode value if \p S is recognized. Otherwise, std::nullopt is -// returned. +/// \returns IRSplitMode value if \p S is recognized. Otherwise, std::nullopt is +/// returned. std::optional convertStringToSplitMode(StringRef S); // A vector that contains all entry point functions in a split module. using EntryPointSet = SetVector; -// enum class SyclEsimdSplitStatus { SYCL_ONLY, ESIMD_ONLY, SYCL_AND_ESIMD }; - -// Describes scope covered by each entry in the module-entry points map -// populated by the groupEntryPointsByScope function. +/// Describes scope covered by each entry in the module-entry points map +/// populated by the groupEntryPointsByScope function. 
enum EntryPointsGroupScope { Scope_PerKernel, // one entry per kernel Scope_PerModule, // one entry per module Scope_Global // single entry in the map for all kernels }; -// Represents a named group of device code entry points - kernels and -// SYCL_EXTERNAL functions. +/// Represents a named group of device code entry points - kernels and +/// SYCL_EXTERNAL functions. struct EntryPointGroup { // Properties an entry point (EP) group struct Properties { @@ -81,15 +78,15 @@ struct EntryPointGroup { void rebuild(const Module &M); }; -using EntryPointGroupVec = std::vector; +using EntryPointGroupVec = SmallVector; -// Annotates an llvm::Module with information necessary to perform and track -// result of device code (llvm::Module instances) splitting: -// - entry points of the module determined e.g. by a module splitter, as well -// as information about entry point origin (e.g. result of a scoped split) -// - its properties, such as whether it has specialization constants uses -// It also provides convenience functions for entry point set transformation -// between llvm::Function object and string representations. +/// Annotates an llvm::Module with information necessary to perform and track +/// result of device code (llvm::Module instances) splitting: +/// - entry points of the module determined e.g. by a module splitter, as well +/// as information about entry point origin (e.g. result of a scoped split) +/// - its properties, such as whether it has specialization constants uses +/// It also provides convenience functions for entry point set transformation +/// between llvm::Function object and string representations. class ModuleDesc { std::unique_ptr M; EntryPointGroup EntryPoints; @@ -135,12 +132,12 @@ class ModuleDesc { std::string makeSymbolTable() const; - void dump(raw_ostream &OS) const; + void dump() const; }; -// Module split support interface. 
-// It gets a module (in a form of module descriptor, to get additional info) and -// a collection of entry points groups. Each group specifies subset entry points +/// Module split support interface. +/// It gets a module (in a form of module descriptor, to get additional info) and +/// a collection of entry points groups. Each group specifies subset entry points // from input module that should be included in a split module. class ModuleSplitterBase { protected: @@ -169,15 +166,15 @@ class ModuleSplitterBase { virtual ~ModuleSplitterBase() = default; - // Gets next subsequence of entry points in an input module and provides split - // submodule containing these entry points and their dependencies. + /// Gets next subsequence of entry points in an input module and provides split + /// submodule containing these entry points and their dependencies. virtual ModuleDesc nextSplit() = 0; - // Returns a number of remaining modules, which can be split out using this - // splitter. The value is reduced by 1 each time nextSplit is called. + /// Returns a number of remaining modules, which can be split out using this + /// splitter. The value is reduced by 1 each time nextSplit is called. size_t remainingSplits() const { return Groups.size(); } - // Check that there are still submodules to split. + /// Check that there are still submodules to split. bool hasMoreSplits() const { return remainingSplits() > 0; } }; @@ -185,12 +182,8 @@ std::unique_ptr getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, bool EmitOnlyKernelsAsEntryPoints); -void dumpEntryPoints(raw_ostream &OS, const EntryPointSet &C, - std::string_view Msg = ""); -void dumpEntryPoints(raw_ostream &OS, const Module &M, - bool OnlyKernelsAreEntryPoints = false, - std::string_view Msg = ""); - +/// The structure represents a split LLVM Module accompanied by additional information. +/// Split Modules are stored on disk due to the high RAM consumption during the whole splitting process. 
struct SYCLSplitModule { std::string ModuleFilePath; std::string Symbols; @@ -212,11 +205,11 @@ struct ModuleSplitterSettings { }; /// Parses the string table. -Expected> +Expected> parseSYCLSplitModulesFromFile(StringRef File); /// Splits the given module \p M according to the given \p Settings. -Expected> +Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index 83b0f55dba5e9..88bd847bee839 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -41,6 +41,7 @@ #include #include #include +#include using namespace llvm; @@ -55,13 +56,13 @@ EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, IRSplitMode Mode, bool AutoSplitIsGlobalScope) { switch (Mode) { - case SPLIT_PER_TU: + case IRSplitMode::IRSM_PER_TU: return Scope_PerModule; - case SPLIT_PER_KERNEL: + case IRSplitMode::IRSM_PER_KERNEL: return Scope_PerKernel; - case SPLIT_AUTO: { + case IRSplitMode::IRSM_AUTO: { if (AutoSplitIsGlobalScope) return Scope_Global; @@ -71,7 +72,7 @@ EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, return Scope_PerModule; } - case SPLIT_NONE: + case IRSplitMode::IRSM_NONE: return Scope_Global; } @@ -272,52 +273,6 @@ void collectFunctionsAndGlobalVariablesToExtract( } } -// Check "spirv.ExecutionMode" named metadata in the module and remove nodes -// that reference kernels that have dead prototypes or don't reference any -// kernel at all (nullptr). Dead prototypes are removed as well. 
-void processSubModuleNamedMetadata(Module *M) { - auto ExecutionModeMD = M->getNamedMetadata("spirv.ExecutionMode"); - if (!ExecutionModeMD) - return; - - bool ContainsNodesToRemove = false; - std::vector ValueVec; - for (auto Op : ExecutionModeMD->operands()) { - assert(Op->getNumOperands() > 0); - if (!Op->getOperand(0)) { - ContainsNodesToRemove = true; - continue; - } - - // If the first operand is not nullptr then it has to be a kernel - // function. - Value *Val = cast(Op->getOperand(0))->getValue(); - Function *F = cast(Val); - // If kernel function is just a prototype and unused then we can remove it - // and later remove corresponding spirv.ExecutionMode metadata node. - if (F->isDeclaration() && F->use_empty()) { - F->eraseFromParent(); - ContainsNodesToRemove = true; - continue; - } - - // Rememver nodes which we need to keep in the module. - ValueVec.push_back(Op); - } - if (!ContainsNodesToRemove) - return; - - if (ValueVec.empty()) { - // If all nodes need to be removed then just remove named metadata - // completely. 
- ExecutionModeMD->eraseFromParent(); - } else { - ExecutionModeMD->clearOperands(); - for (auto MD : ValueVec) - ExecutionModeMD->addOperand(MD); - } -} - ModuleDesc extractSubModule(const ModuleDesc &MD, const SetVector GVs, EntryPointGroup ModuleEntryPoints) { @@ -351,7 +306,7 @@ ModuleDesc extractCallGraph(const ModuleDesc &MD, GVs, MD.getModule(), ModuleEntryPoints, CG, IncludeFunctionPredicate); ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); - LLVM_DEBUG(SplitM.dump(dbgs())); + LLVM_DEBUG(SplitM.dump()); SplitM.cleanup(); return SplitM; @@ -389,10 +344,10 @@ class ModuleSplitter : public ModuleSplitterBase { namespace llvm { std::optional convertStringToSplitMode(StringRef S) { - static const StringMap Values = {{"kernel", SPLIT_PER_KERNEL}, - {"source", SPLIT_PER_TU}, - {"auto", SPLIT_AUTO}, - {"none", SPLIT_NONE}}; + static const StringMap Values = {{"kernel", IRSplitMode::IRSM_PER_KERNEL}, + {"source", IRSplitMode::IRSM_PER_TU}, + {"auto", IRSplitMode::IRSM_AUTO}, + {"none", IRSplitMode::IRSM_NONE}}; auto It = Values.find(S); if (It == Values.end()) @@ -401,27 +356,61 @@ std::optional convertStringToSplitMode(StringRef S) { return It->second; } -void dumpEntryPoints(raw_ostream &OS, const EntryPointSet &C, +static void dumpEntryPoints(const EntryPointSet &C, std::string_view Msg) { constexpr size_t INDENT = 4; - OS.indent(INDENT) << "ENTRY POINTS" + dbgs().indent(INDENT) << "ENTRY POINTS" << " " << Msg << " {\n"; for (const Function *F : C) - OS.indent(INDENT) << " " << F->getName() << "\n"; + dbgs().indent(INDENT) << " " << F->getName() << "\n"; - OS.indent(INDENT) << "}\n"; + dbgs().indent(INDENT) << "}\n"; } -void dumpEntryPoints(raw_ostream &OS, const Module &M, - bool OnlyKernelsAreEntryPoints, std::string_view Msg) { - constexpr size_t INDENT = 4; - OS.indent(INDENT) << "ENTRY POINTS (Module)" - << " " << Msg << " {\n"; - for (const auto &F : M) - if (isEntryPoint(F, OnlyKernelsAreEntryPoints)) - OS.indent(INDENT) << " " 
<< F.getName() << "\n"; +// Check "spirv.ExecutionMode" named metadata in the module and remove nodes +// that reference kernels that have dead prototypes or don't reference any +// kernel at all (nullptr). Dead prototypes are removed as well. +static void processSubModuleNamedMetadata(Module *M) { + auto ExecutionModeMD = M->getNamedMetadata("spirv.ExecutionMode"); + if (!ExecutionModeMD) + return; + + bool ContainsNodesToRemove = false; + SmallVector ValueVec; + for (auto Op : ExecutionModeMD->operands()) { + assert(Op->getNumOperands() > 0); + if (!Op->getOperand(0)) { + ContainsNodesToRemove = true; + continue; + } + + // If the first operand is not nullptr then it has to be a kernel + // function. + Value *Val = cast(Op->getOperand(0))->getValue(); + Function *F = cast(Val); + // If kernel function is just a prototype and unused then we can remove it + // and later remove corresponding spirv.ExecutionMode metadata node. + if (F->isDeclaration() && F->use_empty()) { + F->eraseFromParent(); + ContainsNodesToRemove = true; + continue; + } + + // Remember nodes which we need to keep in the module. + ValueVec.push_back(Op); + } + if (!ContainsNodesToRemove) + return; - OS.indent(INDENT) << "}\n"; + if (ValueVec.empty()) { + // If all nodes need to be removed then just remove named metadata + // completely.
+ ExecutionModeMD->eraseFromParent(); + } else { + ExecutionModeMD->clearOperands(); + for (auto MD : ValueVec) + ExecutionModeMD->addOperand(MD); + } } void ModuleDesc::cleanup() { @@ -456,11 +445,11 @@ ModuleDesc ModuleDesc::clone() const { return NewMD; } -void ModuleDesc::dump(raw_ostream &OS) const { +void ModuleDesc::dump() const { assert(M && "dump of empty ModuleDesc"); - OS << "split_module::ModuleDesc[" << M->getName() << "] {\n"; - dumpEntryPoints(OS, entries(), EntryPoints.GroupId.c_str()); - OS << "}\n"; + dbgs() << "split_module::ModuleDesc[" << M->getName() << "] {\n"; + dumpEntryPoints(entries(), EntryPoints.GroupId.c_str()); + dbgs() << "}\n"; } void EntryPointGroup::saveNames(std::vector &Dest) const { @@ -570,34 +559,6 @@ class FunctionsCategorizer { Rules.emplace_back(Rule::RKind::K_SortedIntegersListMetadata, MetadataName); } - // Creates a rule, which adds a list of sorted dash-separated integers from - // converted into strings listed in a metadata to a resulting identifier. - // The form of the metadata is expected to be a metadata node, with its - // operands being either an integer or another metadata node with the - // form of {!"", iN }. 
- void registerAspectListRule(StringRef MetadataName) { - registerRule([MetadataName](Function *F) { - SmallString<128> Result; - if (MDNode *UsedAspects = F->getMetadata(MetadataName)) { - SmallVector Values; - for (const MDOperand &MDOp : UsedAspects->operands()) { - if (auto MDN = dyn_cast(MDOp)) { - assert(MDN->getNumOperands() == 2); - Values.push_back(mdconst::extract(MDN->getOperand(1)) - ->getZExtValue()); - } else if (auto C = mdconst::dyn_extract(MDOp)) - Values.push_back(C->getZExtValue()); - } - - llvm::sort(Values); - for (std::uint64_t V : Values) - Result += ("-" + Twine(V)).str(); - } - - return std::string(Result); - }); - } - private: struct Rule { struct FlagRuleData { @@ -659,7 +620,7 @@ class FunctionsCategorizer { Rule(Rule &&Other) = default; }; - std::vector Rules; + SmallVector Rules; }; std::string FunctionsCategorizer::computeCategoryFor(Function *F) const { @@ -787,14 +748,11 @@ getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, // output files in existing tests. 
Categorizer.registerSimpleStringAttributeRule("sycl-register-alloc-mode"); Categorizer.registerSimpleStringAttributeRule("sycl-grf-size"); - Categorizer.registerAspectListRule("sycl_used_aspects"); Categorizer.registerListOfIntegersInMetadataRule("reqd_work_group_size"); Categorizer.registerListOfIntegersInMetadataRule("work_group_num_dim"); Categorizer.registerListOfIntegersInMetadataRule( "intel_reqd_sub_group_size"); Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_OPTLEVEL); - Categorizer.registerSimpleStringMetadataRule("sycl_joint_matrix"); - Categorizer.registerSimpleStringMetadataRule("sycl_joint_matrix_mad"); break; } @@ -823,8 +781,8 @@ getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, Groups.emplace_back(Key, std::move(EntryPoints), MDProps); } - bool DoSplit = (Mode != SPLIT_NONE && - (Groups.size() > 1 || !Groups.cbegin()->Functions.empty())); + bool DoSplit = (Mode != IRSplitMode::IRSM_NONE && + (Groups.size() > 1 || !Groups.begin()->Functions.empty())); if (DoSplit) return std::make_unique(std::move(MD), std::move(Groups)); @@ -864,7 +822,7 @@ saveModuleDesc(ModuleDesc &MD, std::string Prefix, bool OutputAssembly) { return SM; } -Expected> +Expected> parseSYCLSplitModulesFromFile(StringRef File) { auto EntriesMBOrErr = llvm::MemoryBuffer::getFile(File); if (!EntriesMBOrErr) @@ -878,7 +836,7 @@ parseSYCLSplitModulesFromFile(StringRef File) { // "Code" and "Symbols" at the moment. 
static constexpr int NUMBER_COLUMNS = 2; ++LI; - std::vector Modules; + SmallVector Modules; while (!LI.is_at_eof()) { StringRef Line = *LI; if (Line.empty()) @@ -907,7 +865,7 @@ parseSYCLSplitModulesFromFile(StringRef File) { return Modules; } -Expected> +Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); auto Splitter = getDeviceCodeSplitter(std::move(MD), Settings.Mode, @@ -915,7 +873,7 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { /*EmitOnlyKernelsAsEntryPoints=*/false); size_t ID = 0; - std::vector OutputImages; + SmallVector OutputImages; while (Splitter->hasMoreSplits()) { ModuleDesc MD = Splitter->nextSplit(); diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp index deaec74f99b32..66fad2e0db3d8 100644 --- a/llvm/tools/llvm-split/llvm-split.cpp +++ b/llvm/tools/llvm-split/llvm-split.cpp @@ -32,6 +32,9 @@ #include "llvm/Transforms/Utils/SYCLUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" +#include +#include + using namespace llvm; static cl::OptionCategory SplitCategory("Split Options"); @@ -75,12 +78,12 @@ static cl::opt cl::opt SYCLSplitMode( "sycl-split", cl::desc("module split mode"), cl::Optional, - cl::init(SPLIT_NONE), + cl::init(IRSplitMode::IRSM_NONE), cl::values( - clEnumValN(SPLIT_PER_TU, "source", + clEnumValN(IRSplitMode::IRSM_PER_TU, "source", "1 output module per source (translation unit)"), - clEnumValN(SPLIT_PER_KERNEL, "kernel", "1 output module per kernel"), - clEnumValN(SPLIT_AUTO, "auto", "Choose split mode automatically")), + clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", "1 output module per kernel"), + clEnumValN(IRSplitMode::IRSM_AUTO, "auto", "Choose split mode automatically")), cl::cat(SplitCategory)); cl::opt OutputAssembly{"S", cl::desc("Write output as LLVM assembly"), @@ -97,7 +100,7 @@ void writeStringToFile(std::string_view Content, StringRef Path) { OS << Content << "\n"; } -void 
dumpSplitModulesAsTable(const std::vector &SplitModules, +void writeSplitModulesAsTable(ArrayRef SplitModules, StringRef Path) { std::vector Columns = {"Code", "Symbols"}; SYCLStringTable Table; @@ -128,7 +131,7 @@ Error runSYCLSplitModule(std::unique_ptr M) { if (!SplitModulesOrErr) return SplitModulesOrErr.takeError(); - dumpSplitModulesAsTable(*SplitModulesOrErr, OutputFilename); + writeSplitModulesAsTable(*SplitModulesOrErr, OutputFilename); return Error::success(); } @@ -185,7 +188,7 @@ int main(int argc, char **argv) { Out->keep(); }; - if (SYCLSplitMode != IRSplitMode::SPLIT_NONE) { + if (SYCLSplitMode != IRSplitMode::IRSM_NONE) { auto E = runSYCLSplitModule(std::move(M)); if (E) { errs() << E << "\n"; From a77328076abb7e16f31f65685d96ba2ee7d19234 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Mon, 4 Nov 2024 06:23:47 -0800 Subject: [PATCH 03/16] remove unused rebuild functions --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 29 ---- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 31 ---- .../device-code-split/per-aspect-split-1.ll | 133 ------------------ .../device-code-split/per-aspect-split-2.ll | 59 -------- .../device-code-split/per-aspect-split-3.ll | 94 ------------- 5 files changed, 346 deletions(-) delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index bb5284815b4b4..7d443988f1354 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -21,10 +21,6 @@ #include #include #include -#include - -// TODO(maksimsab): -// * Maybe fix doxygen comments. 
namespace llvm { @@ -72,10 +68,6 @@ struct EntryPointGroup { EntryPointGroup(StringRef GroupId, EntryPointSet Functions, const Properties &Props) : GroupId(GroupId), Functions(std::move(Functions)), Props(Props) {} - - void saveNames(std::vector &Dest) const; - void rebuildFromNames(const std::vector &Names, const Module &M); - void rebuild(const Module &M); }; using EntryPointGroupVec = SmallVector; @@ -97,11 +89,6 @@ class ModuleDesc { ModuleDesc(std::unique_ptr M, EntryPointGroup EntryPoints) : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {} - ModuleDesc(std::unique_ptr M, const std::vector &Names) - : M(std::move(M)) { - rebuildEntryPoints(Names); - } - const EntryPointSet &entries() const { return EntryPoints.Functions; } const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; } EntryPointSet &entries() { return EntryPoints.Functions; } @@ -109,22 +96,6 @@ class ModuleDesc { const Module &getModule() const { return *M; } std::unique_ptr releaseModulePtr() { return std::move(M); } - // Sometimes, during module transformations, some Function objects within the - // module are replaced with different Function objects with the same name. - // Entry points need to be updated to include the replacement function. - // save/rebuild pair of functions is provided to automate this process. - void saveEntryPointNames(std::vector &Dest) { - EntryPoints.saveNames(Dest); - } - - void rebuildEntryPoints(const std::vector &Names) { - EntryPoints.rebuildFromNames(Names, getModule()); - } - - void rebuildEntryPoints(const Module &M) { EntryPoints.rebuild(M); } - - void rebuildEntryPoints() { EntryPoints.rebuild(*M); } - // Cleans up module IR - removes dead globals, debug info etc. 
void cleanup(); diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index 88bd847bee839..6a3370bdff75b 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -41,7 +41,6 @@ #include #include #include -#include using namespace llvm; @@ -452,36 +451,6 @@ void ModuleDesc::dump() const { dbgs() << "}\n"; } -void EntryPointGroup::saveNames(std::vector &Dest) const { - Dest.reserve(Dest.size() + Functions.size()); - std::transform(Functions.begin(), Functions.end(), - std::inserter(Dest, Dest.end()), - [](const Function *F) { return F->getName().str(); }); -} - -void EntryPointGroup::rebuildFromNames(const std::vector &Names, - const Module &M) { - Functions.clear(); - auto It0 = Names.cbegin(); - auto It1 = Names.cend(); - std::for_each(It0, It1, [&](const std::string &Name) { - // Sometimes functions considered entry points (those for which isEntryPoint - // returned true) may be dropped by optimizations, such as AlwaysInliner. - // For example, if a linkonce_odr function is inlined and there are no other - // uses, AlwaysInliner drops it. It is responsibility of the user to make an - // entry point not have internal linkage (such as linkonce_odr) to guarantee - // its availability in the resulting device binary image. 
- if (Function *F = M.getFunction(Name)) - Functions.insert(F); - }); -} - -void EntryPointGroup::rebuild(const Module &M) { - for (const Function &F : M.functions()) - if (F.getCallingConv() == CallingConv::SPIR_KERNEL) - Functions.insert(const_cast(&F)); -} - std::string ModuleDesc::makeSymbolTable() const { std::string ST; for (const Function *F : EntryPoints.Functions) diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll deleted file mode 100644 index cbd91724959b8..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-1.ll +++ /dev/null @@ -1,133 +0,0 @@ -; This test emulates two translation units with 3 kernels: -; TU0_kernel0 - 1st translation unit, no aspects used -; TU0_kernel1 - 1st translation unit, aspect 1 is used -; TU1_kernel2 - 2nd translation unit, no aspects used - -; The test is intended to check that sycl-post-link correctly separates kernels -; that use aspects from kernels which doesn't use aspects regardless of device -; code split mode - -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not 
kernel1 - -; RUN: llvm-split -sycl-split=source -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 - -; RUN: llvm-split -sycl-split=kernel -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 - -; Regardless of device code split mode, each kernel should go into a separate -; device image - -; CHECK-M2-IR: define {{.*}} @TU0_kernel0 -; CHECK-M2-SYMS: TU0_kernel0 - -; CHECK-M1-IR: define {{.*}} 
@TU0_kernel1 -; CHECK-M1-SYMS: TU0_kernel1 - -; CHECK-M0-IR: define {{.*}} @TU1_kernel2 -; CHECK-M0-SYMS: TU1_kernel2 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -; FIXME: device globals should also be properly distributed across device images -; if they are of optional type -@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 - -define dso_local spir_kernel void @TU0_kernel0() #0 { -entry: - call spir_func void @foo() - ret void -} - -define dso_local spir_func void @foo() { -entry: - %a = alloca i32, align 4 - %call = call spir_func i32 @bar(i32 1) - %add = add nsw i32 2, %call - store i32 %add, i32* %a, align 4 - ret void -} - -; Function Attrs: nounwind -define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { -entry: - %arg.addr = alloca i32, align 4 - store i32 %arg, i32* %arg.addr, align 4 - %0 = load i32, i32* %arg.addr, align 4 - ret i32 %0 -} - -define dso_local spir_kernel void @TU0_kernel1() #0 !sycl_used_aspects !2 { -entry: - call spir_func void @foo1() - ret void -} - -; Function Attrs: nounwind -define dso_local spir_func void @foo1() { -entry: - %a = alloca i32, align 4 - store i32 2, i32* %a, align 4 - ret void -} - -define dso_local spir_kernel void @TU1_kernel2() #1 { -entry: - call spir_func void @foo2() - ret void -} - -; Function Attrs: nounwind -define dso_local spir_func void @foo2() { -entry: - %a = alloca i32, align 4 - %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 - %add = add nsw i32 4, %0 - store i32 %add, i32* %a, align 4 - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } - -!opencl.spir.version = !{!0, !0} -!spirv.Source = !{!1, !1} - -!0 = !{i32 1, i32 2} -!1 = !{i32 4, i32 100000} -!2 = !{i32 1} diff 
--git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll deleted file mode 100644 index f3f919fe45534..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-2.ll +++ /dev/null @@ -1,59 +0,0 @@ -; The test is intended to check that SYCL Module splitting correctly groups kernels -; by unique sets of aspects used in them - -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE -; -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel3 --implicit-check-not kernel1 \ -; RUN: --implicit-check-not kernel2 -; -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ -; RUN: --implicit-check-not kernel2 -; -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 - -; CHECK-TABLE: Code -; CHECK-TABLE-NEXT: _0.sym -; CHECK-TABLE-NEXT: _1.sym -; CHECK-TABLE-NEXT: _2.sym -; CHECK-TABLE-EMPTY: - -; CHECK-M0-SYMS: kernel3 - -; CHECK-M1-SYMS: kernel1 -; CHECK-M1-SYMS: kernel2 - -; CHECK-M2-SYMS: kernel0 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -define dso_local spir_kernel void @kernel0() #0 !sycl_used_aspects !1 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel1() #0 !sycl_used_aspects !2 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel2() #0 !sycl_used_aspects !3 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel3() #0 !sycl_used_aspects !4 { -entry: - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } - -!1 = !{i32 1} -!2 = !{i32 1, i32 2} -!3 = !{i32 2, i32 1} -!4 = !{i32 2, i32 3, i32 4} diff 
--git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll deleted file mode 100644 index 2ac32bdefa61a..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-aspect-split-3.ll +++ /dev/null @@ -1,94 +0,0 @@ -; This test is intended to check that per-aspect device code split works as -; expected with SYCL_EXTERNAL functions - -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE -; -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ -; RUN: --implicit-check-not foo --implicit-check-not kernel1 -; -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ -; RUN: --implicit-check-not foo --implicit-check-not kernel0 -; -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not foo \ -; RUN: --implicit-check-not bar -; -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not bar - -; We expect to see 3 modules generated: -; -; CHECK-TABLE: Code -; CHECK-TABLE-NEXT: _0.sym -; CHECK-TABLE-NEXT: _1.sym -; CHECK-TABLE-NEXT: _2.sym -; CHECK-TABLE-EMPTY: - -; sycl-post-link aims to achieve two goals while doing splitting: -; - each kernel must be self-contained, i.e. all functions called from a -; kernel must reside in the same device image -; - each entry point should be assigned to a correct device image in -; accordance with selected device code split mode -; -; In this test @bar and @foo are SYCL_EXTERNAL functions and they are treated -; as entry points. -; -; @bar uses the same list of aspects as @kernel0 which calls it and therefore -; they can be put into the same device image. There also goes @baz, because of -; the same list of used aspects. 
-; -; CHECK-M0-SYMS: bar -; CHECK-M0-SYMS: baz -; CHECK-M0-SYMS: kernel0 -; -; List of aspects used by @foo is different from the one attached to @kernel1 -; which calls @foo (for example, @kernel1 uses an extra optional feature besides -; ones used in @foo). As a result, @foo should be both included into the same -; device image as @kernel1 to make it self contained, but at the same time it -; should also present in a separate device image, because it is an entry point -; with unique set of used aspects. -; -; CHECK-M1-SYMS: kernel1 -; -; CHECK-M2-SYMS: foo -; -; @kernel1 uses @foo and therefore @foo should be present in the same module as -; @kernel1 as well -; CHECK-M1-IR-DAG: define {{.*}}spir_func void @foo -; CHECK-M1-IR-DAG: define spir_kernel void @kernel1 - - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -define spir_func void @foo() #0 !sycl_used_aspects !1 { - ret void -} - -define spir_func void @bar() #1 !sycl_used_aspects !2 { - ret void -} - -define spir_func void @baz() #1 !sycl_used_aspects !2 { - ret void -} - -define spir_kernel void @kernel0() #1 !sycl_used_aspects !2 { -entry: - call void @bar() - ret void -} - -define spir_kernel void @kernel1() #0 !sycl_used_aspects !3 { -entry: - call void @foo() - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } - -!1 = !{i32 1} -!2 = !{i32 2} -!3 = !{i32 3, i32 1} From 5e176d6e062de5fe7aab6387b6ddf4224fc699ed Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Mon, 4 Nov 2024 08:30:25 -0800 Subject: [PATCH 04/16] remove unused functions from ESIMD part --- .../include/llvm/Transforms/Utils/SYCLUtils.h | 42 ------ llvm/lib/Transforms/Utils/SYCLUtils.cpp | 136 ------------------ 2 files changed, 178 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h index 
45ddc1734f922..5acf845238b9a 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h @@ -73,48 +73,6 @@ void traverseCallgraphUp( ErrorOnNonCallUse, functionFilter); } -/// Tells if this value is a bit cast or address space cast. -bool isCast(const Value *V); - -/// Tells if this value is a GEP instructions with all zero indices. -bool isZeroGEP(const Value *V); - -/// Climbs up the use-def chain of given value until a value which is not a -/// bit cast or address space cast is met. -const Value *stripCasts(const Value *V); -Value *stripCasts(Value *V); - -/// Climbs up the use-def chain of given value until a value is met which is -/// neither of: -/// - bit cast -/// - address space cast -/// - GEP instruction with all zero indices -const Value *stripCastsAndZeroGEPs(const Value *V); -Value *stripCastsAndZeroGEPs(Value *V); - -/// Collects uses of given value "looking through" casts. I.e. if a use is a -/// cast (chain), then uses of the result of the cast (chain) are collected. -void collectUsesLookThroughCasts(const Value *V, - SmallPtrSetImpl &Uses); - -/// Collects uses of given pointer-typed value "looking through" casts and GEPs -/// with all zero indices - those pointer transformation instructions which -/// don't change pointed-to value. E.g. if a use is a cast (chain), then uses of -/// the result of the cast (chain) are collected. 
-void collectUsesLookThroughCastsAndZeroGEPs(const Value *V, - SmallPtrSetImpl &Uses); - -void collectUsesLookThroughCasts(const Value *V, - SmallPtrSetImpl &Uses); - -void collectUsesLookThroughCastsAndZeroGEPs(const Value *V, - SmallPtrSetImpl &Uses); - -bool collectPossibleStoredVals( - Value *Addr, SmallPtrSetImpl &Vals, - std::function EscapesIfAddrIsArgOf = - [](const CallInst *) { return true; }); - inline bool isSYCLExternalFunction(const Function *F) { return F->hasFnAttribute(ATTR_SYCL_MODULE_ID); } diff --git a/llvm/lib/Transforms/Utils/SYCLUtils.cpp b/llvm/lib/Transforms/Utils/SYCLUtils.cpp index 95ce5522a2600..450b9d6380feb 100644 --- a/llvm/lib/Transforms/Utils/SYCLUtils.cpp +++ b/llvm/lib/Transforms/Utils/SYCLUtils.cpp @@ -75,142 +75,6 @@ void traverseCallgraphUp(llvm::Function *F, CallGraphNodeAction ActionF, } } -bool isCast(const Value *V) { - int Opc = Operator::getOpcode(V); - return (Opc == Instruction::BitCast) || (Opc == Instruction::AddrSpaceCast); -} - -bool isZeroGEP(const Value *V) { - const auto *GEPI = dyn_cast(V); - return GEPI && GEPI->hasAllZeroIndices(); -} - -Value *stripCasts(Value *V) { - return const_cast(stripCasts(const_cast(V))); -} - -const Value *stripCastsAndZeroGEPs(const Value *V); - -Value *stripCastsAndZeroGEPs(Value *V) { - return const_cast( - stripCastsAndZeroGEPs(const_cast(V))); -} - -const Value *stripCasts(const Value *V) { - if (!V->getType()->isPtrOrPtrVectorTy()) - return V; - // Even though we don't look through PHI nodes, we could be called on an - // instruction in an unreachable block, which may be on a cycle. 
- SmallPtrSet Visited; - Visited.insert(V); - - do { - if (isCast(V)) { - V = cast(V)->getOperand(0); - } - assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); - } while (Visited.insert(V).second); - return V; -} - -const Value *stripCastsAndZeroGEPs(const Value *V) { - if (!V->getType()->isPtrOrPtrVectorTy()) - return V; - // Even though we don't look through PHI nodes, we could be called on an - // instruction in an unreachable block, which may be on a cycle. - SmallPtrSet Visited; - Visited.insert(V); - - do { - if (isCast(V)) { - V = cast(V)->getOperand(0); - } else if (isZeroGEP(V)) { - V = cast(V)->getOperand(0); - } - assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!"); - } while (Visited.insert(V).second); - return V; -} - -void collectUsesLookThroughCasts(const Value *V, - SmallPtrSetImpl &Uses) { - for (const Use &U : V->uses()) { - Value *VV = U.getUser(); - - if (isCast(VV)) { - collectUsesLookThroughCasts(VV, Uses); - } else { - Uses.insert(&U); - } - } -} - -void collectUsesLookThroughCastsAndZeroGEPs( - const Value *V, SmallPtrSetImpl &Uses) { - assert(V->getType()->isPtrOrPtrVectorTy() && "pointer type expected"); - - for (const Use &U : V->uses()) { - Value *VV = U.getUser(); - - if (isCast(VV) || isZeroGEP(VV)) { - collectUsesLookThroughCastsAndZeroGEPs(VV, Uses); - } else { - Uses.insert(&U); - } - } -} - -// Tries to find possible values stored into given address. -// Returns true if the set of values could be reliably found, false otherwise. 
-bool collectPossibleStoredVals( - Value *Addr, SmallPtrSetImpl &Vals, - std::function EscapesIfAddrIsArgOf) { - SmallPtrSet Visited; - AllocaInst *LocalVar = dyn_cast_or_null(stripCasts(Addr)); - - if (!LocalVar) { - return false; - } - SmallPtrSet Uses; - collectUsesLookThroughCasts(LocalVar, Uses); - - for (const Use *U : Uses) { - Value *V = U->getUser(); - - if (auto *StI = dyn_cast(V)) { - if (U != &StI->getOperandUse(StoreInst::getPointerOperandIndex())) { - // this is double indirection - not supported - return false; - } - V = stripCasts(StI->getValueOperand()); - - if (auto *LI = dyn_cast(V)) { - // A value loaded from another address is stored at this address - - // recurse into the other address - if (!collectPossibleStoredVals(LI->getPointerOperand(), Vals)) { - return false; - } - } else { - Vals.insert(V); - } - continue; - } - if (const auto *CI = dyn_cast(V)) { - if (EscapesIfAddrIsArgOf(CI)) { - return false; - } - continue; - } - if (isa(V)) { - // LoadInst from this addr is OK, as it does not affect what can be stored - // through the addr - continue; - } - return false; - } - return true; -} - bool removeSYCLKernelsConstRefArray(Module &M) { GlobalVariable *GV = M.getGlobalVariable("llvm.used"); From 6a943c317b19a7af423d6c74f1f3631566d0ee26 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Tue, 5 Nov 2024 08:45:49 -0800 Subject: [PATCH 05/16] Remove ModuleDesc::clone(). Remove IROutputOnly. 
Apply clang-format --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 24 ++++++------- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 35 ++++++------------- 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 7d443988f1354..179e2f0af762d 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -28,10 +28,10 @@ class Function; class Module; enum class IRSplitMode { - IRSM_PER_TU, // one module per translation unit - IRSM_PER_KERNEL, // one module per kernel - IRSM_AUTO, // automatically select split mode - IRSM_NONE // no splitting + IRSM_PER_TU, // one module per translation unit + IRSM_PER_KERNEL, // one module per kernel + IRSM_AUTO, // automatically select split mode + IRSM_NONE // no splitting }; /// \returns IRSplitMode value if \p S is recognized. Otherwise, std::nullopt is @@ -99,16 +99,15 @@ class ModuleDesc { // Cleans up module IR - removes dead globals, debug info etc. void cleanup(); - ModuleDesc clone() const; - std::string makeSymbolTable() const; void dump() const; }; /// Module split support interface. -/// It gets a module (in a form of module descriptor, to get additional info) and -/// a collection of entry points groups. Each group specifies subset entry points +/// It gets a module (in a form of module descriptor, to get additional info) +/// and a collection of entry points groups. Each group specifies subset entry +/// points // from input module that should be included in a split module. class ModuleSplitterBase { protected: @@ -137,8 +136,8 @@ class ModuleSplitterBase { virtual ~ModuleSplitterBase() = default; - /// Gets next subsequence of entry points in an input module and provides split - /// submodule containing these entry points and their dependencies. 
+ /// Gets next subsequence of entry points in an input module and provides + /// split submodule containing these entry points and their dependencies. virtual ModuleDesc nextSplit() = 0; /// Returns a number of remaining modules, which can be split out using this @@ -153,8 +152,9 @@ std::unique_ptr getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, bool EmitOnlyKernelsAsEntryPoints); -/// The structure represents a split LLVM Module accompanied by additional information. -/// Split Modules are being stored at disk due to the high RAM consumption during the whole splitting process. +/// The structure represents a split LLVM Module accompanied by additional +/// information. Split Modules are being stored at disk due to the high RAM +/// consumption during the whole splitting process. struct SYCLSplitModule { std::string ModuleFilePath; std::string Symbols; diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index 6a3370bdff75b..5070f1c10df04 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -49,11 +49,8 @@ using namespace llvm; namespace { // Identifying name for global scope constexpr char GLOBAL_SCOPE_NAME[] = ""; -constexpr char SYCL_SCOPE_NAME[] = ""; -EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, - IRSplitMode Mode, - bool AutoSplitIsGlobalScope) { +EntryPointsGroupScope selectDeviceCodeGroupScope(IRSplitMode Mode) { switch (Mode) { case IRSplitMode::IRSM_PER_TU: return Scope_PerModule; @@ -62,9 +59,6 @@ EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, return Scope_PerKernel; case IRSplitMode::IRSM_AUTO: { - if (AutoSplitIsGlobalScope) - return Scope_Global; - // At the moment, we assume that per-source split is the best way of // splitting device code and can always be used except for cases handled // above. 
@@ -343,10 +337,11 @@ class ModuleSplitter : public ModuleSplitterBase { namespace llvm { std::optional convertStringToSplitMode(StringRef S) { - static const StringMap Values = {{"kernel", IRSplitMode::IRSM_PER_KERNEL}, - {"source", IRSplitMode::IRSM_PER_TU}, - {"auto", IRSplitMode::IRSM_AUTO}, - {"none", IRSplitMode::IRSM_NONE}}; + static const StringMap Values = { + {"kernel", IRSplitMode::IRSM_PER_KERNEL}, + {"source", IRSplitMode::IRSM_PER_TU}, + {"auto", IRSplitMode::IRSM_AUTO}, + {"none", IRSplitMode::IRSM_NONE}}; auto It = Values.find(S); if (It == Values.end()) @@ -355,11 +350,10 @@ std::optional convertStringToSplitMode(StringRef S) { return It->second; } -static void dumpEntryPoints(const EntryPointSet &C, - std::string_view Msg) { +static void dumpEntryPoints(const EntryPointSet &C, std::string_view Msg) { constexpr size_t INDENT = 4; dbgs().indent(INDENT) << "ENTRY POINTS" - << " " << Msg << " {\n"; + << " " << Msg << " {\n"; for (const Function *F : C) dbgs().indent(INDENT) << " " << F->getName() << "\n"; @@ -437,13 +431,6 @@ void ModuleDesc::cleanup() { processSubModuleNamedMetadata(M.get()); } -ModuleDesc ModuleDesc::clone() const { - std::unique_ptr NewModule = CloneModule(getModule()); - ModuleDesc NewMD(std::move(NewModule)); - NewMD.EntryPoints.Props = EntryPoints.Props; - return NewMD; -} - void ModuleDesc::dump() const { assert(M && "dump of empty ModuleDesc"); dbgs() << "split_module::ModuleDesc[" << M->getName() << "] {\n"; @@ -675,12 +662,11 @@ std::string FunctionsCategorizer::computeCategoryFor(Function *F) const { } // namespace std::unique_ptr -getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, +getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool EmitOnlyKernelsAsEntryPoints) { FunctionsCategorizer Categorizer; - EntryPointsGroupScope Scope = - selectDeviceCodeGroupScope(MD.getModule(), Mode, IROutputOnly); + EntryPointsGroupScope Scope = selectDeviceCodeGroupScope(Mode); switch (Scope) { case 
Scope_Global: @@ -838,7 +824,6 @@ Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); auto Splitter = getDeviceCodeSplitter(std::move(MD), Settings.Mode, - /*IROutputOnly=*/false, /*EmitOnlyKernelsAsEntryPoints=*/false); size_t ID = 0; From 3171ebe7ebbeb9227dbfa052f62aff9ca364ec05 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Thu, 14 Nov 2024 07:37:51 -0800 Subject: [PATCH 06/16] Simplify ModuleSplitterBase ModuleCopier is removed. ModuleSplitterBase is replaced by simplified ModuleSplitter. getDeviceCodeSplitter is changed to selectEntryPointGroups function. collectFunctionsAndGlobalVariablesToExtract is polished according to LLVM Coding Standards. --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 53 +------ llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 134 ++++++++++-------- 2 files changed, 73 insertions(+), 114 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 179e2f0af762d..1f3713a6dcc32 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -39,7 +39,7 @@ enum class IRSplitMode { std::optional convertStringToSplitMode(StringRef S); // A vector that contains all entry point functions in a split module. -using EntryPointSet = SetVector; +using EntryPointSet = SetVector; /// Describes scope covered by each entry in the module-entry points map /// populated by the groupEntryPointsByScope function. @@ -70,8 +70,7 @@ struct EntryPointGroup { : GroupId(GroupId), Functions(std::move(Functions)), Props(Props) {} }; -using EntryPointGroupVec = SmallVector; - +// TODO: move it into cpp file. /// Annotates an llvm::Module with information necessary to perform and track /// result of device code (llvm::Module instances) splitting: /// - entry points of the module determined e.g. 
by a module splitter, as well @@ -104,54 +103,6 @@ class ModuleDesc { void dump() const; }; -/// Module split support interface. -/// It gets a module (in a form of module descriptor, to get additional info) -/// and a collection of entry points groups. Each group specifies subset entry -/// points -// from input module that should be included in a split module. -class ModuleSplitterBase { -protected: - ModuleDesc Input; - EntryPointGroupVec Groups; - -protected: - EntryPointGroup nextGroup() { - assert(hasMoreSplits() && "Reached end of entry point groups list."); - EntryPointGroup Res = std::move(Groups.back()); - Groups.pop_back(); - return Res; - } - - Module &getInputModule() { return Input.getModule(); } - - std::unique_ptr releaseInputModule() { - return Input.releaseModulePtr(); - } - -public: - ModuleSplitterBase(ModuleDesc MD, EntryPointGroupVec GroupVec) - : Input(std::move(MD)), Groups(std::move(GroupVec)) { - assert(!Groups.empty() && "Entry points groups collection is empty!"); - } - - virtual ~ModuleSplitterBase() = default; - - /// Gets next subsequence of entry points in an input module and provides - /// split submodule containing these entry points and their dependencies. - virtual ModuleDesc nextSplit() = 0; - - /// Returns a number of remaining modules, which can be split out using this - /// splitter. The value is reduced by 1 each time nextSplit is called. - size_t remainingSplits() const { return Groups.size(); } - - /// Check that there are still submodules to split. - bool hasMoreSplits() const { return remainingSplits() > 0; } -}; - -std::unique_ptr -getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, bool IROutputOnly, - bool EmitOnlyKernelsAsEntryPoints); - /// The structure represents a split LLVM Module accompanied by additional /// information. Split Modules are being stored at disk due to the high RAM /// consumption during the whole splitting process. 
diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index 5070f1c10df04..c09f731cf4c9a 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -231,43 +231,34 @@ class DependencyGraph { void collectFunctionsAndGlobalVariablesToExtract( SetVector &GVs, const Module &M, - const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &Deps, - const std::function &IncludeFunctionPredicate = - nullptr) { + const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &DG) { // We start with module entry points for (const auto *F : ModuleEntryPoints.Functions) GVs.insert(F); // Non-discardable global variables are also include into the initial set - for (const auto &GV : M.globals()) { + for (const auto &GV : M.globals()) if (!GV.isDiscardableIfUnused()) GVs.insert(&GV); - } // GVs has SetVector type. This type inserts a value only if it is not yet // present there. So, recursion is not expected here. size_t Idx = 0; while (Idx < GVs.size()) { - const auto *Obj = GVs[Idx++]; + const GlobalValue *Obj = GVs[Idx++]; - for (const GlobalValue *Dep : Deps.dependencies(Obj)) { + for (const GlobalValue *Dep : DG.dependencies(Obj)) { if (const auto *Func = dyn_cast(Dep)) { - if (Func->isDeclaration()) - continue; - - // Functions can be additionally filtered - if (!IncludeFunctionPredicate || IncludeFunctionPredicate(Func)) + if (!Func->isDeclaration()) GVs.insert(Func); - } else { - // Global variables are added unconditionally - GVs.insert(Dep); - } + } else + GVs.insert(Dep); // Global variables are added unconditionally } } } ModuleDesc extractSubModule(const ModuleDesc &MD, - const SetVector GVs, + const SetVector &GVs, EntryPointGroup ModuleEntryPoints) { const Module &M = MD.getModule(); // For each group of entry points collect all dependencies. 
@@ -291,45 +282,52 @@ ModuleDesc extractSubModule(const ModuleDesc &MD, // in ModuleEntryPoints vector, in addition to the entry point functions. ModuleDesc extractCallGraph(const ModuleDesc &MD, EntryPointGroup ModuleEntryPoints, - const DependencyGraph &CG, - const std::function - &IncludeFunctionPredicate = nullptr) { + const DependencyGraph &DG) { SetVector GVs; - collectFunctionsAndGlobalVariablesToExtract( - GVs, MD.getModule(), ModuleEntryPoints, CG, IncludeFunctionPredicate); + collectFunctionsAndGlobalVariablesToExtract(GVs, MD.getModule(), + ModuleEntryPoints, DG); ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); LLVM_DEBUG(SplitM.dump()); SplitM.cleanup(); - return SplitM; } -class ModuleCopier : public ModuleSplitterBase { -public: - using ModuleSplitterBase::ModuleSplitterBase; // to inherit base constructors - - ModuleDesc nextSplit() override { - ModuleDesc Desc{releaseInputModule(), nextGroup()}; - // Do some basic optimization like unused symbol removal - // even if there was no split. - Desc.cleanup(); - return Desc; +using EntryPointGroupVec = SmallVector; + +/// Module Splitter. +/// It gets a module (in a form of module descriptor, to get additional info) +/// and a collection of entry points groups. Each group specifies subset entry +/// points from input module that should be included in a split module. 
+class ModuleSplitter { +private: + ModuleDesc Input; + EntryPointGroupVec Groups; + DependencyGraph DG; + +private: + EntryPointGroup drawEntryPointGroup() { + assert(Groups.size() > 0 && "Reached end of entry point groups list."); + EntryPointGroup Group = std::move(Groups.back()); + Groups.pop_back(); + return Group; } -}; -class ModuleSplitter : public ModuleSplitterBase { public: ModuleSplitter(ModuleDesc MD, EntryPointGroupVec GroupVec) - : ModuleSplitterBase(std::move(MD), std::move(GroupVec)), - CG(Input.getModule()) {} + : Input(std::move(MD)), Groups(std::move(GroupVec)), + DG(Input.getModule()) { + assert(!Groups.empty() && "Entry points groups collection is empty!"); + } - ModuleDesc nextSplit() override { - return extractCallGraph(Input, nextGroup(), CG); + /// Gets next subsequence of entry points in an input module and provides + /// split submodule containing these entry points and their dependencies. + ModuleDesc getNextSplit() { + return extractCallGraph(Input, drawEntryPointGroup(), DG); } -private: - DependencyGraph CG; + /// Check that there are still submodules to split. + bool hasMoreSplits() const { return Groups.size() > 0; } }; } // namespace @@ -464,11 +462,12 @@ class FunctionsCategorizer { public: FunctionsCategorizer() = default; - std::string computeCategoryFor(Function *) const; + std::string computeCategoryFor(const Function *) const; // Accepts a callback, which should return a string based on provided // function, which will be used as an entry points group identifier. 
- void registerRule(const std::function &Callback) { + void + registerRule(const std::function &Callback) { Rules.emplace_back(Rule::RKind::K_Callback, Callback); } @@ -523,7 +522,7 @@ class FunctionsCategorizer { private: std::variant> + std::function> Storage; public: @@ -579,7 +578,7 @@ class FunctionsCategorizer { SmallVector Rules; }; -std::string FunctionsCategorizer::computeCategoryFor(Function *F) const { +std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { SmallString<256> Result; for (const auto &R : Rules) { StringRef AttrName; @@ -659,11 +658,12 @@ std::string FunctionsCategorizer::computeCategoryFor(Function *F) const { return static_cast(Result); } + } // namespace -std::unique_ptr -getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, - bool EmitOnlyKernelsAsEntryPoints) { +static EntryPointGroupVec +selectEntryPointGroups(const ModuleDesc &MD, IRSplitMode Mode, + bool EmitOnlyKernelsAsEntryPoints) { FunctionsCategorizer Categorizer; EntryPointsGroupScope Scope = selectDeviceCodeGroupScope(Mode); @@ -672,13 +672,13 @@ getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, case Scope_Global: // We simply perform entry points filtering, but group all of them together. Categorizer.registerRule( - [](Function *) -> std::string { return GLOBAL_SCOPE_NAME; }); + [](const Function *) -> std::string { return GLOBAL_SCOPE_NAME; }); break; case Scope_PerKernel: // Per-kernel split is quite simple: every kernel goes into a separate // module and that's it, no other rules required. 
Categorizer.registerRule( - [](Function *F) -> std::string { return F->getName().str(); }); + [](const Function *F) -> std::string { return F->getName().str(); }); break; case Scope_PerModule: // The most complex case, because we should account for many other features @@ -716,7 +716,7 @@ getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, std::map EntryPointsMap; // Only process module entry points: - for (auto &F : MD.getModule().functions()) { + for (const auto &F : MD.getModule().functions()) { if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints)) continue; @@ -736,13 +736,7 @@ getDeviceCodeSplitter(ModuleDesc MD, IRSplitMode Mode, Groups.emplace_back(Key, std::move(EntryPoints), MDProps); } - bool DoSplit = (Mode != IRSplitMode::IRSM_NONE && - (Groups.size() > 1 || !Groups.begin()->Functions.empty())); - - if (DoSplit) - return std::make_unique(std::move(MD), std::move(Groups)); - - return std::make_unique(std::move(MD), std::move(Groups)); + return Groups; } static Error saveModuleIRInFile(Module &M, StringRef FilePath, @@ -823,13 +817,27 @@ parseSYCLSplitModulesFromFile(StringRef File) { Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); - auto Splitter = getDeviceCodeSplitter(std::move(MD), Settings.Mode, - /*EmitOnlyKernelsAsEntryPoints=*/false); + EntryPointGroupVec Groups = + selectEntryPointGroups(MD, Settings.Mode, + /*EmitOnlyKernelsAsEntryPoints=*/false); - size_t ID = 0; SmallVector OutputImages; - while (Splitter->hasMoreSplits()) { - ModuleDesc MD = Splitter->nextSplit(); + if (Groups.size() < 2) { + // FIXME(maksimsab): this branch is not tested yet. 
+ std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); + auto ImageOrErr = + saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); + if (!ImageOrErr) + return ImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*ImageOrErr)); + return OutputImages; + } + + ModuleSplitter Splitter(std::move(MD), std::move(Groups)); + size_t ID = 0; + while (Splitter.hasMoreSplits()) { + ModuleDesc MD = Splitter.getNextSplit(); std::string OutIRFileName = (Settings.OutputPrefix + "_" + Twine(ID)).str(); auto SplitImageOrErr = From c7297ac2e2277743c8cf7a2e654b107e557d481f Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Mon, 18 Nov 2024 07:39:57 -0800 Subject: [PATCH 07/16] Remove EmitOnlyKernelsAsEntryPoints Remove mentions of indirectly-callable Remove unused rules in FunctionsCategorizer --- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 151 +----------------- .../indirectly-callable-auto-split.ll | 45 ------ .../indirectly-callable-per-kernel-split.ll | 53 ------ 3 files changed, 7 insertions(+), 242 deletions(-) delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index c09f731cf4c9a..44354d0df1ad4 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -72,28 +72,12 @@ EntryPointsGroupScope selectDeviceCodeGroupScope(IRSplitMode Mode) { llvm_unreachable("unsupported split mode"); } -// Return true if the function is a SPIRV or SYCL builtin, e.g.
-// _Z28__spirv_GlobalInvocationId_xv -bool isSpirvSyclBuiltin(StringRef FName) { - if (!FName.consume_front("_Z")) - return false; - // now skip the digits - FName = FName.drop_while([](char C) { return std::isdigit(C); }); - - return FName.starts_with("__spirv_") || FName.starts_with("__sycl_"); -} - -// Return true if the function name starts with "__builtin_" -bool isGenericBuiltin(StringRef FName) { - return FName.starts_with("__builtin_"); -} - bool isKernel(const Function &F) { return F.getCallingConv() == CallingConv::SPIR_KERNEL || F.getCallingConv() == CallingConv::AMDGPU_KERNEL; } -bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { +bool isEntryPoint(const Function &F) { // Skip declarations, if any: they should not be included into a vector of // entry points groups or otherwise we will end up with incorrectly generated // list of symbols. @@ -101,21 +85,7 @@ bool isEntryPoint(const Function &F, bool EmitOnlyKernelsAsEntryPoints) { return false; // Kernels are always considered to be entry points - if (isKernel(F)) - return true; - - if (!EmitOnlyKernelsAsEntryPoints) { - // If not disabled, SYCL_EXTERNAL functions with sycl-module-id attribute - // are also considered as entry points (except __spirv_* and __sycl_* - // functions) - return llvm::isSYCLExternalFunction(&F) && - !isSpirvSyclBuiltin(F.getName()) && !isGenericBuiltin(F.getName()); - } - - // Even if we are emitting only kernels as entry points, virtual functions - // should still be treated as entry points, because they are going to be - // outlined into separate device images and linked in later. - return F.hasFnAttribute("indirectly-callable"); + return isKernel(F); } // Represents "dependency" or "use" graph of global objects (functions and @@ -477,52 +447,16 @@ class FunctionsCategorizer { Rules.emplace_back(Rule::RKind::K_SimpleStringAttribute, AttrName); } - // Creates a simple rule, which adds a value of a string metadata into a - // resulting identifier. 
- void registerSimpleStringMetadataRule(StringRef MetadataName) { - Rules.emplace_back(Rule::RKind::K_SimpleStringMetadata, MetadataName); - } - - // Creates a simple rule, which adds one or another value to a resulting - // identifier based on the presence of a metadata on a function. - void registerSimpleFlagAttributeRule(StringRef AttrName, - StringRef IfPresentStr, - StringRef IfAbsentStr = "") { - Rules.emplace_back(Rule::RKind::K_FlagAttribute, - Rule::FlagRuleData{AttrName, IfPresentStr, IfAbsentStr}); - } - - // Creates a simple rule, which adds one or another value to a resulting - // identifier based on the presence of a metadata on a function. - void registerSimpleFlagMetadataRule(StringRef MetadataName, - StringRef IfPresentStr, - StringRef IfAbsentStr = "") { - Rules.emplace_back( - Rule::RKind::K_FlagMetadata, - Rule::FlagRuleData{MetadataName, IfPresentStr, IfAbsentStr}); - } - // Creates a rule, which adds a list of dash-separated integers converted // into strings listed in a metadata to a resulting identifier. void registerListOfIntegersInMetadataRule(StringRef MetadataName) { Rules.emplace_back(Rule::RKind::K_IntegersListMetadata, MetadataName); } - // Creates a rule, which adds a list of sorted dash-separated integers - // converted into strings listed in a metadata to a resulting identifier. 
- void registerListOfIntegersInMetadataSortedRule(StringRef MetadataName) { - Rules.emplace_back(Rule::RKind::K_SortedIntegersListMetadata, MetadataName); - } - private: struct Rule { - struct FlagRuleData { - StringRef Name, IfPresentStr, IfAbsentStr; - }; - private: - std::variant> + std::variant> Storage; public: @@ -532,15 +466,7 @@ class FunctionsCategorizer { // Copy value of the specified attribute, if present K_SimpleStringAttribute, // Copy value of the specified metadata, if present - K_SimpleStringMetadata, - // Use one or another string based on the specified metadata presence - K_FlagMetadata, - // Use one or another string based on the specified attribute presence - K_FlagAttribute, - // Concatenate and use list of integers from the specified metadata K_IntegersListMetadata, - // Sort, concatenate and use list of integers from the specified metadata - K_SortedIntegersListMetadata }; RKind Kind; @@ -550,13 +476,8 @@ class FunctionsCategorizer { switch (K) { case RKind::K_SimpleStringAttribute: case RKind::K_IntegersListMetadata: - case RKind::K_SimpleStringMetadata: - case RKind::K_SortedIntegersListMetadata: return 0; case RKind::K_Callback: - return 2; - case RKind::K_FlagMetadata: - case RKind::K_FlagAttribute: return 1; } // can't use llvm_unreachable in constexpr context @@ -583,7 +504,6 @@ std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { for (const auto &R : Rules) { StringRef AttrName; StringRef MetadataName; - Rule::FlagRuleData Data; switch (R.Kind) { case Rule::RKind::K_Callback: @@ -598,25 +518,6 @@ std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { } break; - case Rule::RKind::K_SimpleStringMetadata: - MetadataName = R.getStorage(); - if (F->hasMetadata(MetadataName)) { - auto *MDN = F->getMetadata(MetadataName); - for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { - MDString *S = cast(MDN->getOperand(I).get()); - Result += "-" + S->getString().str(); - } - } - break; - - 
case Rule::RKind::K_FlagMetadata: - Data = R.getStorage(); - if (F->hasMetadata(Data.Name)) - Result += Data.IfPresentStr; - else - Result += Data.IfAbsentStr; - break; - case Rule::RKind::K_IntegersListMetadata: MetadataName = R.getStorage(); if (F->hasMetadata(MetadataName)) { @@ -627,30 +528,6 @@ std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { mdconst::extract(MDOp)->getZExtValue()); } break; - - case Rule::RKind::K_SortedIntegersListMetadata: - MetadataName = R.getStorage(); - if (F->hasMetadata(MetadataName)) { - MDNode *MDN = F->getMetadata(MetadataName); - - SmallVector Values; - for (const MDOperand &MDOp : MDN->operands()) - Values.push_back(mdconst::extract(MDOp)->getZExtValue()); - - llvm::sort(Values); - - for (std::uint64_t V : Values) - Result += "-" + std::to_string(V); - } - break; - - case Rule::RKind::K_FlagAttribute: - Data = R.getStorage(); - if (F->hasFnAttribute(Data.Name)) - Result += Data.IfPresentStr; - else - Result += Data.IfAbsentStr; - break; } Result += "-"; @@ -661,9 +538,8 @@ std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { } // namespace -static EntryPointGroupVec -selectEntryPointGroups(const ModuleDesc &MD, IRSplitMode Mode, - bool EmitOnlyKernelsAsEntryPoints) { +static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD, + IRSplitMode Mode) { FunctionsCategorizer Categorizer; EntryPointsGroupScope Scope = selectDeviceCodeGroupScope(Mode); @@ -687,17 +563,6 @@ selectEntryPointGroups(const ModuleDesc &MD, IRSplitMode Mode, // This is core of per-source device code split Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_MODULE_ID); - // This attribute marks virtual functions and effectively dictates how they - // should be groupped together. By design we won't split those groups of - // virtual functions further even if functions from the same group use - // different optional features and therefore this rule is put here. 
- // Strictly speaking, we don't even care about module-id splitting for - // those, but to avoid that we need to refactor the whole categorizer. - // However, this is good enough as it is for an initial version. - // TODO: for AOT use case we shouldn't be outlining those and instead should - // only select those functions which are compatible with the target device - Categorizer.registerSimpleStringAttributeRule("indirectly-callable"); - // Optional features // Note: Add more rules at the end of the list to avoid chaning orders of // output files in existing tests. @@ -717,7 +582,7 @@ selectEntryPointGroups(const ModuleDesc &MD, IRSplitMode Mode, // Only process module entry points: for (const auto &F : MD.getModule().functions()) { - if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints)) + if (!isEntryPoint(F)) continue; std::string Key = Categorizer.computeCategoryFor(&F); @@ -817,9 +682,7 @@ parseSYCLSplitModulesFromFile(StringRef File) { Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); - EntryPointGroupVec Groups = - selectEntryPointGroups(MD, Settings.Mode, - /*EmitOnlyKernelsAsEntryPoints=*/false); + EntryPointGroupVec Groups = selectEntryPointGroups(MD, Settings.Mode); SmallVector OutputImages; if (Groups.size() < 2) { diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll deleted file mode 100644 index 69ee88d572960..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-auto-split.ll +++ /dev/null @@ -1,45 +0,0 @@ -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; -; This test checks that functions marked with "indirectly-callable" LLVM IR -; attribute are outlined into separate device image(s) in accordance with the -; attribute value. 
-; -; Current device code split implementation may split those groups further if -; they use different optional kernel features for example, but we don't care -; about that subsequent split and don't test it. -; -; RUN: FileCheck %s --input-file=%t_0.ll --check-prefix CHECK-IR0 \ -; RUN: --implicit-check-not kernel --implicit-check-not foo -; RUN: FileCheck %s --input-file=%t_1.ll --check-prefix CHECK-IR1 \ -; RUN: --implicit-check-not kernel --implicit-check-not bar \ -; RUN: --implicit-check-not baz -; RUN: FileCheck %s --input-file=%t_2.ll --check-prefix CHECK-IR2 \ -; RUN: --implicit-check-not foo --implicit-check-not bar \ -; RUN: --implicit-check-not baz - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" -target triple = "spir64-unknown-unknown" - -define spir_func void @foo() #0 { -entry: - ret void -} - -define spir_func void @bar() #1 { -entry: - ret void -} - -define spir_func void @baz() #1 { -entry: - ret void -} - -define weak_odr dso_local spir_kernel void @kernel() #2 { -entry: - ret void -} - -attributes #0 = { "indirectly-callable"="set-1" "sycl-module-id"="v.cpp" } -attributes #1 = { "indirectly-callable"="set-2" "sycl-module-id"="v.cpp" } -attributes #2 = { "sycl-module-id"="v.cpp" } diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll deleted file mode 100644 index a35ebedf387fc..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/indirectly-callable-per-kernel-split.ll +++ /dev/null @@ -1,53 +0,0 @@ -; RUN: llvm-split -sycl-split=kernel -S < %s -o %t -; -; This test checks that functions marked with "indirectly-callable" LLVM IR -; attribute are outlined into separate device image(s) in accordance with the -; attribute value. 
-; -; This version of the test is focused on per-kernel device code split -; -; RUN: FileCheck %s --input-file=%t_0.ll --check-prefix CHECK-IR0 \ -; RUN: --implicit-check-not foo --implicit-check-not bar \ -; RUN: --implicit-check-not baz -; RUN: FileCheck %s --input-file=%t_1.ll --check-prefix CHECK-IR1 \ -; RUN: --implicit-check-not kernel --implicit-check-not bar \ -; RUN: --implicit-check-not baz -; RUN: FileCheck %s --input-file=%t_2.ll --check-prefix CHECK-IR2 \ -; RUN: --implicit-check-not kernel --implicit-check-not foo \ -; RUN: --implicit-check-not bar -; RUN: FileCheck %s --input-file=%t_3.ll --check-prefix CHECK-IR3 \ -; RUN: --implicit-check-not kernel --implicit-check-not foo \ -; RUN: --implicit-check-not baz -; -; CHECK-IR0: define weak_odr dso_local spir_kernel void @kernel -; CHECK-IR1: define spir_func void @foo -; CHECK-IR2: define spir_func void @baz -; CHECK-IR3: define spir_func void @bar - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" -target triple = "spir64-unknown-unknown" - -define spir_func void @foo() #0 { -entry: - ret void -} - -define spir_func void @bar() #1 { -entry: - ret void -} - -define spir_func void @baz() #1 { -entry: - ret void -} - -define weak_odr dso_local spir_kernel void @kernel() #2 { -entry: - ret void -} - -attributes #0 = { "indirectly-callable"="set-1" "sycl-module-id"="v.cpp" } -attributes #1 = { "indirectly-callable"="set-2" "sycl-module-id"="v.cpp" } -attributes #2 = { "sycl-module-id"="v.cpp" } - From a12d0f7ddc772c9d58258cc0eb39ec92bfafba56 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Tue, 19 Nov 2024 06:23:26 -0800 Subject: [PATCH 08/16] remove processSubModuleNamedMetadata function --- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 55 ------------------- 1 file changed, 55 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index 
44354d0df1ad4..b2397ed9d681a 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -328,52 +328,6 @@ static void dumpEntryPoints(const EntryPointSet &C, std::string_view Msg) { dbgs().indent(INDENT) << "}\n"; } -// Check "spirv.ExecutionMode" named metadata in the module and remove nodes -// that reference kernels that have dead prototypes or don't reference any -// kernel at all (nullptr). Dead prototypes are removed as well. -static void processSubModuleNamedMetadata(Module *M) { - auto ExecutionModeMD = M->getNamedMetadata("spirv.ExecutionMode"); - if (!ExecutionModeMD) - return; - - bool ContainsNodesToRemove = false; - SmallVector ValueVec; - for (auto Op : ExecutionModeMD->operands()) { - assert(Op->getNumOperands() > 0); - if (!Op->getOperand(0)) { - ContainsNodesToRemove = true; - continue; - } - - // If the first operand is not nullptr then it has to be a kernel - // function. - Value *Val = cast(Op->getOperand(0))->getValue(); - Function *F = cast(Val); - // If kernel function is just a prototype and unused then we can remove it - // and later remove corresponding spirv.ExecutionMode metadata node. - if (F->isDeclaration() && F->use_empty()) { - F->eraseFromParent(); - ContainsNodesToRemove = true; - continue; - } - - // Rememver nodes which we need to keep in the module. - ValueVec.push_back(Op); - } - if (!ContainsNodesToRemove) - return; - - if (ValueVec.empty()) { - // If all nodes need to be removed then just remove named metadata - // completely. - ExecutionModeMD->eraseFromParent(); - } else { - ExecutionModeMD->clearOperands(); - for (auto MD : ValueVec) - ExecutionModeMD->addOperand(MD); - } -} - void ModuleDesc::cleanup() { // Externalize them so they are not dropped by GlobalDCE for (Function &F : *M) @@ -388,15 +342,6 @@ void ModuleDesc::cleanup() { MPM.addPass(StripDeadDebugInfoPass()); // Remove dead debug info. MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls. 
MPM.run(*M, MAM); - - // Original module may have named metadata (spirv.ExecutionMode) referencing - // kernels in the module. Some of the Metadata nodes may reference kernels - // which are not included into the extracted submodule, in such case - // CloneModule either leaves that metadata nodes as is but they will reference - // dead prototype of the kernel or operand will be replace with nullptr. So - // process all nodes in the named metadata and remove nodes which are - // referencing kernels which are not included into submodule. - processSubModuleNamedMetadata(M.get()); } void ModuleDesc::dump() const { From d38dc63f0103be0843d303aa4ce8a209e3cb86f6 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Wed, 20 Nov 2024 08:47:49 -0800 Subject: [PATCH 09/16] Remove auto and source split modes. auto and source modes are going to be added in the upcoming patches. It simplifies the ongoing public code review. --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 20 -- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 212 ++---------------- .../device-code-split/auto-module-split-1.ll | 121 ---------- .../device-code-split/auto-module-split-2.ll | 127 ----------- .../device-code-split/auto-module-split-3.ll | 112 --------- .../auto-module-split-func-ptr.ll | 50 ----- .../device-code-split/basic-module-split.ll | 122 ---------- .../complex-indirect-call-chain.ll | 26 +-- .../one-kernel-per-module.ll | 4 +- .../per-reqd-sub-group-size-split-1.ll | 36 +-- .../per-reqd-sub-group-size-split-2.ll | 60 ----- .../per-reqd-wg-size-split-1.ll | 36 +-- .../per-reqd-wg-size-split-2.ll | 59 ----- .../split-with-kernel-declarations.ll | 23 +- llvm/tools/llvm-split/llvm-split.cpp | 15 +- 15 files changed, 37 insertions(+), 986 deletions(-) delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll delete mode 100644 
llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll delete mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 1f3713a6dcc32..5b096e886c562 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -28,9 +28,7 @@ class Function; class Module; enum class IRSplitMode { - IRSM_PER_TU, // one module per translation unit IRSM_PER_KERNEL, // one module per kernel - IRSM_AUTO, // automatically select split mode IRSM_NONE // no splitting }; @@ -41,33 +39,15 @@ std::optional convertStringToSplitMode(StringRef S); // A vector that contains all entry point functions in a split module. using EntryPointSet = SetVector; -/// Describes scope covered by each entry in the module-entry points map -/// populated by the groupEntryPointsByScope function. -enum EntryPointsGroupScope { - Scope_PerKernel, // one entry per kernel - Scope_PerModule, // one entry per module - Scope_Global // single entry in the map for all kernels -}; - /// Represents a named group of device code entry points - kernels and /// SYCL_EXTERNAL functions. 
struct EntryPointGroup { - // Properties an entry point (EP) group - struct Properties { - // Scope represented by EPs in a group - EntryPointsGroupScope Scope = Scope_Global; - }; - std::string GroupId; EntryPointSet Functions; - Properties Props; EntryPointGroup(StringRef GroupId = "") : GroupId(GroupId) {} EntryPointGroup(StringRef GroupId, EntryPointSet Functions) : GroupId(GroupId), Functions(std::move(Functions)) {} - EntryPointGroup(StringRef GroupId, EntryPointSet Functions, - const Properties &Props) - : GroupId(GroupId), Functions(std::move(Functions)), Props(Props) {} }; // TODO: move it into cpp file. diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index b2397ed9d681a..ef69148214b3a 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -47,30 +47,6 @@ using namespace llvm; #define DEBUG_TYPE "sycl_module_split" namespace { -// Identifying name for global scope -constexpr char GLOBAL_SCOPE_NAME[] = ""; - -EntryPointsGroupScope selectDeviceCodeGroupScope(IRSplitMode Mode) { - switch (Mode) { - case IRSplitMode::IRSM_PER_TU: - return Scope_PerModule; - - case IRSplitMode::IRSM_PER_KERNEL: - return Scope_PerKernel; - - case IRSplitMode::IRSM_AUTO: { - // At the moment, we assume that per-source split is the best way of - // splitting device code and can always be used except for cases handled - // above. 
- return Scope_PerModule; - } - - case IRSplitMode::IRSM_NONE: - return Scope_Global; - } - - llvm_unreachable("unsupported split mode"); -} bool isKernel(const Function &F) { return F.getCallingConv() == CallingConv::SPIR_KERNEL || @@ -307,8 +283,6 @@ namespace llvm { std::optional convertStringToSplitMode(StringRef S) { static const StringMap Values = { {"kernel", IRSplitMode::IRSM_PER_KERNEL}, - {"source", IRSplitMode::IRSM_PER_TU}, - {"auto", IRSplitMode::IRSM_AUTO}, {"none", IRSplitMode::IRSM_NONE}}; auto It = Values.find(S); @@ -359,191 +333,29 @@ std::string ModuleDesc::makeSymbolTable() const { return ST; } -namespace { -// This is a helper class, which allows to group/categorize function based on -// provided rules. It is intended to be used in device code split -// implementation. -// -// "Rule" is a simple routine, which returns a string for an llvm::Function -// passed to it. There could be more than one rule and they are applied in order -// of their registration. Results obtained from those rules are concatenated -// together to produce the final result. -// -// There are some predefined rules for the most popular use-cases, like grouping -// functions together based on an attribute value or presence of a metadata. -// However, there is also a possibility to register a custom callback function -// as a rule, to implement custom/more complex logic. -class FunctionsCategorizer { -public: - FunctionsCategorizer() = default; - - std::string computeCategoryFor(const Function *) const; - - // Accepts a callback, which should return a string based on provided - // function, which will be used as an entry points group identifier. - void - registerRule(const std::function &Callback) { - Rules.emplace_back(Rule::RKind::K_Callback, Callback); - } - - // Creates a simple rule, which adds a value of a string attribute into a - // resulting identifier. 
- void registerSimpleStringAttributeRule(StringRef AttrName) { - Rules.emplace_back(Rule::RKind::K_SimpleStringAttribute, AttrName); - } - - // Creates a rule, which adds a list of dash-separated integers converted - // into strings listed in a metadata to a resulting identifier. - void registerListOfIntegersInMetadataRule(StringRef MetadataName) { - Rules.emplace_back(Rule::RKind::K_IntegersListMetadata, MetadataName); - } - -private: - struct Rule { - private: - std::variant> - Storage; - - public: - enum class RKind { - // Custom callback function - K_Callback, - // Copy value of the specified attribute, if present - K_SimpleStringAttribute, - // Copy value of the specified metadata, if present - K_IntegersListMetadata, - }; - RKind Kind; - - // Returns an index into std::variant<...> Storage defined above, which - // corresponds to the specified rule Kind. - constexpr static std::size_t storage_index(RKind K) { - switch (K) { - case RKind::K_SimpleStringAttribute: - case RKind::K_IntegersListMetadata: - return 0; - case RKind::K_Callback: - return 1; - } - // can't use llvm_unreachable in constexpr context - return std::variant_npos; - } - - template auto getStorage() const { - return std::get(Storage); - } - - template - Rule(RKind K, Args... 
args) : Storage(args...), Kind(K) { - assert(storage_index(K) == Storage.index()); - } - - Rule(Rule &&Other) = default; - }; - - SmallVector Rules; -}; - -std::string FunctionsCategorizer::computeCategoryFor(const Function *F) const { - SmallString<256> Result; - for (const auto &R : Rules) { - StringRef AttrName; - StringRef MetadataName; - - switch (R.Kind) { - case Rule::RKind::K_Callback: - Result += R.getStorage()(F); - break; - - case Rule::RKind::K_SimpleStringAttribute: - AttrName = R.getStorage(); - if (F->hasFnAttribute(AttrName)) { - Attribute Attr = F->getFnAttribute(AttrName); - Result += Attr.getValueAsString(); - } - break; - - case Rule::RKind::K_IntegersListMetadata: - MetadataName = R.getStorage(); - if (F->hasMetadata(MetadataName)) { - auto *MDN = F->getMetadata(MetadataName); - for (const MDOperand &MDOp : MDN->operands()) - Result += - "-" + std::to_string( - mdconst::extract(MDOp)->getZExtValue()); - } - break; - } - - Result += "-"; - } - - return static_cast(Result); -} - -} // namespace - -static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD, - IRSplitMode Mode) { - FunctionsCategorizer Categorizer; - - EntryPointsGroupScope Scope = selectDeviceCodeGroupScope(Mode); - - switch (Scope) { - case Scope_Global: - // We simply perform entry points filtering, but group all of them together. - Categorizer.registerRule( - [](const Function *) -> std::string { return GLOBAL_SCOPE_NAME; }); - break; - case Scope_PerKernel: - // Per-kernel split is quite simple: every kernel goes into a separate - // module and that's it, no other rules required. - Categorizer.registerRule( - [](const Function *F) -> std::string { return F->getName().str(); }); - break; - case Scope_PerModule: - // The most complex case, because we should account for many other features - // like aspects used in a kernel, large-grf mode, reqd-work-group-size, etc. 
- - // This is core of per-source device code split - Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_MODULE_ID); - - // Optional features - // Note: Add more rules at the end of the list to avoid chaning orders of - // output files in existing tests. - Categorizer.registerSimpleStringAttributeRule("sycl-register-alloc-mode"); - Categorizer.registerSimpleStringAttributeRule("sycl-grf-size"); - Categorizer.registerListOfIntegersInMetadataRule("reqd_work_group_size"); - Categorizer.registerListOfIntegersInMetadataRule("work_group_num_dim"); - Categorizer.registerListOfIntegersInMetadataRule( - "intel_reqd_sub_group_size"); - Categorizer.registerSimpleStringAttributeRule(ATTR_SYCL_OPTLEVEL); - break; - } - +static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD) { // std::map is used here to ensure stable ordering of entry point groups, // which is based on their contents, this greatly helps LIT tests - std::map EntryPointsMap; + std::map EntryPointsMap; // Only process module entry points: for (const auto &F : MD.getModule().functions()) { if (!isEntryPoint(F)) continue; - std::string Key = Categorizer.computeCategoryFor(&F); + StringRef Key = F.getName(); EntryPointsMap[std::move(Key)].insert(&F); } EntryPointGroupVec Groups; if (EntryPointsMap.empty()) { // No entry points met, record this. 
- Groups.emplace_back(GLOBAL_SCOPE_NAME, EntryPointSet{}); + Groups.emplace_back("-", EntryPointSet()); } else { Groups.reserve(EntryPointsMap.size()); // Start with properties of a source module - EntryPointGroup::Properties MDProps = MD.getEntryPointGroup().Props; for (auto &[Key, EntryPoints] : EntryPointsMap) - Groups.emplace_back(Key, std::move(EntryPoints), MDProps); + Groups.emplace_back(Key, std::move(EntryPoints)); } return Groups; @@ -627,9 +439,19 @@ parseSYCLSplitModulesFromFile(StringRef File) { Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); - EntryPointGroupVec Groups = selectEntryPointGroups(MD, Settings.Mode); - SmallVector OutputImages; + if (Settings.Mode == IRSplitMode::IRSM_NONE) { + std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); + auto ImageOrErr = + saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); + if (!ImageOrErr) + return ImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*ImageOrErr)); + return OutputImages; + } + + EntryPointGroupVec Groups = selectEntryPointGroups(MD); if (Groups.size() < 2) { // FIXME(maksimsab): this branch is not tested yet. 
std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll deleted file mode 100644 index 539adf551ea96..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll +++ /dev/null @@ -1,121 +0,0 @@ -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; By default auto mode is equal to source mode -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -$_Z3barIiET_S0_ = comdat any - -; CHECK-TU1-NOT: @{{.*}}GV{{.*}} -; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 -@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { -entry: - call spir_func void @_Z3foov() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() - -; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) - -define dso_local spir_func void @_Z3foov() { -entry: - %a = alloca i32, align 4 - %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) - %add = add nsw i32 2, %call - store i32 %add, ptr %a, align 4 - ret void -} - 
-; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) -; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) - -; Function Attrs: nounwind -define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { -entry: - %arg.addr = alloca i32, align 4 - store i32 %arg, ptr %arg.addr, align 4 - %0 = load i32, ptr %arg.addr, align 4 - ret i32 %0 -} - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { -entry: - call spir_func void @_Z4foo1v() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo1v() { -entry: - %a = alloca i32, align 4 - store i32 2, ptr %a, align 4 - ret void -} - -; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} -; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} - -; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { -entry: - call spir_func void @_Z4foo2v() - ret void -} - -; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() -; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo2v() { -entry: - %a = alloca i32, align 4 -; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 - %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr 
addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 - %add = add nsw i32 4, %0 - store i32 %add, ptr %a, align 4 - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } - -; Metadata is saved in both modules. -; CHECK: !opencl.spir.version = !{!0, !0} -; CHECK: !spirv.Source = !{!1, !1} - -!opencl.spir.version = !{!0, !0} -!spirv.Source = !{!1, !1} - -; CHECK: !0 = !{i32 1, i32 2} -; CHECK: !1 = !{i32 4, i32 100000} - -!0 = !{i32 1, i32 2} -!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll deleted file mode 100644 index 33dde7b965755..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-2.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; -; This is the same as auto-module-split-1 test with the only difference is that -; @_Z3foov is marked with "referenced-indirectly" attribute. 
-; The purpose of this test is to make sure that we can still perform device code -; split as usual, because that function is not a part of any indirect calls -; -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -$_Z3barIiET_S0_ = comdat any - -; CHECK-TU1-NOT: @{{.*}}GV{{.*}} -; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 -@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { -entry: - call spir_func void @_Z3foov() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() - -; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) - -define dso_local spir_func void @_Z3foov() #2 { -entry: - %a = alloca i32, align 4 - %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) - %add = add nsw i32 2, %call - store i32 %add, ptr %a, align 4 - ret void -} - -; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) -; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) - -; Function Attrs: nounwind -define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { -entry: - %arg.addr = alloca i32, align 4 - store i32 %arg, ptr %arg.addr, align 4 - %0 
= load i32, ptr %arg.addr, align 4 - ret i32 %0 -} - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { -entry: - call spir_func void @_Z4foo1v() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo1v() { -entry: - %a = alloca i32, align 4 - store i32 2, ptr %a, align 4 - ret void -} - -; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} -; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} - -; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { -entry: - call spir_func void @_Z4foo2v() - ret void -} - -; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() -; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo2v() { -entry: - %a = alloca i32, align 4 -; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 - %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 - %add = add nsw i32 4, %0 - store i32 %add, ptr %a, align 4 - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } -attributes #2 = { "referenced-indirectly" } - -; Metadata is saved in both modules. 
-; CHECK: !opencl.spir.version = !{!0, !0} -; CHECK: !spirv.Source = !{!1, !1} - -!opencl.spir.version = !{!0, !0} -!spirv.Source = !{!1, !1} - -; CHECK: !0 = !{i32 1, i32 2} -; CHECK: !1 = !{i32 4, i32 100000} - -!0 = !{i32 1, i32 2} -!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll deleted file mode 100644 index 3c40986a31e62..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-3.ll +++ /dev/null @@ -1,112 +0,0 @@ -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; -; In precense of indirect calls we start matching functions using their -; signatures, i.e. we have an indirect call to i32(i32) function within -; @_Z3foov, which means that all functions with i32(i32) signature should be -; placed in the same module as @_Z3foov. -; -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0-IR \ -; RUN: --implicit-check-not TU0_kernel --implicit-check-not _Z3foov \ -; RUN: --implicit-check-not _Z4foo3v -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1-IR \ -; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v \ -; RUN: --implicit-check-not _Z4foo1v -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM - -; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0 -; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1 -; -; CHECK-TU1-SYM: _ZTSZ4mainE10TU0_kernel -; -; CHECK-TU0-IR: @_ZL2GV = internal addrspace(1) constant -; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0 -; CHECK-TU0-IR: define {{.*}} spir_func i32 @_Z4foo1v -; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1 -; CHECK-TU0-IR: define {{.*}} spir_func void @_Z4foo2v -; -; CHECK-TU1-IR: define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel -; CHECK-TU1-IR: define {{.*}} 
spir_func void @_Z3foov -; CHECK-TU1-IR: define {{.*}} spir_func i32 @_Z4foo3v - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -$_Z3barIiET_S0_ = comdat any - -@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 - -define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel() #0 { -entry: - call spir_func void @_Z3foov() - ret void -} - -define dso_local spir_func void @_Z3foov() { -entry: - %a = alloca i32, align 4 - %ptr = bitcast i32* %a to i32 (i32)* - %call = call spir_func i32 %ptr(i32 1) - %add = add nsw i32 2, %call - store i32 %add, i32* %a, align 4 - ret void -} - -; Function Attrs: nounwind -define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { -entry: - %arg.addr = alloca i32, align 4 - store i32 %arg, i32* %arg.addr, align 4 - %0 = load i32, i32* %arg.addr, align 4 - ret i32 %0 -} - -define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0() #1 { -entry: - %a = alloca i32, align 4 - %arg = load i32, i32* %a, align 4 - %call = call spir_func i32 @_Z4foo1v(i32 %arg) - ret void -} - -; Function Attrs: nounwind -define dso_local spir_func i32 @_Z4foo1v(i32 %arg) { -entry: - %a = alloca i32, align 4 - store i32 %arg, i32* %a, align 4 - ret i32 %arg -} - -; Function Attrs: nounwind -define dso_local spir_func i32 @_Z4foo3v(i32 %arg) #2 { -entry: - %a = alloca i32, align 4 - store i32 %arg, i32* %a, align 4 - ret i32 %arg -} - -define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1() #1 { -entry: - call spir_func void @_Z4foo2v() - ret void -} - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo2v() { -entry: - %a = alloca i32, align 4 - %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 - %add = add nsw i32 4, %0 - store i32 %add, i32* %a, align 4 - ret 
void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } -attributes #2 = { "referenced-indirectly" } - -!opencl.spir.version = !{!0, !0} -!spirv.Source = !{!1, !1} - -!0 = !{i32 1, i32 2} -!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll deleted file mode 100644 index dd10a9bec6269..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1 -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 - -; This test checkes that we can properly perform device code split by tracking -; all uses of functions (not only direct calls) - -; CHECK-SYM0: kernel2 -; CHECK-SYM1: kernel1 -; -; CHECK-IR0: define dso_local spir_kernel void @kernel2 -; -; CHECK-IR1: @_Z2f1iTable = weak global ptr @_Z2f1i -; CHECK-IR1: define {{.*}} i32 @_Z2f1i -; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" -target triple = "spir64_x86_64-unknown-unknown" - -@_Z2f1iTable = weak global ptr @_Z2f1i, align 8 - -; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn -define dso_local spir_func i32 @_Z2f1i(i32 %a) #0 { -entry: - ret i32 %a -} - -; Function Attrs: convergent norecurse -define weak_odr dso_local spir_kernel void @kernel1() #1 { -entry: - %0 = call i32 @indirect_call(ptr addrspace(4) addrspacecast ( ptr getelementptr inbounds ( [1 x ptr] , ptr @_Z2f1iTable, i64 0, i64 0) to ptr addrspace(4)), i32 0) - 
ret void -} - -; Function Attrs: convergent norecurse -define dso_local spir_kernel void @kernel2() #2 { -entry: - ret void -} - -declare dso_local spir_func i32 @indirect_call(ptr addrspace(4), i32) local_unnamed_addr - -attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn } -attributes #1 = { convergent norecurse "sycl-module-id"="TU1.cpp" } -attributes #2 = { convergent norecurse "sycl-module-id"="TU2.cpp" } - -; CHECK: kernel1 -; CHECK: kernel2 diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll deleted file mode 100644 index a916fdfa82b76..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll +++ /dev/null @@ -1,122 +0,0 @@ -; RUN: llvm-split -sycl-split=source -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT - -; ModuleID = 'basic-module-split.ll' -source_filename = "basic-module-split.ll" -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -$_Z3barIiET_S0_ = comdat any - -;CHECK-TU1-NOT: @{{.*}}GV{{.*}} -;CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 -@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { -entry: - call 
spir_func void @_Z3foov() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() - -; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) - -define dso_local spir_func void @_Z3foov() { -entry: - %a = alloca i32, align 4 - %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) - %add = add nsw i32 2, %call - store i32 %add, ptr %a, align 4 - ret void -} - -; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) -; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) - -; Function Attrs: nounwind -define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { -entry: - %arg.addr = alloca i32, align 4 - store i32 %arg, ptr %arg.addr, align 4 - %0 = load i32, ptr %arg.addr, align 4 - ret i32 %0 -} - -; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} -; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() -; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} - -; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { -entry: - call spir_func void @_Z4foo1v() - ret void -} - -; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() -; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo1v() { -entry: - %a = alloca i32, align 4 - store i32 2, ptr %a, align 4 - ret void -} - -; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} -; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() -; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} - -; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() - -define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { -entry: - call spir_func void @_Z4foo2v() - ret void -} - -; CHECK-TU1-NOT: define 
{{.*}} spir_func void @{{.*}}foo2{{.*}}() -; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() - -; Function Attrs: nounwind -define dso_local spir_func void @_Z4foo2v() { -entry: - %a = alloca i32, align 4 -; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 - %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 - %add = add nsw i32 4, %0 - store i32 %add, ptr %a, align 4 - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } -attributes #1 = { "sycl-module-id"="TU2.cpp" } - -; Metadata is saved in both modules. -; CHECK: !opencl.spir.version = !{!0, !0} -; CHECK: !spirv.Source = !{!1, !1} - -!opencl.spir.version = !{!0, !0} -!spirv.Source = !{!1, !1} - -; CHECK; !0 = !{i32 1, i32 2} -; CHECK; !1 = !{i32 4, i32 100000} - -!0 = !{i32 1, i32 2} -!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll index 413769947aaaf..9e093dfda4f3a 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll @@ -1,28 +1,6 @@ -; The idea of the test is to ensure that sycl-post-link can trace through more -; complex call stacks involving several nested indirect calls +; Check that Module splitting can trace through more complex call stacks +; involving several nested indirect calls. 
-; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ -; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ -; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ -; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ -; RUN: --implicit-check-not @foo --implicit-check-not @bar \ -; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ -; RUN: --implicit-check-not @kernel_C -; -; RUN: llvm-split -sycl-split=source -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ -; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ -; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ -; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ -; RUN: --implicit-check-not @foo --implicit-check-not @bar \ -; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ -; RUN: --implicit-check-not @kernel_C -; ; RUN: llvm-split -sycl-split=kernel -S < %s -o %t ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll index f61623d377bcd..ddb0ea0b3c59a 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll @@ -1,3 +1,5 @@ +; Test checks "kernel" splitting mode. 
+ ; RUN: llvm-split -sycl-split=kernel -S < %s -o %t.files ; RUN: FileCheck %s -input-file=%t.files_0.ll --check-prefixes CHECK-MODULE0,CHECK ; RUN: FileCheck %s -input-file=%t.files_0.sym --check-prefixes CHECK-MODULE0-TXT @@ -9,7 +11,7 @@ ; ModuleID = 'one-kernel-per-module.ll' source_filename = "one-kernel-per-module.ll" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" +target triple = "spirv64-unknown-unknown" $_Z3barIiET_S0_ = comdat any diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll index 9436e4308ac99..921b7c22fc365 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -1,26 +1,12 @@ +; The test checks that Module splitting correctly separates kernels +; that use reqd_sub_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + ; This test emulates two translation units with 3 kernels: ; TU0_kernel0 - 1st translation unit, no reqd_sub_group_size attribute used ; TU0_kernel1 - 1st translation unit, reqd_sub_group_size attribute is used ; TU1_kernel2 - 2nd translation unit, no reqd_sub_group_size attribute used -; The test is intended to check that sycl-post-link correctly separates kernels -; that use reqd_sub_group_size attributes from kernels which doesn't use them -; regardless of device code split mode - -; RUN: llvm-split -sycl-split=auto -S %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s 
-input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 - ; RUN: llvm-split -sycl-split=kernel -S %s -o %t ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 @@ -35,20 +21,6 @@ ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 -; RUN: llvm-split -sycl-split=source -S %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 - ; Regardless of device code split mode, each kernel should go into a separate ; device image diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll 
b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll deleted file mode 100644 index 49976fec60c26..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-2.ll +++ /dev/null @@ -1,60 +0,0 @@ -; The test is intended to check that sycl-post-link correctly groups kernels -; by unique reqd_sub_group_size values used in them - -; RUN: llvm-split -sycl-split=auto -S %s -o %t -; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE -; -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 -; -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ -; RUN: --implicit-check-not kernel3 - -; -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ -; RUN: --implicit-check-not kernel2 - -; CHECK-TABLE: Code -; CHECK-TABLE-NEXT: _0.sym -; CHECK-TABLE-NEXT: _1.sym -; CHECK-TABLE-NEXT: _2.sym -; CHECK-TABLE-EMPTY: - -; CHECK-M0-SYMS: kernel1 -; CHECK-M0-SYMS: kernel2 - -; CHECK-M1-SYMS: kernel0 - -; CHECK-M2-SYMS: kernel3 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -define dso_local spir_kernel void @kernel0() #0 !intel_reqd_sub_group_size !1 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel1() #0 !intel_reqd_sub_group_size !2 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel2() #0 !intel_reqd_sub_group_size !3 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel3() #0 !intel_reqd_sub_group_size !4 { -entry: - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } - -!1 = !{i32 32} -!2 = !{i32 64} -!3 = !{i32 64} -!4 = !{i32 16} \ No newline at end of file diff --git 
a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll index 64acdc04e957c..2ca8b220edfbe 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll @@ -1,40 +1,12 @@ +; The test checks that Module splitting correctly separates kernels +; that use reqd_work_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + ; This test emulates two translation units with 3 kernels: ; TU0_kernel0 - 1st translation unit, no reqd_work_group_size attribute used ; TU0_kernel1 - 1st translation unit, reqd_work_group_size attribute is used ; TU1_kernel2 - 2nd translation unit, no reqd_work_group_size attribute used -; The test is intended to check that sycl-post-link correctly separates kernels -; that use reqd_work_group_size attributes from kernels which doesn't use them -; regardless of device code split mode - -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 - -; RUN: llvm-split -sycl-split=source 
-S < %s -o %t -; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 - ; RUN: llvm-split -sycl-split=kernel -S < %s -o %t ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll deleted file mode 100644 index 569bdeb8ff14c..0000000000000 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-2.ll +++ /dev/null @@ -1,59 +0,0 @@ -; The test is intended to check that sycl-post-link correctly groups kernels -; by unique reqd_work_group_size values used in them - -; RUN: llvm-split -sycl-split=auto -S < %s -o %t -; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE -; -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 -; -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel3 \ -; RUN: --implicit-check-not kernel2 -; -; RUN: FileCheck %s 
-input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ -; RUN: --implicit-check-not kernel0 - -; CHECK-TABLE: Code -; CHECK-TABLE-NEXT: _0.sym -; CHECK-TABLE-NEXT: _1.sym -; CHECK-TABLE-NEXT: _2.sym -; CHECK-TABLE-EMPTY: - -; CHECK-M0-SYMS: kernel1 -; CHECK-M0-SYMS: kernel2 - -; CHECK-M1-SYMS: kernel0 - -; CHECK-M2-SYMS: kernel3 - -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" - -define dso_local spir_kernel void @kernel0() #0 !reqd_work_group_size !1 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel1() #0 !reqd_work_group_size !2 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel2() #0 !reqd_work_group_size !3 { -entry: - ret void -} - -define dso_local spir_kernel void @kernel3() #0 !reqd_work_group_size !4 { -entry: - ret void -} - -attributes #0 = { "sycl-module-id"="TU1.cpp" } - -!1 = !{i32 32} -!2 = !{i32 64, i32 64} -!3 = !{i32 64, i32 64} -!4 = !{i32 16, i32 16, i32 16} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll index 2632641a69a5c..4ba15ecdefea6 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll @@ -1,32 +1,11 @@ -; Purpose of this test is to check that sycl-post-link does not treat -; declarations as entry points. +; The test checks that Module splitting does not treat declarations as entry points. 
-; RUN: llvm-split -sycl-split=source -S < %s -o %t -; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-PER-SOURCE-TABLE -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-PER-SOURCE-SYM0 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-PER-SOURCE-SYM1 -; ; RUN: llvm-split -sycl-split=kernel -S < %s -o %t2 ; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE ; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 ; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 ; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 -; With per-source split, there should be two device images -; CHECK-PER-SOURCE-TABLE: [Code|Symbols] -; CHECK-PER-SOURCE-TABLE: {{.*}}_0.ll|{{.*}}_0.sym -; CHECK-PER-SOURCE-TABLE-NEXT: {{.*}}_1.ll|{{.*}}_1.sym -; CHECK-PER-SOURCE-TABLE-EMPTY: -; -; CHECK-PER-SOURCE-SYM1-NOT: _ZTS4mainE10TU1_kernel1 -; CHECK-PER-SOURCE-SYM1: _ZTSZ4mainE11TU0_kernel0 -; CHECK-PER-SOURCE-SYM1-NEXT: _ZTSZ4mainE11TU0_kernel1 -; CHECK-PER-SOURCE-SYM1-EMPTY: -; -; CHECK-PER-SOURCE-SYM0-NOT: _ZTS4mainE10TU1_kernel1 -; CHECK-PER-SOURCE-SYM0: _ZTSZ4mainE10TU1_kernel0 -; CHECK-PER-SOURCE-SYM0-EMPTY: - ; With per-kernel split, there should be three device images ; CHECK-PER-KERNEL-TABLE: [Code|Symbols] ; CHECK-PER-KERNEL-TABLE: {{.*}}_0.ll|{{.*}}_0.sym diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp index 66fad2e0db3d8..215e915de4893 100644 --- a/llvm/tools/llvm-split/llvm-split.cpp +++ b/llvm/tools/llvm-split/llvm-split.cpp @@ -76,15 +76,12 @@ static cl::opt MCPU("mcpu", cl::desc("Target CPU, ignored if -mtriple is not used"), cl::value_desc("cpu"), cl::cat(SplitCategory)); -cl::opt SYCLSplitMode( - "sycl-split", cl::desc("module split mode"), cl::Optional, - cl::init(IRSplitMode::IRSM_NONE), - cl::values( - clEnumValN(IRSplitMode::IRSM_PER_TU, "source", - "1 output module per source (translation 
unit)"), - clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", "1 output module per kernel"), - clEnumValN(IRSplitMode::IRSM_AUTO, "auto", "Choose split mode automatically")), - cl::cat(SplitCategory)); +cl::opt + SYCLSplitMode("sycl-split", cl::desc("module split mode"), cl::Optional, + cl::init(IRSplitMode::IRSM_NONE), + cl::values(clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", + "1 output module per kernel")), + cl::cat(SplitCategory)); cl::opt OutputAssembly{"S", cl::desc("Write output as LLVM assembly"), cl::cat(SplitCategory)}; From bf3a7d48490d33ff6603115a7711363e218a5686 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Thu, 21 Nov 2024 06:14:55 -0800 Subject: [PATCH 10/16] move ModuleDesc to source file --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 47 ------ llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 158 +++++++++++------- 2 files changed, 99 insertions(+), 106 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 5b096e886c562..11630f3b27b68 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -36,53 +36,6 @@ enum class IRSplitMode { /// returned. std::optional convertStringToSplitMode(StringRef S); -// A vector that contains all entry point functions in a split module. -using EntryPointSet = SetVector; - -/// Represents a named group of device code entry points - kernels and -/// SYCL_EXTERNAL functions. -struct EntryPointGroup { - std::string GroupId; - EntryPointSet Functions; - - EntryPointGroup(StringRef GroupId = "") : GroupId(GroupId) {} - EntryPointGroup(StringRef GroupId, EntryPointSet Functions) - : GroupId(GroupId), Functions(std::move(Functions)) {} -}; - -// TODO: move it into cpp file. 
-/// Annotates an llvm::Module with information necessary to perform and track -/// result of device code (llvm::Module instances) splitting: -/// - entry points of the module determined e.g. by a module splitter, as well -/// as information about entry point origin (e.g. result of a scoped split) -/// - its properties, such as whether it has specialization constants uses -/// It also provides convenience functions for entry point set transformation -/// between llvm::Function object and string representations. -class ModuleDesc { - std::unique_ptr M; - EntryPointGroup EntryPoints; - -public: - ModuleDesc(std::unique_ptr M) : M(std::move(M)) {} - - ModuleDesc(std::unique_ptr M, EntryPointGroup EntryPoints) - : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {} - - const EntryPointSet &entries() const { return EntryPoints.Functions; } - const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; } - EntryPointSet &entries() { return EntryPoints.Functions; } - Module &getModule() { return *M; } - const Module &getModule() const { return *M; } - std::unique_ptr releaseModulePtr() { return std::move(M); } - - // Cleans up module IR - removes dead globals, debug info etc. - void cleanup(); - - std::string makeSymbolTable() const; - - void dump() const; -}; - /// The structure represents a split LLVM Module accompanied by additional /// information. Split Modules are being stored at disk due to the high RAM /// consumption during the whole splitting process. 
diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index ef69148214b3a..b18278bb11dbd 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -46,14 +46,12 @@ using namespace llvm; #define DEBUG_TYPE "sycl_module_split" -namespace { - -bool isKernel(const Function &F) { +static bool isKernel(const Function &F) { return F.getCallingConv() == CallingConv::SPIR_KERNEL || F.getCallingConv() == CallingConv::AMDGPU_KERNEL; } -bool isEntryPoint(const Function &F) { +static bool isEntryPoint(const Function &F) { // Skip declarations, if any: they should not be included into a vector of // entry points groups or otherwise we will end up with incorrectly generated // list of symbols. @@ -64,6 +62,89 @@ bool isEntryPoint(const Function &F) { return isKernel(F); } +namespace { + +// A vector that contains all entry point functions in a split module. +using EntryPointSet = SetVector; + +/// Represents a named group of device code entry points - kernels and +/// SYCL_EXTERNAL functions. +struct EntryPointGroup { + std::string GroupId; + EntryPointSet Functions; + + EntryPointGroup(StringRef GroupId = "") : GroupId(GroupId) {} + EntryPointGroup(StringRef GroupId, EntryPointSet Functions) + : GroupId(GroupId), Functions(std::move(Functions)) {} + + void dump() const { + constexpr size_t INDENT = 4; + dbgs().indent(INDENT) << "ENTRY POINTS" + << " " << GroupId << " {\n"; + for (const Function *F : Functions) + dbgs().indent(INDENT) << " " << F->getName() << "\n"; + + dbgs().indent(INDENT) << "}\n"; + } +}; + +/// Annotates an llvm::Module with information necessary to perform and track +/// result of device code (llvm::Module instances) splitting: +/// - entry points of the module determined e.g. by a module splitter, as well +/// as information about entry point origin (e.g. 
result of a scoped split) +/// - its properties, such as whether it has specialization constants uses +/// It also provides convenience functions for entry point set transformation +/// between llvm::Function object and string representations. +class ModuleDesc { + std::unique_ptr M; + EntryPointGroup EntryPoints; + +public: + ModuleDesc(std::unique_ptr M) : M(std::move(M)) {} + + ModuleDesc(std::unique_ptr M, EntryPointGroup EntryPoints) + : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {} + + const EntryPointSet &entries() const { return EntryPoints.Functions; } + const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; } + EntryPointSet &entries() { return EntryPoints.Functions; } + Module &getModule() { return *M; } + const Module &getModule() const { return *M; } + std::unique_ptr releaseModulePtr() { return std::move(M); } + + // Cleans up module IR - removes dead globals, debug info etc. + void cleanup() { + // Externalize them so they are not dropped by GlobalDCE + for (Function &F : *M) + if (F.hasFnAttribute("indirectly-callable")) + F.setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); + + ModuleAnalysisManager MAM; + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + ModulePassManager MPM; + // Do cleanup. + MPM.addPass(GlobalDCEPass()); // Delete unreachable globals. + MPM.addPass(StripDeadDebugInfoPass()); // Remove dead debug info. + MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls. + MPM.run(*M, MAM); + } + + std::string makeSymbolTable() const { + std::string ST; + for (const Function *F : EntryPoints.Functions) + ST += (Twine(F->getName()) + "\n").str(); + + return ST; + } + + void dump() const { + assert(M && "dump of empty ModuleDesc"); + dbgs() << "split_module::ModuleDesc[" << M->getName() << "] {\n"; + EntryPoints.dump(); + dbgs() << "}\n"; + } +}; + // Represents "dependency" or "use" graph of global objects (functions and // global variables) in a module. 
It is used during device code split to // understand which global variables and functions (other than entry points) @@ -278,61 +359,6 @@ class ModuleSplitter { } // namespace -namespace llvm { - -std::optional convertStringToSplitMode(StringRef S) { - static const StringMap Values = { - {"kernel", IRSplitMode::IRSM_PER_KERNEL}, - {"none", IRSplitMode::IRSM_NONE}}; - - auto It = Values.find(S); - if (It == Values.end()) - return std::nullopt; - - return It->second; -} - -static void dumpEntryPoints(const EntryPointSet &C, std::string_view Msg) { - constexpr size_t INDENT = 4; - dbgs().indent(INDENT) << "ENTRY POINTS" - << " " << Msg << " {\n"; - for (const Function *F : C) - dbgs().indent(INDENT) << " " << F->getName() << "\n"; - - dbgs().indent(INDENT) << "}\n"; -} - -void ModuleDesc::cleanup() { - // Externalize them so they are not dropped by GlobalDCE - for (Function &F : *M) - if (F.hasFnAttribute("indirectly-callable")) - F.setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); - - ModuleAnalysisManager MAM; - MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - ModulePassManager MPM; - // Do cleanup. - MPM.addPass(GlobalDCEPass()); // Delete unreachable globals. - MPM.addPass(StripDeadDebugInfoPass()); // Remove dead debug info. - MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls. 
- MPM.run(*M, MAM); -} - -void ModuleDesc::dump() const { - assert(M && "dump of empty ModuleDesc"); - dbgs() << "split_module::ModuleDesc[" << M->getName() << "] {\n"; - dumpEntryPoints(entries(), EntryPoints.GroupId.c_str()); - dbgs() << "}\n"; -} - -std::string ModuleDesc::makeSymbolTable() const { - std::string ST; - for (const Function *F : EntryPoints.Functions) - ST += (Twine(F->getName()) + "\n").str(); - - return ST; -} - static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD) { // std::map is used here to ensure stable ordering of entry point groups, // which is based on their contents, this greatly helps LIT tests @@ -436,6 +462,20 @@ parseSYCLSplitModulesFromFile(StringRef File) { return Modules; } +namespace llvm { + +std::optional convertStringToSplitMode(StringRef S) { + static const StringMap Values = { + {"kernel", IRSplitMode::IRSM_PER_KERNEL}, + {"none", IRSplitMode::IRSM_NONE}}; + + auto It = Values.find(S); + if (It == Values.end()) + return std::nullopt; + + return It->second; +} + Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { ModuleDesc MD = std::move(M); From 1db04f105e75221e6cdcffcbe2fa175d689b5dd0 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Fri, 22 Nov 2024 07:48:04 -0800 Subject: [PATCH 11/16] return simple split mode by source --- .../llvm/Transforms/Utils/SYCLModuleSplit.h | 6 +- .../include/llvm/Transforms/Utils/SYCLUtils.h | 7 - llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 40 ++++-- .../device-code-split/auto-module-split-1.ll | 120 +++++++++++++++++ .../auto-module-split-func-ptr.ll | 50 +++++++ .../device-code-split/basic-module-split.ll | 122 ++++++++++++++++++ llvm/tools/llvm-split/llvm-split.cpp | 14 +- 7 files changed, 332 insertions(+), 27 deletions(-) create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll create mode 100644 llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll create mode 100644 
llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 11630f3b27b68..1b46cda5d11ce 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// Functionality to split a module into call graphs. A callgraph here is a set +// Functionality to split a module into callgraphs. A callgraph here is a set // of entry points with all functions reachable from them via a call. The result // of the split is new modules containing corresponding callgraph. //===----------------------------------------------------------------------===// @@ -13,9 +13,7 @@ #ifndef LLVM_SYCL_MODULE_SPLIT_H #define LLVM_SYCL_MODULE_SPLIT_H -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Function.h" #include "llvm/Support/Error.h" #include @@ -24,10 +22,10 @@ namespace llvm { -class Function; class Module; enum class IRSplitMode { + IRSM_PER_TU, // one module per translation unit IRSM_PER_KERNEL, // one module per kernel IRSM_NONE // no splitting }; diff --git a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h index 5acf845238b9a..02c069624794d 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h @@ -23,9 +23,6 @@ namespace llvm { -constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id"; -constexpr char ATTR_SYCL_OPTLEVEL[] = "sycl-optlevel"; - using CallGraphNodeAction = ::std::function; using CallGraphFunctionFilter = std::function; @@ -73,10 +70,6 @@ void traverseCallgraphUp( ErrorOnNonCallUse, functionFilter); } -inline bool isSYCLExternalFunction(const Function *F) { - return 
F->hasFnAttribute(ATTR_SYCL_MODULE_ID); -} - /// Removes the global variable "llvm.used" and returns true on success. /// "llvm.used" is a global constant array containing references to kernels /// available in the module and callable from host code. The elements of diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index b18278bb11dbd..ba1ce80943f36 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -89,9 +89,9 @@ struct EntryPointGroup { }; /// Annotates an llvm::Module with information necessary to perform and track -/// result of device code (llvm::Module instances) splitting: +/// the result of device code (llvm::Module instances) splitting: /// - entry points of the module determined e.g. by a module splitter, as well -/// as information about entry point origin (e.g. result of a scoped split) +/// as information about entry point's origin (e.g. result of a scoped split) /// - its properties, such as whether it has specialization constants uses /// It also provides convenience functions for entry point set transformation /// between llvm::Function object and string representations. @@ -359,18 +359,37 @@ class ModuleSplitter { } // namespace -static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD) { +/// Gets attached attribute value if it is present. Otherwise returns empty +/// string. 
+static StringRef computeFunctionCategoryFromStringMetadata(const Function &F, + StringRef AttrName) { + return F.getFnAttribute(AttrName).getValueAsString(); +} + +static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD, + IRSplitMode Mode) { // std::map is used here to ensure stable ordering of entry point groups, // which is based on their contents, this greatly helps LIT tests - std::map EntryPointsMap; + std::map EntryPointsMap; - // Only process module entry points: + static constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id"; for (const auto &F : MD.getModule().functions()) { if (!isEntryPoint(F)) continue; - StringRef Key = F.getName(); - EntryPointsMap[std::move(Key)].insert(&F); + std::string Key; + switch (Mode) { + case IRSplitMode::IRSM_PER_KERNEL: + Key = F.getName(); + break; + case IRSplitMode::IRSM_PER_TU: + Key = computeFunctionCategoryFromStringMetadata(F, ATTR_SYCL_MODULE_ID); + break; + case IRSplitMode::IRSM_NONE: + llvm_unreachable(""); + } + + EntryPointsMap[Key].insert(&F); } EntryPointGroupVec Groups; @@ -419,6 +438,8 @@ saveModuleDesc(ModuleDesc &MD, std::string Prefix, bool OutputAssembly) { return SM; } +namespace llvm { + Expected> parseSYCLSplitModulesFromFile(StringRef File) { auto EntriesMBOrErr = llvm::MemoryBuffer::getFile(File); @@ -462,10 +483,9 @@ parseSYCLSplitModulesFromFile(StringRef File) { return Modules; } -namespace llvm { - std::optional convertStringToSplitMode(StringRef S) { static const StringMap Values = { + {"source", IRSplitMode::IRSM_PER_TU}, {"kernel", IRSplitMode::IRSM_PER_KERNEL}, {"none", IRSplitMode::IRSM_NONE}}; @@ -491,7 +511,7 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { return OutputImages; } - EntryPointGroupVec Groups = selectEntryPointGroups(MD); + EntryPointGroupVec Groups = selectEntryPointGroups(MD, Settings.Mode); if (Groups.size() < 2) { // FIXME(maksimsab): this branch is not tested yet. 
std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll new file mode 100644 index 0000000000000..3734153b9fbaa --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll @@ -0,0 +1,120 @@ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +; CHECK-TU1-NOT: @{{.*}}GV{{.*}} +; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 
@{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr 
addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll new file mode 100644 index 0000000000000..2e3d2e5e55c9b --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll @@ -0,0 +1,50 @@ +; This test checks that we can properly perform device code split by tracking +; all uses of functions (not only direct calls) + +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 + +; CHECK-SYM0: kernel2 +; CHECK-SYM1: kernel1 +; +; CHECK-IR0: define dso_local spir_kernel void @kernel2 +; +; CHECK-IR1: @_Z2f1iTable = weak global ptr @_Z2f1i +; CHECK-IR1: define {{.*}} i32 @_Z2f1i +; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spirv64-unknown-unknown" + +@_Z2f1iTable = weak global ptr @_Z2f1i, align 8 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define dso_local spir_func i32 @_Z2f1i(i32 %a) #0 { +entry: + ret i32 %a +} + +; Function Attrs: convergent 
norecurse +define weak_odr dso_local spir_kernel void @kernel1() #1 { +entry: + %0 = call i32 @indirect_call(ptr addrspace(4) addrspacecast ( ptr getelementptr inbounds ( [1 x ptr] , ptr @_Z2f1iTable, i64 0, i64 0) to ptr addrspace(4)), i32 0) + ret void +} + +; Function Attrs: convergent norecurse +define dso_local spir_kernel void @kernel2() #2 { +entry: + ret void +} + +declare dso_local spir_func i32 @indirect_call(ptr addrspace(4), i32) local_unnamed_addr + +attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn } +attributes #1 = { convergent norecurse "sycl-module-id"="TU1.cpp" } +attributes #2 = { convergent norecurse "sycl-module-id"="TU2.cpp" } + +; CHECK: kernel1 +; CHECK: kernel2 diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll new file mode 100644 index 0000000000000..a916fdfa82b76 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll @@ -0,0 +1,122 @@ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +; ModuleID = 'basic-module-split.ll' +source_filename = "basic-module-split.ll" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +;CHECK-TU1-NOT: @{{.*}}GV{{.*}} +;CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define 
dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; 
CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK; !0 = !{i32 1, i32 2} +; CHECK; !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp index 215e915de4893..e40bf71026472 100644 --- a/llvm/tools/llvm-split/llvm-split.cpp +++ b/llvm/tools/llvm-split/llvm-split.cpp @@ -76,12 +76,14 @@ static cl::opt MCPU("mcpu", cl::desc("Target CPU, ignored if -mtriple is not used"), cl::value_desc("cpu"), cl::cat(SplitCategory)); -cl::opt - SYCLSplitMode("sycl-split", cl::desc("module split mode"), cl::Optional, - cl::init(IRSplitMode::IRSM_NONE), - cl::values(clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", - "1 output module per kernel")), - cl::cat(SplitCategory)); +cl::opt SYCLSplitMode( + "sycl-split", cl::desc("module split mode"), cl::Optional, + cl::init(IRSplitMode::IRSM_NONE), + 
cl::values(clEnumValN(IRSplitMode::IRSM_PER_TU, "source", + "1 ouptput module per translation unit"), + clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", + "1 output module per kernel")), + cl::cat(SplitCategory)); cl::opt OutputAssembly{"S", cl::desc("Write output as LLVM assembly"), cl::cat(SplitCategory)}; From 051c0a98da53a567a655fe89a1d6fc9ddc6ec0f8 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Mon, 25 Nov 2024 10:13:51 -0800 Subject: [PATCH 12/16] do refactoring --- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 72 ++++++++++--------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index ba1ce80943f36..c88991e1be954 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -67,20 +67,25 @@ namespace { // A vector that contains all entry point functions in a split module. using EntryPointSet = SetVector; -/// Represents a named group of device code entry points - kernels and -/// SYCL_EXTERNAL functions. +/// Represents a named group of entry points. 
struct EntryPointGroup { - std::string GroupId; + std::string GroupName; EntryPointSet Functions; - EntryPointGroup(StringRef GroupId = "") : GroupId(GroupId) {} - EntryPointGroup(StringRef GroupId, EntryPointSet Functions) - : GroupId(GroupId), Functions(std::move(Functions)) {} + EntryPointGroup() = default; + EntryPointGroup(const EntryPointGroup &) = default; + EntryPointGroup &operator=(const EntryPointGroup &) = default; + EntryPointGroup(EntryPointGroup &&) = default; + EntryPointGroup &operator=(EntryPointGroup &&) = default; + + EntryPointGroup(StringRef GroupName, + EntryPointSet Functions = EntryPointSet()) + : GroupName(GroupName), Functions(std::move(Functions)) {} void dump() const { constexpr size_t INDENT = 4; dbgs().indent(INDENT) << "ENTRY POINTS" - << " " << GroupId << " {\n"; + << " " << GroupName << " {\n"; for (const Function *F : Functions) dbgs().indent(INDENT) << " " << F->getName() << "\n"; @@ -90,9 +95,7 @@ struct EntryPointGroup { /// Annotates an llvm::Module with information necessary to perform and track /// the result of device code (llvm::Module instances) splitting: -/// - entry points of the module determined e.g. by a module splitter, as well -/// as information about entry point's origin (e.g. result of a scoped split) -/// - its properties, such as whether it has specialization constants uses +/// - entry points group from the module. /// It also provides convenience functions for entry point set transformation /// between llvm::Function object and string representations. 
class ModuleDesc { @@ -100,29 +103,29 @@ class ModuleDesc { EntryPointGroup EntryPoints; public: - ModuleDesc(std::unique_ptr M) : M(std::move(M)) {} - - ModuleDesc(std::unique_ptr M, EntryPointGroup EntryPoints) - : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {} + ModuleDesc() = delete; + ModuleDesc(const ModuleDesc &) = delete; + ModuleDesc &operator=(const ModuleDesc &) = delete; + ModuleDesc(ModuleDesc &&) = default; + ModuleDesc &operator=(ModuleDesc &&) = default; + + ModuleDesc(std::unique_ptr M, + EntryPointGroup EntryPoints = EntryPointGroup()) + : M(std::move(M)), EntryPoints(std::move(EntryPoints)) { + assert(this->M && "Module should be non-empty"); + } const EntryPointSet &entries() const { return EntryPoints.Functions; } const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; } EntryPointSet &entries() { return EntryPoints.Functions; } Module &getModule() { return *M; } const Module &getModule() const { return *M; } - std::unique_ptr releaseModulePtr() { return std::move(M); } // Cleans up module IR - removes dead globals, debug info etc. void cleanup() { - // Externalize them so they are not dropped by GlobalDCE - for (Function &F : *M) - if (F.hasFnAttribute("indirectly-callable")) - F.setLinkage(GlobalValue::LinkageTypes::ExternalLinkage); - ModuleAnalysisManager MAM; MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); ModulePassManager MPM; - // Do cleanup. MPM.addPass(GlobalDCEPass()); // Delete unreachable globals. MPM.addPass(StripDeadDebugInfoPass()); // Remove dead debug info. MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls. 
@@ -130,16 +133,17 @@ class ModuleDesc { } std::string makeSymbolTable() const { - std::string ST; - for (const Function *F : EntryPoints.Functions) - ST += (Twine(F->getName()) + "\n").str(); + SmallString<128> ST; + for (const Function *F : EntryPoints.Functions) { + ST += F->getName(); + ST += "\n"; + } - return ST; + return std::string(ST); } void dump() const { - assert(M && "dump of empty ModuleDesc"); - dbgs() << "split_module::ModuleDesc[" << M->getName() << "] {\n"; + dbgs() << "ModuleDesc[" << M->getName() << "] {\n"; EntryPoints.dump(); dbgs() << "}\n"; } @@ -366,14 +370,14 @@ static StringRef computeFunctionCategoryFromStringMetadata(const Function &F, return F.getFnAttribute(AttrName).getValueAsString(); } -static EntryPointGroupVec selectEntryPointGroups(const ModuleDesc &MD, +static EntryPointGroupVec selectEntryPointGroups(const Module &M, IRSplitMode Mode) { // std::map is used here to ensure stable ordering of entry point groups, // which is based on their contents, this greatly helps LIT tests std::map EntryPointsMap; static constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id"; - for (const auto &F : MD.getModule().functions()) { + for (const auto &F : M.functions()) { if (!isEntryPoint(F)) continue; @@ -427,12 +431,11 @@ static Error saveModuleIRInFile(Module &M, StringRef FilePath, static Expected saveModuleDesc(ModuleDesc &MD, std::string Prefix, bool OutputAssembly) { - SYCLSplitModule SM; Prefix += OutputAssembly ? 
".ll" : ".bc"; - Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly); - if (E) + if (Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly)) return E; + SYCLSplitModule SM; SM.ModuleFilePath = Prefix; SM.Symbols = MD.makeSymbolTable(); return SM; @@ -498,9 +501,9 @@ std::optional convertStringToSplitMode(StringRef S) { Expected> splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { - ModuleDesc MD = std::move(M); SmallVector OutputImages; if (Settings.Mode == IRSplitMode::IRSM_NONE) { + ModuleDesc MD = std::move(M); std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); auto ImageOrErr = saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); @@ -511,7 +514,8 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { return OutputImages; } - EntryPointGroupVec Groups = selectEntryPointGroups(MD, Settings.Mode); + EntryPointGroupVec Groups = selectEntryPointGroups(*M, Settings.Mode); + ModuleDesc MD = std::move(M); if (Groups.size() < 2) { // FIXME(maksimsab): this branch is not tested yet. 
std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); From 93db9b24c76ed91b53f4a82cba4ca52a7c039a72 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Tue, 26 Nov 2024 06:58:39 -0800 Subject: [PATCH 13/16] remove the branch for the case of only one entry points group --- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index c88991e1be954..dde0f8a9d67a8 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -203,17 +203,7 @@ class DependencyGraph { if (!CI || !CI->isIndirectCall()) // Direct calls were handled above continue; - // TODO: consider limiting set of potential callees to functions marked - // with special attribute (like [[intel::device_indirectly_callable]]) const FunctionType *Signature = CI->getFunctionType(); - // Note: strictly speaking, virtual functions are allowed to use - // co-variant return types, i.e. we can actually miss a potential callee - // here, because it has different signature (different return type). - // However, this is not a problem for two reasons: - // - opaque pointers will be enabled at some point and will make - // signatures the same in that case - // - all virtual functions are referenced from vtable and therefore will - // anyway be preserved in a module const auto &PotentialCallees = FuncTypeToFuncsMap[Signature]; Graph[&F].insert(PotentialCallees.begin(), PotentialCallees.end()); } @@ -516,18 +506,6 @@ splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) { EntryPointGroupVec Groups = selectEntryPointGroups(*M, Settings.Mode); ModuleDesc MD = std::move(M); - if (Groups.size() < 2) { - // FIXME(maksimsab): this branch is not tested yet. 
- std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str(); - auto ImageOrErr = - saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); - if (!ImageOrErr) - return ImageOrErr.takeError(); - - OutputImages.emplace_back(std::move(*ImageOrErr)); - return OutputImages; - } - ModuleSplitter Splitter(std::move(MD), std::move(Groups)); size_t ID = 0; while (Splitter.hasMoreSplits()) { From a4e71e24c17069d69bbede1d6043a0cb946f0358 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Tue, 26 Nov 2024 07:33:04 -0800 Subject: [PATCH 14/16] edit included headers --- llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h | 1 + llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 9 --------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h index 1b46cda5d11ce..4df3e0321e9cd 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h @@ -13,6 +13,7 @@ #ifndef LLVM_SYCL_MODULE_SPLIT_H #define LLVM_SYCL_MODULE_SPLIT_H +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index dde0f8a9d67a8..a6003db7d5e3b 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -13,34 +13,27 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Bitcode/BitcodeWriterPass.h" -#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassManagerImpl.h" #include "llvm/IRPrinter/IRPrintingPasses.h" -#include "llvm/Support/CommandLine.h" #include 
"llvm/Support/Debug.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalDCE.h" -#include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/Transforms/IPO/StripSymbols.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SYCLUtils.h" -#include #include #include -#include using namespace llvm; @@ -96,8 +89,6 @@ struct EntryPointGroup { /// Annotates an llvm::Module with information necessary to perform and track /// the result of device code (llvm::Module instances) splitting: /// - entry points group from the module. -/// It also provides convenience functions for entry point set transformation -/// between llvm::Function object and string representations. class ModuleDesc { std::unique_ptr M; EntryPointGroup EntryPoints; From 96d36a43c478a07de4fa5db7ee860234e6657c0d Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Thu, 28 Nov 2024 08:22:25 -0800 Subject: [PATCH 15/16] remove unused functions --- .../include/llvm/Transforms/Utils/SYCLUtils.h | 64 +---------- llvm/lib/Transforms/Utils/SYCLUtils.cpp | 102 +----------------- 2 files changed, 2 insertions(+), 164 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h index 02c069624794d..53dec1139cd8e 100644 --- a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h +++ b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h @@ -10,74 +10,12 @@ #ifndef LLVM_TRANSFORMS_UTILS_SYCLUTILS_H #define LLVM_TRANSFORMS_UTILS_SYCLUTILS_H -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Operator.h" - -#include #include #include namespace llvm { -using CallGraphNodeAction = ::std::function; 
-using CallGraphFunctionFilter = - std::function; - -// Traverses call graph starting from given function up the call chain applying -// given action to each function met on the way. If \c ErrorOnNonCallUse -// parameter is true, then no functions' uses are allowed except calls. -// Otherwise, any function where use of the current one happened is added to the -// call graph as if the use was a call. -// The 'functionFilter' parameter is a callback function that can be used to -// control which functions will be added to a call graph. -// -// The callback is invoked whenever a function being traversed is used -// by some instruction which is not a call to this instruction (e.g. storing -// function pointer to memory) - the first parameter is the using instructions, -// the second - the function being traversed. The parent function of the -// instruction is added to the call graph depending on whether the callback -// returns 'true' (added) or 'false' (not added). -// Functions which are part of the visited set ('Visited' parameter) are not -// traversed. 
- -void traverseCallgraphUp( - llvm::Function *F, CallGraphNodeAction NodeF, - SmallPtrSetImpl &Visited, bool ErrorOnNonCallUse, - const CallGraphFunctionFilter &functionFilter = - [](const Instruction *, const Function *) { return true; }); - -template -void traverseCallgraphUp( - Function *F, CallGraphNodeActionF ActionF, - SmallPtrSetImpl &Visited, bool ErrorOnNonCallUse, - const CallGraphFunctionFilter &functionFilter = - [](const Instruction *, const Function *) { return true; }) { - traverseCallgraphUp(F, CallGraphNodeAction(ActionF), Visited, - ErrorOnNonCallUse, functionFilter); -} - -template -void traverseCallgraphUp( - Function *F, CallGraphNodeActionF ActionF, bool ErrorOnNonCallUse = true, - const CallGraphFunctionFilter &functionFilter = - [](const Instruction *, const Function *) { return true; }) { - SmallPtrSet Visited; - traverseCallgraphUp(F, CallGraphNodeAction(ActionF), Visited, - ErrorOnNonCallUse, functionFilter); -} - -/// Removes the global variable "llvm.used" and returns true on success. -/// "llvm.used" is a global constant array containing references to kernels -/// available in the module and callable from host code. The elements of -/// the array are ConstantExpr bitcast to i8*. -/// The variable must be removed as it is a) has done the job to the moment -/// of this function call and b) the references to the kernels callable from -/// host must not have users. 
-bool removeSYCLKernelsConstRefArray(Module &M); +class raw_ostream; using SYCLStringTable = std::vector>; diff --git a/llvm/lib/Transforms/Utils/SYCLUtils.cpp b/llvm/lib/Transforms/Utils/SYCLUtils.cpp index 450b9d6380feb..7ae94e044bd42 100644 --- a/llvm/lib/Transforms/Utils/SYCLUtils.cpp +++ b/llvm/lib/Transforms/Utils/SYCLUtils.cpp @@ -9,110 +9,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SYCLUtils.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { -void traverseCallgraphUp(llvm::Function *F, CallGraphNodeAction ActionF, - SmallPtrSetImpl &FunctionsVisited, - bool ErrorOnNonCallUse, - const CallGraphFunctionFilter &functionFilter) { - SmallVector Worklist; - - if (FunctionsVisited.count(F) == 0) - Worklist.push_back(F); - - while (!Worklist.empty()) { - Function *CurF = Worklist.pop_back_val(); - FunctionsVisited.insert(CurF); - // Apply the action function. - ActionF(CurF); - - // Update all callers as well. - for (auto It = CurF->use_begin(); It != CurF->use_end(); It++) { - auto FCall = It->getUser(); - auto ErrMsg = - llvm::Twine(__FILE__ " ") + - "Function use other than call detected while traversing call\n" - "graph up to a kernel"; - if (!isa(FCall)) { - // A use other than a call is met... - if (ErrorOnNonCallUse) { - // ... non-call is an error - report - llvm::report_fatal_error(ErrMsg); - } else { - // ... non-call is OK - add using function to the worklist - if (auto *I = dyn_cast(FCall)) { - if (!functionFilter(I, CurF)) { - continue; - } - - auto UseF = I->getFunction(); - - if (FunctionsVisited.count(UseF) == 0) { - Worklist.push_back(UseF); - } - } - } - } else { - auto *CI = cast(FCall); - - if ((CI->getCalledFunction() != CurF)) { - // CurF is used in a call, but not as the callee. 
- if (ErrorOnNonCallUse) - llvm::report_fatal_error(ErrMsg); - } else { - auto FCaller = CI->getFunction(); - - if (!FunctionsVisited.count(FCaller)) { - Worklist.push_back(FCaller); - } - } - } - } - } -} - -bool removeSYCLKernelsConstRefArray(Module &M) { - GlobalVariable *GV = M.getGlobalVariable("llvm.used"); - - if (!GV) - return false; - - assert(GV->user_empty() && "Unexpected llvm.used users"); - Constant *Initializer = GV->getInitializer(); - GV->setInitializer(nullptr); - GV->eraseFromParent(); - - // Destroy the initializer and all operands of it. - SmallVector IOperands; - for (auto It = Initializer->op_begin(); It != Initializer->op_end(); It++) - IOperands.push_back(cast(*It)); - assert(llvm::isSafeToDestroyConstant(Initializer) && - "Cannot remove initializer of llvm.used global"); - Initializer->destroyConstant(); - for (auto It = IOperands.begin(); It != IOperands.end(); It++) { - auto Op = (*It)->stripPointerCasts(); - auto *F = dyn_cast(Op); - if (llvm::isSafeToDestroyConstant(*It)) - (*It)->destroyConstant(); - else if (F && F->getCallingConv() == CallingConv::SPIR_KERNEL && - !F->use_empty()) { - // The element in "llvm.used" array has other users. That is Ok for - // specialization constants, but is wrong for kernels. - llvm::report_fatal_error("Unexpected usage of SYCL kernel"); - } - - // Remove unused kernel declarations to avoid LLVM IR check fails. 
- if (F && F->isDeclaration() && F->use_empty()) - F->eraseFromParent(); - } - - return true; -} - void writeSYCLStringTable(const SYCLStringTable &Table, raw_ostream &OS) { assert(Table.size() > 0 && "table should contain at least column titles"); size_t numberColumns = Table[0].size(); From 1f2039e6a0c88ea413c35d002d1751006d745658 Mon Sep 17 00:00:00 2001 From: "Sabianin, Maksim" Date: Wed, 4 Dec 2024 06:18:30 -0800 Subject: [PATCH 16/16] remove usage of referenced-indirectly func attribute --- llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp | 9 ++------- .../device-code-split/complex-indirect-call-chain.ll | 9 +++++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp index a6003db7d5e3b..e6a36a1fba969 100644 --- a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp +++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp @@ -160,8 +160,8 @@ class ModuleDesc { // bitcast, phi node, call, etc.): "A" -> "B" edge will be added to the // graph; // 2. function A performs an indirect call of a function with signature S and -// there is a function B with signature S marked with "referenced-indirectly" -// attribute. "A" -> "B" edge will be added to the graph; +// there is a function B with signature S. "A" -> "B" edge will be added to +// the graph; class DependencyGraph { public: using GlobalSet = SmallPtrSet; @@ -175,11 +175,6 @@ class DependencyGraph { if (isKernel(F)) continue; - // Only functions which are marked with "referenced-indireclty" attribute - // are considered to be indirect callee candidates. 
- if (!F.hasFnAttribute("referenced-indirectly")) - continue; - FuncTypeToFuncsMap[F.getFunctionType()].insert(&F); } diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll index 9e093dfda4f3a..1e92034c156bf 100644 --- a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll @@ -4,7 +4,7 @@ ; RUN: llvm-split -sycl-split=kernel -S < %s -o %t ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ ; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ -; RUN: --implicit-check-not @kernel_B --implicit-check-not @baz +; RUN: --implicit-check-not @kernel_B ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ ; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C ; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ @@ -14,11 +14,13 @@ ; CHECK0-DAG: define spir_kernel void @kernel_C ; CHECK0-DAG: define spir_func i32 @bar +; CHECK0-DAG: define spir_func void @baz ; CHECK0-DAG: define spir_func void @BAZ ; CHECK1-DAG: define spir_kernel void @kernel_B ; CHECK1-DAG: define {{.*}}spir_func i32 @foo ; CHECK1-DAG: define spir_func i32 @bar +; CHECK1-DAG: define spir_func void @baz ; CHECK1-DAG: define spir_func void @BAZ ; CHECK2-DAG: define spir_kernel void @kernel_A @@ -32,7 +34,7 @@ define spir_func i32 @foo(i32 (i32, void ()*)* %ptr1, void ()* %ptr2) { ret i32 %1 } -define spir_func i32 @bar(i32 %arg, void ()* %ptr) #3 { +define spir_func i32 @bar(i32 %arg, void ()* %ptr) { call spir_func void %ptr() ret i32 %arg } @@ -41,7 +43,7 @@ define spir_func void @baz() { ret void } -define spir_func void @BAZ() #3 { +define spir_func void @BAZ() { ret void } @@ -63,4 +65,3 @@ define spir_kernel void @kernel_C() #2 { attributes #0 = { "sycl-module-id"="TU1.cpp" } attributes #1 = { 
"sycl-module-id"="TU2.cpp" } attributes #2 = { "sycl-module-id"="TU3.cpp" } -attributes #3 = { "referenced-indirectly" }