diff --git a/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h
new file mode 100644
index 0000000000000..4df3e0321e9cd
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/SYCLModuleSplit.h
@@ -0,0 +1,71 @@
+//===-------- SYCLModuleSplit.h - module split ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Functionality to split a module into callgraphs. A callgraph here is a set
+// of entry points together with all functions reachable from them via a call.
+// The result of the split is a set of new modules, each containing the
+// corresponding callgraph.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYCL_MODULE_SPLIT_H
+#define LLVM_SYCL_MODULE_SPLIT_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+#include
+#include
+#include
+
+namespace llvm {
+
+class Module;
+
+/// Granularity at which splitSYCLModule() partitions the entry points of a
+/// module.
+enum class IRSplitMode {
+  // One module per translation unit: entry points are grouped by the value of
+  // their "sycl-module-id" function attribute.
+  IRSM_PER_TU,
+  // One module per kernel: every entry point forms its own group, keyed by
+  // the function name.
+  IRSM_PER_KERNEL,
+  // No splitting: the input module is emitted as a single output.
+  IRSM_NONE
+};
+
+/// Converts the textual spelling of a split mode into an IRSplitMode. The
+/// recognized spellings are "source" (IRSM_PER_TU), "kernel" (IRSM_PER_KERNEL)
+/// and "none" (IRSM_NONE).
+///
+/// \returns IRSplitMode value if \p S is recognized. Otherwise, std::nullopt is
+/// returned.
+std::optional convertStringToSplitMode(StringRef S);
+
+/// The structure represents a split LLVM Module accompanied by additional
+/// information. Split modules are stored on disk because keeping all of them
+/// in memory during the whole splitting process consumes too much RAM.
+struct SYCLSplitModule { + std::string ModuleFilePath; + std::string Symbols; + + SYCLSplitModule() = default; + SYCLSplitModule(const SYCLSplitModule &) = default; + SYCLSplitModule &operator=(const SYCLSplitModule &) = default; + SYCLSplitModule(SYCLSplitModule &&) = default; + SYCLSplitModule &operator=(SYCLSplitModule &&) = default; + + SYCLSplitModule(std::string_view File, std::string Symbols) + : ModuleFilePath(File), Symbols(std::move(Symbols)) {} +}; + +struct ModuleSplitterSettings { + IRSplitMode Mode; + bool OutputAssembly = false; // Bitcode or LLVM IR. + StringRef OutputPrefix; +}; + +/// Parses the string table. +Expected> +parseSYCLSplitModulesFromFile(StringRef File); + +/// Splits the given module \p M according to the given \p Settings. +Expected> +splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings); + +} // namespace llvm + +#endif // LLVM_SYCL_MODULE_SPLIT_H diff --git a/llvm/include/llvm/Transforms/Utils/SYCLUtils.h b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h new file mode 100644 index 0000000000000..53dec1139cd8e --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/SYCLUtils.h @@ -0,0 +1,26 @@ +//===------------ SYCLUtils.h - SYCL utility functions --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Utility functions for SYCL. 
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_UTILS_SYCLUTILS_H
+#define LLVM_TRANSFORMS_UTILS_SYCLUTILS_H
+
+#include
+#include
+
+namespace llvm {
+
+class raw_ostream;
+
+/// A table of strings stored row by row. writeSYCLStringTable() treats the
+/// first row as the column titles.
+using SYCLStringTable = std::vector>;
+
+/// Writes \p Table to \p OS, one row per line with the cells of a row joined
+/// by '|'. The first row (the column titles) is additionally wrapped in square
+/// brackets, e.g. "[Code|Symbols]".
+///
+/// \p Table must contain at least the title row, the title row must not be
+/// empty, and every row must have as many cells as the title row; these
+/// preconditions are only checked by assertions.
+void writeSYCLStringTable(const SYCLStringTable &Table, raw_ostream &OS);
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_SYCLUTILS_H
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index 65bd3080662c4..530cba5275dcb 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -82,6 +82,8 @@ add_llvm_component_library(LLVMTransformUtils
   SizeOpts.cpp
   SplitModule.cpp
   StripNonLineTableDebugInfo.cpp
+  SYCLModuleSplit.cpp
+  SYCLUtils.cpp
   SymbolRewriter.cpp
   UnifyFunctionExitNodes.cpp
   UnifyLoopExits.cpp
diff --git a/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp
new file mode 100644
index 0000000000000..e6a36a1fba969
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/SYCLModuleSplit.cpp
@@ -0,0 +1,513 @@
+//===-------- SYCLModuleSplit.cpp - split a module into callgraphs --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// See comments in the header.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SYCLModuleSplit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PassManagerImpl.h"
+#include "llvm/IRPrinter/IRPrintingPasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/Transforms/IPO/StripSymbols.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SYCLUtils.h"
+
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sycl_module_split"
+
+/// Returns true if \p F uses one of the kernel calling conventions this
+/// splitter knows about (SPIR_KERNEL or AMDGPU_KERNEL).
+static bool isKernel(const Function &F) {
+  return F.getCallingConv() == CallingConv::SPIR_KERNEL ||
+         F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+}
+
+/// Returns true if \p F is a kernel that has a definition in its module.
+static bool isEntryPoint(const Function &F) {
+  // Skip declarations, if any: they must not end up in an entry point group,
+  // otherwise the generated list of symbols would name functions that the
+  // split module does not define.
+  if (F.isDeclaration())
+    return false;
+
+  // Kernels are always considered to be entry points
+  return isKernel(F);
+}
+
+namespace {
+
+// An insertion-ordered set of the entry point functions of a split module.
+using EntryPointSet = SetVector;
+
+/// Represents a named group of entry points.
+struct EntryPointGroup {
+  // Name of the group; selectEntryPointGroups() uses the key the group was
+  // collected under (kernel name or "sycl-module-id" value).
+  std::string GroupName;
+  // Entry points that belong to the group.
+  EntryPointSet Functions;
+
+  EntryPointGroup() = default;
+  EntryPointGroup(const EntryPointGroup &) = default;
+  EntryPointGroup &operator=(const EntryPointGroup &) = default;
+  EntryPointGroup(EntryPointGroup &&) = default;
+  EntryPointGroup &operator=(EntryPointGroup &&) = default;
+
+  EntryPointGroup(StringRef GroupName,
+                  EntryPointSet Functions = EntryPointSet())
+      : GroupName(GroupName), Functions(std::move(Functions)) {}
+
+  // Prints the group name and the names of its entry points to dbgs().
+  void dump() const {
+    constexpr size_t INDENT = 4;
+    dbgs().indent(INDENT) << "ENTRY POINTS"
+                          << " " << GroupName << " {\n";
+    for (const Function *F : Functions)
+      dbgs().indent(INDENT) << " " << F->getName() << "\n";
+
+    dbgs().indent(INDENT) << "}\n";
+  }
+};
+
+/// Annotates an llvm::Module with information necessary to perform and track
+/// the result of device code (llvm::Module instances) splitting:
+/// - entry points group from the module.
+///
+/// The descriptor owns the module; it is movable but not copyable.
+class ModuleDesc {
+  std::unique_ptr M;
+  EntryPointGroup EntryPoints;
+
+public:
+  ModuleDesc() = delete;
+  ModuleDesc(const ModuleDesc &) = delete;
+  ModuleDesc &operator=(const ModuleDesc &) = delete;
+  ModuleDesc(ModuleDesc &&) = default;
+  ModuleDesc &operator=(ModuleDesc &&) = default;
+
+  // Takes ownership of \p M, which must not be null. The constructor is
+  // intentionally implicit: callers initialize a ModuleDesc directly from a
+  // module pointer.
+  ModuleDesc(std::unique_ptr M,
+             EntryPointGroup EntryPoints = EntryPointGroup())
+      : M(std::move(M)), EntryPoints(std::move(EntryPoints)) {
+    // The parameter shadows the member, hence the explicit this->.
+    assert(this->M && "Module should be non-empty");
+  }
+
+  const EntryPointSet &entries() const { return EntryPoints.Functions; }
+  const EntryPointGroup &getEntryPointGroup() const { return EntryPoints; }
+  EntryPointSet &entries() { return EntryPoints.Functions; }
+  Module &getModule() { return *M; }
+  const Module &getModule() const { return *M; }
+
+  // Cleans up module IR - removes dead globals, debug info etc.
+  void cleanup() {
+    // A fresh analysis manager is enough here: the only analysis registered
+    // is the pass instrumentation that the pass manager itself queries.
+    ModuleAnalysisManager MAM;
+    MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+    ModulePassManager MPM;
+    MPM.addPass(GlobalDCEPass());           // Delete unreachable globals.
+    MPM.addPass(StripDeadDebugInfoPass());  // Remove dead debug info.
+    MPM.addPass(StripDeadPrototypesPass()); // Remove dead func decls.
+    MPM.run(*M, MAM);
+  }
+
+  // Returns the names of all entry points of the module, each followed by a
+  // newline, in the iteration order of the entry point set.
+  std::string makeSymbolTable() const {
+    SmallString<128> ST;
+    for (const Function *F : EntryPoints.Functions) {
+      ST += F->getName();
+      ST += "\n";
+    }
+
+    return std::string(ST);
+  }
+
+  // Prints the module name and its entry point group to dbgs().
+  void dump() const {
+    dbgs() << "ModuleDesc[" << M->getName() << "] {\n";
+    EntryPoints.dump();
+    dbgs() << "}\n";
+  }
+};
+
+// Represents "dependency" or "use" graph of global objects (functions and
+// global variables) in a module. It is used during device code split to
+// understand which global variables and functions (other than entry points)
+// should be included into a split module.
+//
+// Nodes of the graph represent LLVM's GlobalObjects, edges "A" -> "B" represent
+// the fact that if "A" is included into a module, then "B" should be included
+// as well.
+//
+// Examples of dependencies which are represented in this graph:
+// - Function FA calls function FB
+// - Function FA uses global variable GA
+// - Global variable GA references (initialized with) function FB
+// - Function FA stores address of a function FB somewhere
+//
+// The following cases are treated as dependencies between global objects:
+// 1. Global object A is used by a global object B in any way (store,
+//    bitcast, phi node, call, etc.): "B" -> "A" edge will be added to the
+//    graph, i.e. the user depends on what it uses;
+// 2. function A performs an indirect call of a function with signature S and
+//    there is a function B with signature S. "A" -> "B" edge will be added to
+//    the graph;
+class DependencyGraph {
+public:
+  using GlobalSet = SmallPtrSet;
+
+  // Builds the graph for \p M. The graph stores pointers into \p M, so the
+  // module must outlive the graph.
+  DependencyGraph(const Module &M) {
+    // Group functions by their signature to handle case (2) described above
+    DenseMap
+        FuncTypeToFuncsMap;
+    for (const auto &F : M.functions()) {
+      // Kernels can't be called (either directly or indirectly) in SYCL
+      if (isKernel(F))
+        continue;
+
+      FuncTypeToFuncsMap[F.getFunctionType()].insert(&F);
+    }
+
+    for (const auto &F : M.functions()) {
+      // case (1), see comment above the class definition
+      for (const Value *U : F.users())
+        addUserToGraphRecursively(cast(U), &F);
+
+      // case (2), see comment above the class definition
+      for (const auto &I : instructions(F)) {
+        const auto *CI = dyn_cast(&I);
+        if (!CI || !CI->isIndirectCall()) // Direct calls were handled above
+          continue;
+
+        // Every non-kernel function with a matching signature is treated as a
+        // potential callee. operator[] yields an empty set for a signature
+        // that no function has.
+        const FunctionType *Signature = CI->getFunctionType();
+        const auto &PotentialCallees = FuncTypeToFuncsMap[Signature];
+        Graph[&F].insert(PotentialCallees.begin(), PotentialCallees.end());
+      }
+    }
+
+    // And every global variable (but their handling is a bit simpler)
+    for (const auto &GV : M.globals())
+      for (const Value *U : GV.users())
+        addUserToGraphRecursively(cast(U), &GV);
+  }
+
+  // Returns the direct dependencies recorded for \p Val, or an empty range if
+  // none were recorded.
+  iterator_range
+  dependencies(const GlobalValue *Val) const {
+    auto It = Graph.find(Val);
+    return (It == Graph.end())
+               ? make_range(EmptySet.begin(), EmptySet.end())
+               : make_range(It->second.begin(), It->second.end());
+  }
+
+private:
+  // Records that every global object which (transitively, through constants)
+  // uses \p V via the user \p Root depends on \p V. Despite the name, the
+  // traversal is iterative and driven by a worklist.
+  void addUserToGraphRecursively(const User *Root, const GlobalValue *V) {
+    SmallVector WorkList;
+    WorkList.push_back(Root);
+
+    while (!WorkList.empty()) {
+      const User *U = WorkList.pop_back_val();
+      if (const auto *I = dyn_cast(U)) {
+        // An instruction makes its enclosing function depend on V.
+        const auto *UFunc = I->getFunction();
+        Graph[UFunc].insert(V);
+      } else if (isa(U)) {
+        if (const auto *GV = dyn_cast(U))
+          Graph[GV].insert(V);
+        // This could be a global variable or some constant expression (like
+        // bitcast or gep).
We trace users of this constant further to reach
+        // global objects they are used by and add them to the graph.
+        for (const auto *UU : U->users())
+          WorkList.push_back(UU);
+      } else
+        llvm_unreachable("Unhandled type of function user");
+    }
+  }
+
+  // Maps a global object to the set of global objects it depends on.
+  DenseMap Graph;
+  // Always empty; gives dependencies() a range to return for unknown nodes.
+  SmallPtrSet EmptySet;
+};
+
+// Fills \p GVs with everything a split module built around
+// \p ModuleEntryPoints has to define: the entry points themselves, every
+// non-discardable global variable of \p M, and the transitive closure of
+// their dependencies according to \p DG.
+void collectFunctionsAndGlobalVariablesToExtract(
+    SetVector &GVs, const Module &M,
+    const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &DG) {
+  // We start with module entry points
+  for (const auto *F : ModuleEntryPoints.Functions)
+    GVs.insert(F);
+
+  // Non-discardable global variables are also included in the initial set
+  for (const auto &GV : M.globals())
+    if (!GV.isDiscardableIfUnused())
+      GVs.insert(&GV);
+
+  // GVs doubles as the worklist: it is indexed while it grows. A SetVector
+  // inserts a value only if it is not yet present, so every object is visited
+  // once and the loop terminates even when dependencies form a cycle.
+  size_t Idx = 0;
+  while (Idx < GVs.size()) {
+    const GlobalValue *Obj = GVs[Idx++];
+
+    for (const GlobalValue *Dep : DG.dependencies(Obj)) {
+      if (const auto *Func = dyn_cast(Dep)) {
+        // Declarations have no body to extract.
+        if (!Func->isDeclaration())
+          GVs.insert(Func);
+      } else
+        GVs.insert(Dep); // Global variables are added unconditionally
+    }
+  }
+}
+
+// Clones the module of \p MD keeping definitions only for the globals listed
+// in \p GVs, and returns a descriptor for the clone whose entry point group
+// is \p ModuleEntryPoints re-pointed at the cloned functions.
+ModuleDesc extractSubModule(const ModuleDesc &MD,
+                            const SetVector &GVs,
+                            EntryPointGroup ModuleEntryPoints) {
+  const Module &M = MD.getModule();
+  // Receives the mapping from the original values to their clones.
+  ValueToValueMapTy VMap;
+  // Clone definitions only for needed globals. Others will be added as
+  // declarations and removed later.
+  std::unique_ptr SubM = CloneModule(
+      M, VMap, [&](const GlobalValue *GV) { return GVs.count(GV); });
+  // Replace entry points with cloned ones.
+ EntryPointSet NewEPs; + const EntryPointSet &EPs = ModuleEntryPoints.Functions; + std::for_each(EPs.begin(), EPs.end(), [&](const Function *F) { + NewEPs.insert(cast(VMap[F])); + }); + ModuleEntryPoints.Functions = std::move(NewEPs); + return ModuleDesc{std::move(SubM), std::move(ModuleEntryPoints)}; +} + +// The function produces a copy of input LLVM IR module M with only those +// functions and globals that can be called from entry points that are specified +// in ModuleEntryPoints vector, in addition to the entry point functions. +ModuleDesc extractCallGraph(const ModuleDesc &MD, + EntryPointGroup ModuleEntryPoints, + const DependencyGraph &DG) { + SetVector GVs; + collectFunctionsAndGlobalVariablesToExtract(GVs, MD.getModule(), + ModuleEntryPoints, DG); + + ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); + LLVM_DEBUG(SplitM.dump()); + SplitM.cleanup(); + return SplitM; +} + +using EntryPointGroupVec = SmallVector; + +/// Module Splitter. +/// It gets a module (in a form of module descriptor, to get additional info) +/// and a collection of entry points groups. Each group specifies subset entry +/// points from input module that should be included in a split module. +class ModuleSplitter { +private: + ModuleDesc Input; + EntryPointGroupVec Groups; + DependencyGraph DG; + +private: + EntryPointGroup drawEntryPointGroup() { + assert(Groups.size() > 0 && "Reached end of entry point groups list."); + EntryPointGroup Group = std::move(Groups.back()); + Groups.pop_back(); + return Group; + } + +public: + ModuleSplitter(ModuleDesc MD, EntryPointGroupVec GroupVec) + : Input(std::move(MD)), Groups(std::move(GroupVec)), + DG(Input.getModule()) { + assert(!Groups.empty() && "Entry points groups collection is empty!"); + } + + /// Gets next subsequence of entry points in an input module and provides + /// split submodule containing these entry points and their dependencies. 
+ ModuleDesc getNextSplit() { + return extractCallGraph(Input, drawEntryPointGroup(), DG); + } + + /// Check that there are still submodules to split. + bool hasMoreSplits() const { return Groups.size() > 0; } +}; + +} // namespace + +/// Gets attached attribute value if it is present. Otherwise returns empty +/// stirng. +static StringRef computeFunctionCategoryFromStringMetadata(const Function &F, + StringRef AttrName) { + return F.getFnAttribute(AttrName).getValueAsString(); +} + +static EntryPointGroupVec selectEntryPointGroups(const Module &M, + IRSplitMode Mode) { + // std::map is used here to ensure stable ordering of entry point groups, + // which is based on their contents, this greatly helps LIT tests + std::map EntryPointsMap; + + static constexpr char ATTR_SYCL_MODULE_ID[] = "sycl-module-id"; + for (const auto &F : M.functions()) { + if (!isEntryPoint(F)) + continue; + + std::string Key; + switch (Mode) { + case IRSplitMode::IRSM_PER_KERNEL: + Key = F.getName(); + break; + case IRSplitMode::IRSM_PER_TU: + Key = computeFunctionCategoryFromStringMetadata(F, ATTR_SYCL_MODULE_ID); + break; + case IRSplitMode::IRSM_NONE: + llvm_unreachable(""); + } + + EntryPointsMap[Key].insert(&F); + } + + EntryPointGroupVec Groups; + if (EntryPointsMap.empty()) { + // No entry points met, record this. 
+    Groups.emplace_back("-", EntryPointSet());
+  } else {
+    Groups.reserve(EntryPointsMap.size());
+    // Map iteration yields the groups sorted by key. The sets are not needed
+    // in the map any more, so they are moved out rather than copied.
+    for (auto &[Key, EntryPoints] : EntryPointsMap)
+      Groups.emplace_back(Key, std::move(EntryPoints));
+  }
+
+  return Groups;
+}
+
+/// Writes \p M to the file \p FilePath, as textual LLVM IR if
+/// \p OutputAssembly is true and as bitcode otherwise.
+///
+/// \returns an error naming \p FilePath if the file cannot be opened or if
+/// writing or closing it fails.
+static Error saveModuleIRInFile(Module &M, StringRef FilePath,
+                                bool OutputAssembly) {
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForWrite(FilePath, FD))
+    return createFileError(FilePath, EC);
+
+  raw_fd_ostream OS(FD, /*shouldClose=*/true);
+  ModulePassManager MPM;
+  ModuleAnalysisManager MAM;
+  MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+  if (OutputAssembly)
+    MPM.addPass(PrintModulePass(OS));
+  else
+    MPM.addPass(BitcodeWriterPass(OS));
+
+  MPM.run(M, MAM);
+
+  // Flush and close explicitly so that a failed write (e.g. disk full) is
+  // reported to the caller. An error that is still pending when the stream is
+  // destroyed would abort the process via report_fatal_error instead.
+  OS.close();
+  if (OS.has_error()) {
+    std::error_code EC = OS.error();
+    OS.clear_error();
+    return createFileError(FilePath, EC);
+  }
+
+  return Error::success();
+}
+
+/// Saves the module of \p MD to "<Prefix>.ll" or "<Prefix>.bc", depending on
+/// \p OutputAssembly, and returns the path of the written file together with
+/// the symbol table of \p MD.
+static Expected
+saveModuleDesc(ModuleDesc &MD, std::string Prefix, bool OutputAssembly) {
+  Prefix += OutputAssembly ? ".ll" : ".bc";
+  // Error is move-only; the explicit move keeps the conversion to Expected
+  // well-formed for compilers that do not apply the C++20 implicit-move rules
+  // to converting returns.
+  if (Error E = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly))
+    return std::move(E);
+
+  SYCLSplitModule SM;
+  SM.ModuleFilePath = Prefix;
+  SM.Symbols = MD.makeSymbolTable();
+  return SM;
+}
+
+namespace llvm {
+
+Expected>
+parseSYCLSplitModulesFromFile(StringRef File) {
+  auto EntriesMBOrErr = llvm::MemoryBuffer::getFile(File);
+  if (!EntriesMBOrErr)
+    return createFileError(File, EntriesMBOrErr.getError());
+
+  // The first line has to be exactly the table header.
+  line_iterator LI(**EntriesMBOrErr);
+  if (LI.is_at_eof() || *LI != "[Code|Symbols]")
+    return createStringError(inconvertibleErrorCode(),
+                             "invalid SYCL Table file.");
+
+  // "Code" and "Symbols" at the moment.
+  static constexpr int NUMBER_COLUMNS = 2;
+  ++LI; // Move past the header row.
+  SmallVector Modules;
+  while (!LI.is_at_eof()) {
+    StringRef Line = *LI;
+    // NOTE(review): line_iterator is constructed with its default options,
+    // which presumably skip blank lines - if so this check never fires;
+    // confirm.
+    if (Line.empty())
+      return createStringError("invalid SYCL table row.");
+
+    // A row is "<code>|<symbols>": exactly NUMBER_COLUMNS cells.
+    SmallVector Parts;
+    Line.split(Parts, "|");
+    if (Parts.size() != NUMBER_COLUMNS)
+      return createStringError("invalid SYCL Table row.");
+
+    auto [IRFilePath, SymbolsFilePath] = std::tie(Parts[0], Parts[1]);
+    if (SymbolsFilePath.empty())
+      return createStringError("invalid SYCL Table row.");
+
+    // Only the symbols file is read here; the IR file path is recorded as is.
+    auto MBOrErr = MemoryBuffer::getFile(SymbolsFilePath);
+    if (!MBOrErr)
+      return createFileError(SymbolsFilePath, MBOrErr.getError());
+
+    auto &MB2 = *MBOrErr;
+    std::string Symbols =
+        std::string(MB2->getBufferStart(), MB2->getBufferEnd());
+    Modules.emplace_back(IRFilePath, std::move(Symbols));
+    ++LI;
+  }
+
+  return Modules;
+}
+
+// Maps "source", "kernel" and "none" to the corresponding IRSplitMode; any
+// other spelling yields std::nullopt. The lookup table is built once, on the
+// first call.
+std::optional convertStringToSplitMode(StringRef S) {
+  static const StringMap Values = {
+      {"source", IRSplitMode::IRSM_PER_TU},
+      {"kernel", IRSplitMode::IRSM_PER_KERNEL},
+      {"none", IRSplitMode::IRSM_NONE}};
+
+  auto It = Values.find(S);
+  if (It == Values.end())
+    return std::nullopt;
+
+  return It->second;
+}
+
+// Writes the split modules to "<OutputPrefix>_<N>.ll" or
+// "<OutputPrefix>_<N>.bc", with N counting up from 0, and returns one
+// SYCLSplitModule per written file. The first failure to save a module stops
+// the split and is returned as the error.
+Expected>
+splitSYCLModule(std::unique_ptr M, ModuleSplitterSettings Settings) {
+  SmallVector OutputImages;
+  if (Settings.Mode == IRSplitMode::IRSM_NONE) {
+    // No splitting requested: save the input module unchanged as output 0.
+    ModuleDesc MD = std::move(M);
+    std::string OutIRFileName = (Settings.OutputPrefix + Twine("_0")).str();
+    auto ImageOrErr =
+        saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly);
+    if (!ImageOrErr)
+      return ImageOrErr.takeError();
+
+    OutputImages.emplace_back(std::move(*ImageOrErr));
+    return OutputImages;
+  }
+
+  // The groups point at functions of M, so they are selected before the
+  // module is handed over to the splitter.
+  EntryPointGroupVec Groups = selectEntryPointGroups(*M, Settings.Mode);
+  ModuleDesc MD = std::move(M);
+  ModuleSplitter Splitter(std::move(MD), std::move(Groups));
+  size_t ID = 0;
+  while (Splitter.hasMoreSplits()) {
+    // This MD shadows the moved-from descriptor declared above.
+    ModuleDesc MD = Splitter.getNextSplit();
+
+    std::string OutIRFileName = (Settings.OutputPrefix + "_" + Twine(ID)).str();
+    auto
SplitImageOrErr = + saveModuleDesc(MD, OutIRFileName, Settings.OutputAssembly); + if (!SplitImageOrErr) + return SplitImageOrErr.takeError(); + + OutputImages.emplace_back(std::move(*SplitImageOrErr)); + ++ID; + } + + return OutputImages; +} + +} // namespace llvm diff --git a/llvm/lib/Transforms/Utils/SYCLUtils.cpp b/llvm/lib/Transforms/Utils/SYCLUtils.cpp new file mode 100644 index 0000000000000..7ae94e044bd42 --- /dev/null +++ b/llvm/lib/Transforms/Utils/SYCLUtils.cpp @@ -0,0 +1,27 @@ +//===------------ SYCLUtils.cpp - SYCL utility functions ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// SYCL utility functions. +//===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/SYCLUtils.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +void writeSYCLStringTable(const SYCLStringTable &Table, raw_ostream &OS) { + assert(Table.size() > 0 && "table should contain at least column titles"); + size_t numberColumns = Table[0].size(); + assert(numberColumns > 0 && "table should be non-empty"); + OS << '[' << join(Table[0].begin(), Table[0].end(), "|") << "]\n"; + for (size_t I = 1, E = Table.size(); I != E; ++I) { + assert(Table[I].size() == numberColumns && "row's size should be equal"); + OS << join(Table[I].begin(), Table[I].end(), "|") << '\n'; + } +} + +} // namespace llvm diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll new file mode 100644 index 0000000000000..6b0305d12400f --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/amd-kernel-split.ll @@ -0,0 +1,17 @@ +; -- 
Per-kernel split +; RUN: llvm-split -sycl-split=kernel -S < %s -o %tC +; RUN: FileCheck %s -input-file=%tC_0.ll --check-prefixes CHECK-A0 +; RUN: FileCheck %s -input-file=%tC_1.ll --check-prefixes CHECK-A1 + +define dso_local amdgpu_kernel void @Kernel1() { + ret void +} + +define dso_local amdgpu_kernel void @Kernel2() { + ret void +} + +; CHECK-A0: define dso_local amdgpu_kernel void @Kernel2() +; CHECK-A0-NOT: define dso_local amdgpu_kernel void @Kernel1() +; CHECK-A1-NOT: define dso_local amdgpu_kernel void @Kernel2() +; CHECK-A1: define dso_local amdgpu_kernel void @Kernel1() diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll new file mode 100644 index 0000000000000..3734153b9fbaa --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-1.ll @@ -0,0 +1,120 @@ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +; CHECK-TU1-NOT: @{{.*}}GV{{.*}} +; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() 
#0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; 
CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll new file mode 100644 index 0000000000000..2e3d2e5e55c9b --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/auto-module-split-func-ptr.ll @@ -0,0 +1,50 @@ +; This test checks that we can properly perform device code split by tracking +; all uses of functions (not only direct calls) + +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 + +; CHECK-SYM0: kernel2 +; CHECK-SYM1: kernel1 +; +; CHECK-IR0: define dso_local spir_kernel void @kernel2 +; +; CHECK-IR1: @_Z2f1iTable = weak global ptr @_Z2f1i +; CHECK-IR1: define {{.*}} 
i32 @_Z2f1i +; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spirv64-unknown-unknown" + +@_Z2f1iTable = weak global ptr @_Z2f1i, align 8 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn +define dso_local spir_func i32 @_Z2f1i(i32 %a) #0 { +entry: + ret i32 %a +} + +; Function Attrs: convergent norecurse +define weak_odr dso_local spir_kernel void @kernel1() #1 { +entry: + %0 = call i32 @indirect_call(ptr addrspace(4) addrspacecast ( ptr getelementptr inbounds ( [1 x ptr] , ptr @_Z2f1iTable, i64 0, i64 0) to ptr addrspace(4)), i32 0) + ret void +} + +; Function Attrs: convergent norecurse +define dso_local spir_kernel void @kernel2() #2 { +entry: + ret void +} + +declare dso_local spir_func i32 @indirect_call(ptr addrspace(4), i32) local_unnamed_addr + +attributes #0 = { mustprogress nofree norecurse nosync nounwind readnone willreturn } +attributes #1 = { convergent norecurse "sycl-module-id"="TU1.cpp" } +attributes #2 = { convergent norecurse "sycl-module-id"="TU2.cpp" } + +; CHECK: kernel1 +; CHECK: kernel2 diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll new file mode 100644 index 0000000000000..a916fdfa82b76 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/basic-module-split.ll @@ -0,0 +1,122 @@ +; RUN: llvm-split -sycl-split=source -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT + +; ModuleID = 'basic-module-split.ll' +source_filename = "basic-module-split.ll" +target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +$_Z3barIiET_S0_ = comdat any + +;CHECK-TU1-NOT: @{{.*}}GV{{.*}} +;CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; 
CHECK-TU1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-TU1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in both modules. 
+; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll new file mode 100644 index 0000000000000..1e92034c156bf --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/complex-indirect-call-chain.ll @@ -0,0 +1,67 @@ +; Check that Module splitting can trace through more complex call stacks +; involving several nested indirect calls. + +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @BAZ --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C + +; CHECK0-DAG: define spir_kernel void @kernel_C +; CHECK0-DAG: define spir_func i32 @bar +; CHECK0-DAG: define spir_func void @baz +; CHECK0-DAG: define spir_func void @BAZ + +; CHECK1-DAG: define spir_kernel void @kernel_B +; CHECK1-DAG: define {{.*}}spir_func i32 @foo +; CHECK1-DAG: define spir_func i32 @bar +; CHECK1-DAG: define spir_func void @baz +; CHECK1-DAG: define spir_func void @BAZ + +; CHECK2-DAG: define spir_kernel void @kernel_A +; CHECK2-DAG: define {{.*}}spir_func void @baz + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define
spir_func i32 @foo(i32 (i32, void ()*)* %ptr1, void ()* %ptr2) { + %1 = call spir_func i32 %ptr1(i32 42, void ()* %ptr2) + ret i32 %1 +} + +define spir_func i32 @bar(i32 %arg, void ()* %ptr) { + call spir_func void %ptr() + ret i32 %arg +} + +define spir_func void @baz() { + ret void +} + +define spir_func void @BAZ() { + ret void +} + +define spir_kernel void @kernel_A() #0 { + call spir_func void @baz() + ret void +} + +define spir_kernel void @kernel_B() #1 { + call spir_func i32 @foo(i32 (i32, void ()*)* null, void ()* null) + ret void +} + +define spir_kernel void @kernel_C() #2 { + call spir_func i32 @bar(i32 42, void ()* null) + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "sycl-module-id"="TU3.cpp" } diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll new file mode 100644 index 0000000000000..ddb0ea0b3c59a --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/one-kernel-per-module.ll @@ -0,0 +1,135 @@ +; Test checks "kernel" splitting mode. 
+ +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t.files +; RUN: FileCheck %s -input-file=%t.files_0.ll --check-prefixes CHECK-MODULE0,CHECK +; RUN: FileCheck %s -input-file=%t.files_0.sym --check-prefixes CHECK-MODULE0-TXT +; RUN: FileCheck %s -input-file=%t.files_1.ll --check-prefixes CHECK-MODULE1,CHECK +; RUN: FileCheck %s -input-file=%t.files_1.sym --check-prefixes CHECK-MODULE1-TXT +; RUN: FileCheck %s -input-file=%t.files_2.ll --check-prefixes CHECK-MODULE2,CHECK +; RUN: FileCheck %s -input-file=%t.files_2.sym --check-prefixes CHECK-MODULE2-TXT + +; ModuleID = 'one-kernel-per-module.ll' +source_filename = "one-kernel-per-module.ll" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +$_Z3barIiET_S0_ = comdat any + +;CHECK-MODULE2-NOT: @{{.*}}GV{{.*}} +;CHECK-MODULE1-NOT: @{{.*}}GV{{.*}} +;CHECK-MODULE0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +; CHECK-MODULE2: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE2-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE1-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-MODULE1-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-MODULE2: call spir_func void @{{.*}}foo{{.*}}() + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @_Z3foov() + ret void +} + +; CHECK-MODULE2: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-MODULE1-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() +; CHECK-MODULE0-NOT: define {{.*}} spir_func void @{{.*}}foo{{.*}}() + +; CHECK-MODULE2: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + +define dso_local spir_func void @_Z3foov() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @_Z3barIiET_S0_(i32 1) + %add = add nsw i32 2, %call + store i32 %add, ptr %a, align 4 + ret void +} + +; 
CHECK-MODULE2: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-MODULE1-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-MODULE0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, ptr %arg.addr, align 4 + %0 = load i32, ptr %arg.addr, align 4 + ret i32 %0 +} + +; CHECK-MODULE2-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE2-TXT-NOT: {{.*}}TU0_kernel1{{.*}} +; CHECK-MODULE1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-MODULE0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-MODULE0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-MODULE1: call spir_func void @{{.*}}foo1{{.*}}() + +define dso_local spir_kernel void @TU0_kernel1() #0 { +entry: + call spir_func void @_Z4foo1v() + ret void +} + +; CHECK-MODULE2-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-MODULE1: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() +; CHECK-MODULE0-NOT: define {{.*}} spir_func void @{{.*}}foo1{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo1v() { +entry: + %a = alloca i32, align 4 + store i32 2, ptr %a, align 4 + ret void +} + +; CHECK-MODULE2-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE2-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-MODULE1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-MODULE0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-MODULE0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-MODULE0: call spir_func void @{{.*}}foo2{{.*}}() + +define dso_local spir_kernel void @TU1_kernel() #1 { +entry: + call spir_func void @_Z4foo2v() + ret void +} + +; CHECK-MODULE2-NOT: define 
{{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-MODULE1-NOT: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() +; CHECK-MODULE0: define {{.*}} spir_func void @{{.*}}foo2{{.*}}() + +; Function Attrs: nounwind +define dso_local spir_func void @_Z4foo2v() { +entry: + %a = alloca i32, align 4 +; CHECK-MODULE0: %0 = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @{{.*}}GV{{.*}} to ptr addrspace(4)), align 4 + %0 = load i32, ptr addrspace(4) getelementptr inbounds ([1 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(1) @_ZL2GV to ptr addrspace(4)), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, ptr %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +; Metadata is saved in all modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll new file mode 100644 index 0000000000000..921b7c22fc365 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -0,0 +1,105 @@ +; The test checks that Module splitting correctly separates kernels +; that use reqd_sub_group_size attributes from kernels which don't use them +; regardless of device code split mode + +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_sub_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_sub_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_sub_group_size attribute used + +; RUN: llvm-split -sycl-split=kernel -S %s -o %t +; RUN: FileCheck %s
-input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + 
ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll new file mode 100644 index 0000000000000..2ca8b220edfbe --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/per-reqd-wg-size-split-1.ll @@ -0,0 +1,105 @@ +; The test checks that Module splitting correctly separates kernels +; that use reqd_work_group_size attributes from kernels which don't use them +; regardless of device code split mode + +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_work_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_work_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_work_group_size attribute used + +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes
CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local 
spir_kernel void @TU0_kernel1() #0 !reqd_work_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} diff --git a/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll new file mode 100644 index 0000000000000..4ba15ecdefea6 --- /dev/null +++ b/llvm/test/tools/llvm-split/SYCL/device-code-split/split-with-kernel-declarations.ll @@ -0,0 +1,53 @@ +; The test checks that Module splitting does not treat declarations as entry points. 
+ +; RUN: llvm-split -sycl-split=kernel -S < %s -o %t2 +; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE +; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1 +; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2 +; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0 + +; With per-kernel split, there should be three device images +; CHECK-PER-KERNEL-TABLE: [Code|Symbols] +; CHECK-PER-KERNEL-TABLE: {{.*}}_0.ll|{{.*}}_0.sym +; CHECK-PER-KERNEL-TABLE-NEXT: {{.*}}_1.ll|{{.*}}_1.sym +; CHECK-PER-KERNEL-TABLE-NEXT: {{.*}}_2.ll|{{.*}}_2.sym +; CHECK-PER-KERNEL-TABLE-EMPTY: +; +; CHECK-PER-KERNEL-SYM0-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM0: _ZTSZ4mainE10TU1_kernel0 +; CHECK-PER-KERNEL-SYM0-EMPTY: +; +; CHECK-PER-KERNEL-SYM2-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM2: _ZTSZ4mainE11TU0_kernel0 +; CHECK-PER-KERNEL-SYM2-EMPTY: +; +; CHECK-PER-KERNEL-SYM1-NOT: _ZTS4mainE10TU1_kernel1 +; CHECK-PER-KERNEL-SYM1: _ZTSZ4mainE11TU0_kernel1 +; CHECK-PER-KERNEL-SYM1-EMPTY: + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +entry: + ret void +} + +define spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +entry: + ret void +} + +define spir_kernel void @_ZTSZ4mainE10TU1_kernel0() #1 { + ret void +} + +declare spir_kernel void @_ZTS4mainE10TU1_kernel1() #1 + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} diff --git a/llvm/tools/llvm-split/llvm-split.cpp b/llvm/tools/llvm-split/llvm-split.cpp index c456403e6bc68..e40bf71026472 100644 --- a/llvm/tools/llvm-split/llvm-split.cpp +++ b/llvm/tools/llvm-split/llvm-split.cpp @@ -19,6 +19,7 @@ #include 
"llvm/MC/TargetRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" @@ -27,8 +28,13 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/SYCLModuleSplit.h" +#include "llvm/Transforms/Utils/SYCLUtils.h" #include "llvm/Transforms/Utils/SplitModule.h" +#include +#include + using namespace llvm; static cl::OptionCategory SplitCategory("Split Options"); @@ -70,6 +76,67 @@ static cl::opt<std::string> MCPU("mcpu", cl::desc("Target CPU, ignored if -mtriple is not used"), cl::value_desc("cpu"), cl::cat(SplitCategory)); +cl::opt<IRSplitMode> SYCLSplitMode( + "sycl-split", cl::desc("module split mode"), cl::Optional, + cl::init(IRSplitMode::IRSM_NONE), + cl::values(clEnumValN(IRSplitMode::IRSM_PER_TU, "source", + "1 output module per translation unit"), + clEnumValN(IRSplitMode::IRSM_PER_KERNEL, "kernel", + "1 output module per kernel")), + cl::cat(SplitCategory)); + +cl::opt<bool> OutputAssembly{"S", cl::desc("Write output as LLVM assembly"), + cl::cat(SplitCategory)}; + +/// Writes \p Content plus a trailing newline to \p Path; exits on open failure. +void writeStringToFile(std::string_view Content, StringRef Path) { + std::error_code EC; + raw_fd_ostream OS(Path, EC); + if (EC) { + errs() << formatv("error opening file: {0}\n", Path); + exit(1); + } + + OS << Content << "\n"; +} + +/// Writes a "Path_I.sym" symbols file per module and the "Path.table" index. +void writeSplitModulesAsTable(ArrayRef<SYCLSplitModule> SplitModules, + StringRef Path) { + std::vector<std::string> Columns = {"Code", "Symbols"}; + SYCLStringTable Table; + Table.emplace_back(std::move(Columns)); + for (const auto &[I, SM] : enumerate(SplitModules)) { + std::string SymbolsFile = (Twine(Path) + "_" + Twine(I) + ".sym").str(); + writeStringToFile(SM.Symbols, SymbolsFile); + std::vector<std::string> Row = {SM.ModuleFilePath, SymbolsFile}; + Table.emplace_back(std::move(Row)); + } + + std::error_code EC; + raw_fd_ostream OS((Path + ".table").str(), EC); +
if (EC) { + errs() << formatv("error opening file: {0}.table\n", Path); + exit(1); + } + + writeSYCLStringTable(Table, OS); +} + +/// Splits \p M per the -sycl-split mode and writes the resulting table files. +Error runSYCLSplitModule(std::unique_ptr<Module> M) { + ModuleSplitterSettings Settings; + Settings.Mode = SYCLSplitMode; + Settings.OutputAssembly = OutputAssembly; + Settings.OutputPrefix = OutputFilename; + auto SplitModulesOrErr = splitSYCLModule(std::move(M), Settings); + if (!SplitModulesOrErr) + return SplitModulesOrErr.takeError(); + + writeSplitModulesAsTable(*SplitModulesOrErr, OutputFilename); + return Error::success(); +} + int main(int argc, char **argv) { InitLLVM X(argc, argv); @@ -123,6 +190,16 @@ int main(int argc, char **argv) { Out->keep(); }; + if (SYCLSplitMode != IRSplitMode::IRSM_NONE) { + if (Error E = runSYCLSplitModule(std::move(M))) { + // logAllUnhandledErrors consumes E; signal the failure via the exit code. + logAllUnhandledErrors(std::move(E), errs(), "error: "); + return 1; + } + + return 0; + } + if (TM) { if (PreserveLocals) { errs() << "warning: -preserve-locals has no effect when using "