-
Notifications
You must be signed in to change notification settings - Fork 835
[SYCL] Use wide alignment for sycl::marray to improve performance
#22275
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: sycl
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,12 @@ template <typename DataT, std::size_t N> class marray; | |
|
|
||
| template <typename T> struct is_device_copyable; | ||
|
|
||
| #ifdef __INTEL_PREVIEW_BREAKING_CHANGES | ||
| #ifndef MARRAY_USE_WIDE_ALIGNMENT | ||
| #define MARRAY_USE_WIDE_ALIGNMENT 1 | ||
| #endif // MARRAY_USE_WIDE_ALIGNMENT | ||
| #endif | ||
|
uditagarwal97 marked this conversation as resolved.
|
||
|
|
||
| namespace detail { | ||
|
|
||
| // Helper trait for counting the aggregate number of arguments in a type list, | ||
|
|
@@ -35,6 +41,27 @@ template <typename T, typename... Ts> struct GetMArrayArgsSize<T, Ts...> { | |
| static constexpr std::size_t value = 1 + GetMArrayArgsSize<Ts...>::value; | ||
| }; | ||
|
|
||
| // Computes the storage alignment for marray. NumElements == 3 is padded to 4 | ||
| // elements and aligned as 4 elements (matching sycl::vec); all other sizes | ||
| // keep tight packing by using the largest power-of-two alignment that divides | ||
| // the total byte size. | ||
| // The alignment guarantee is limited to 64 bytes because some host compilers | ||
| // (e.g. on Microsoft Windows) limit the maximum alignment of function | ||
| // parameters to this value. | ||
| constexpr std::size_t marrayAlignment(std::size_t NumElements, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. probably not worth the effort but is there anyway vec and marray can use the SAME affordance for calculating alignment instead of duplicating the code in two different places?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure if we should do that because (1) |
||
| std::size_t ElemSize) { | ||
| constexpr std::size_t MaxAlign = 64; | ||
| if (NumElements == 3) { | ||
| std::size_t Bytes = 4 * ElemSize; | ||
| return Bytes <= MaxAlign ? Bytes : MaxAlign; | ||
| } | ||
| std::size_t Bytes = NumElements * ElemSize; | ||
| std::size_t Align = 1; | ||
| while ((Align << 1) <= MaxAlign && (Bytes % (Align << 1)) == 0) | ||
| Align <<= 1; | ||
| return Align; | ||
|
Copilot marked this conversation as resolved.
|
||
| } | ||
|
|
||
| } // namespace detail | ||
|
|
||
| /// Provides a cross-platform math array class template that works on | ||
|
|
@@ -58,7 +85,21 @@ template <typename Type, std::size_t NumElements> class marray { | |
| using const_iterator = const Type *; | ||
|
|
||
| private: | ||
| // For NumElements == 3 this aligns to 4 elements (capped at 64), which adds | ||
| // trailing padding so sizeof == 4 * sizeof(DataT). For all other sizes the | ||
| // alignment divides the total size, so no padding is added. | ||
| static constexpr std::size_t MArrayAlignment = | ||
| detail::marrayAlignment(NumElements, sizeof(DataT)); | ||
|
|
||
| // Changing marray's alignment may change the ABI, so we need to guard it | ||
| // with a macro. User can either use -fpreview-breaking-changes or define | ||
| // MARRAY_USE_WIDE_ALIGNMENT macro and assume responsibility of ABI breakages | ||
| // if marray is passed across ABI boundary. | ||
| #ifdef MARRAY_USE_WIDE_ALIGNMENT | ||
| alignas(MArrayAlignment) value_type MData[NumElements]; | ||
| #else | ||
| value_type MData[NumElements]; | ||
| #endif | ||
|
|
||
| // Trait for checking if an argument type is either convertible to the data | ||
| // type or an array of types convertible to the data type. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,266 @@ | ||
| //==------- vec_vs_marray.cpp --- sycl::vec vs sycl::marray performance ----==// | ||
| // | ||
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Performance comparison between sycl::vec and sycl::marray. | ||
| // | ||
| // Motivation | ||
| // sycl::marray (SYCL 2020 sec. 4.14.3) is specified as a plain array-like | ||
| // type with no over-alignment requirement, whereas sycl::vec (SYCL 2020 sec. | ||
| // 4.14.2) is over-aligned to the size of the whole vector. When user code is | ||
| // migrated from sycl::vec to sycl::marray the relaxed alignment can pessimize | ||
| // vectorized memory accesses, which is especially visible for 16-bit element | ||
| // types such as sycl::half. This test measures both containers side by side | ||
| // so that such differences become visible. | ||
| // | ||
| // What is compared | ||
| // The timed kernels use only the subset of the API that is common to both | ||
| // containers, so the exact same kernel body is instantiated for each and any | ||
| // timing difference is attributable to the container type alone: | ||
| // * construction from a scalar vec(const T&) / marray(const T&) | ||
| // * copy/load C v = in[i]; | ||
| // * store/assignment out[i] = v; | ||
| // * operator[] / size() element access (smoke + init) | ||
| // * arithmetic operators + - * v * a + a | ||
| // * compound assignment v -= a | ||
| // | ||
| // sycl::vec only supports 1, 2, 3, 4, 8 and 16 components, while sycl::marray | ||
| // can have an arbitrary number of components. Sizes supported by both are | ||
| // compared head to head; sizes that only marray supports are reported on | ||
| // their own ("marray-only"). | ||
| // | ||
| // The test is informational for performance: it prints the per-(type, size) | ||
| // kernel times for both containers and highlights rows where the marray/vec | ||
| // ratio exceeds threshold. It never fails on a performance difference. | ||
| // | ||
| // | ||
| // RUN: %{build} -o %t_non_prev.out | ||
| // RUN: %{build} -fpreview-breaking-changes -o %t_prev.out | ||
| // RUN: %{run} %t_non_prev.out | ||
| // RUN: %{run} %t_prev.out | ||
| // | ||
| // UNSUPPORTED: linux || windows | ||
| // UNSUPPORTED-INTENDED: This test is intended to be run manually to compare | ||
| // the performance of vec and marray. It doesn't check or assert anything. | ||
|
|
||
| #include <sycl/detail/core.hpp> | ||
| #include <sycl/ext/oneapi/bfloat16.hpp> | ||
| #include <sycl/marray.hpp> | ||
| #include <sycl/properties/all_properties.hpp> | ||
|
uditagarwal97 marked this conversation as resolved.
|
||
| #include <sycl/usm.hpp> | ||
| #include <sycl/vector.hpp> | ||
|
|
||
| #include <algorithm> | ||
| #include <array> | ||
| #include <cstdint> | ||
| #include <iomanip> | ||
| #include <iostream> | ||
| #include <stdexcept> | ||
|
|
||
| using namespace sycl; | ||
|
|
||
| namespace { | ||
|
|
||
| // -- problem size / measurement parameters --------------------------------- | ||
| constexpr size_t NumElems = 1 << 18; // work-items per kernel launch | ||
| constexpr int ComputeIters = 64; // arithmetic iterations (compute kernel) | ||
| constexpr int Warmup = 2; // un-timed launches | ||
| constexpr int Repeats = 10; // timed launches; median is kept | ||
|
|
||
| // marray/vec ratio above which a row is highlighted (purely informational). | ||
| constexpr double threshold = 1.5; | ||
|
|
||
| // -- container introspection ------------------------------------------------ | ||
| // Extracts the element type from a vec or marray container type. | ||
| template <typename ContainerT> struct ContainerElement; | ||
| template <typename ElementT, int NumComponents> | ||
| struct ContainerElement<vec<ElementT, NumComponents>> { | ||
| using type = ElementT; | ||
| }; | ||
| template <typename ElementT, size_t NumComponents> | ||
| struct ContainerElement<marray<ElementT, NumComponents>> { | ||
| using type = ElementT; | ||
| }; | ||
| template <typename ContainerT> | ||
| using ContainerElementT = typename ContainerElement<ContainerT>::type; | ||
|
|
||
| enum class BenchKind { Stream, Compute }; | ||
|
|
||
| // One timed launch. Returns the device-side kernel duration in nanoseconds as | ||
| // reported by event profiling. | ||
| template <BenchKind Mode, typename ContainerT> | ||
| double launchOnce(queue &q, const ContainerT *in, ContainerT *out, | ||
| [[maybe_unused]] ContainerElementT<ContainerT> one) { | ||
| event e = q.parallel_for(range<1>(NumElems), [=](id<1> idx) { | ||
| const size_t i = idx[0]; | ||
| const ContainerT a = in[i]; | ||
| if constexpr (Mode == BenchKind::Stream) { | ||
| // Load + vector add + store: sensitive to element alignment. | ||
| out[i] = a + ContainerT(one); | ||
| } else { | ||
| // Arithmetic heavy: sensitive to per-element compute throughput. | ||
| ContainerT acc = a; | ||
| for (int k = 0; k < ComputeIters; ++k) { | ||
| acc = acc * a + a; // multiply + add | ||
| acc -= a; // compound subtract | ||
| } | ||
| out[i] = acc; | ||
| } | ||
| }); | ||
| e.wait_and_throw(); | ||
| const auto s = e.get_profiling_info<info::event_profiling::command_start>(); | ||
| const auto f = e.get_profiling_info<info::event_profiling::command_end>(); | ||
| return static_cast<double>(f - s); | ||
| } | ||
|
|
||
| // Warm-up + timed launches for one container type and bench kind. Returns the | ||
| // median kernel time in nanoseconds, or -1 on allocation failure (so over-sized | ||
| // configurations are skipped instead of aborting). | ||
| template <BenchKind Mode, typename ContainerT> double bench(queue &q) { | ||
| using ElementT = ContainerElementT<ContainerT>; | ||
| const ElementT one = static_cast<ElementT>(static_cast<float>(1)); | ||
|
|
||
| ContainerT *in = nullptr; | ||
| ContainerT *out = nullptr; | ||
| try { | ||
| in = malloc_device<ContainerT>(NumElems, q); | ||
| out = malloc_device<ContainerT>(NumElems, q); | ||
| if (!in || !out) | ||
| throw std::runtime_error("allocation returned null"); | ||
| } catch (const std::exception &) { | ||
| if (in) | ||
| sycl::free(in, q); | ||
| if (out) | ||
| sycl::free(out, q); | ||
| return -1.0; | ||
| } | ||
|
|
||
| // Initialise input with index-dependent values to defeat constant folding. | ||
| q.parallel_for(range<1>(NumElems), [=](id<1> idx) { | ||
| const size_t i = idx[0]; | ||
| in[i] = ContainerT(static_cast<ElementT>(static_cast<float>((i % 7) + 1))); | ||
| }).wait_and_throw(); | ||
|
|
||
| for (int w = 0; w < Warmup; ++w) | ||
| (void)launchOnce<Mode>(q, in, out, one); | ||
|
|
||
| std::array<double, Repeats> samples; | ||
| for (int r = 0; r < Repeats; ++r) | ||
| samples[r] = launchOnce<Mode>(q, in, out, one); | ||
|
|
||
| sycl::free(in, q); | ||
| sycl::free(out, q); | ||
|
|
||
| // Median of the timed launches: robust to outliers while still representative | ||
| // of typical (not just best-case) performance. | ||
| std::sort(samples.begin(), samples.end()); | ||
| if constexpr (Repeats % 2 == 1) | ||
| return samples[Repeats / 2]; | ||
| else | ||
| return 0.5 * (samples[Repeats / 2 - 1] + samples[Repeats / 2]); | ||
| } | ||
|
|
||
| // Compare vec<ElementT, NumComponents> against marray<ElementT, NumComponents> | ||
| // for one (type, size). Prints the timings and highlights the row when the | ||
| // marray/vec ratio exceeds threshold. Never affects the test's pass/fail | ||
| // status. | ||
| template <typename ElementT, int NumComponents> | ||
| void compareVecAndMarray(queue &q, const char *typeName) { | ||
| using VecT = vec<ElementT, NumComponents>; | ||
| using MarrayT = marray<ElementT, static_cast<size_t>(NumComponents)>; | ||
|
|
||
| const double vStream = bench<BenchKind::Stream, VecT>(q); | ||
| const double mStream = bench<BenchKind::Stream, MarrayT>(q); | ||
| const double vCompute = bench<BenchKind::Compute, VecT>(q); | ||
| const double mCompute = bench<BenchKind::Compute, MarrayT>(q); | ||
|
|
||
| if (vStream < 0 || mStream < 0 || vCompute < 0 || mCompute < 0) { | ||
| std::cout << " " << std::left << std::setw(8) << typeName << " x" | ||
| << NumComponents << " <skipped: allocation failed>\n"; | ||
| return; | ||
| } | ||
|
|
||
| const double eps = 1.0; | ||
| const double rStream = mStream / std::max(vStream, eps); | ||
| const double rCompute = mCompute / std::max(vCompute, eps); | ||
| const bool highlight = rStream > threshold || rCompute > threshold; | ||
|
|
||
| std::cout << " " << std::left << std::setw(8) << typeName << "x" | ||
| << std::setw(3) << NumComponents << std::right << std::fixed | ||
| << std::setprecision(2) << " stream(vec/marr ns):" << std::setw(12) | ||
| << vStream << " /" << std::setw(12) << mStream << " x" | ||
| << std::setw(6) << rStream << " compute:" << std::setw(12) | ||
| << vCompute << " /" << std::setw(12) << mCompute << " x" | ||
| << std::setw(6) << rCompute | ||
| << (highlight ? " <== LARGE DIFFERENCE" : "") << "\n"; | ||
| } | ||
|
|
||
| // Benchmark marray<ElementT, NumComponents> for a size that vec cannot | ||
| // represent. Only marray timings are printed; there is nothing to compare | ||
| // against. | ||
| template <typename ElementT, size_t NumComponents> | ||
| void benchMarrayOnly(queue &q, const char *typeName) { | ||
| using MarrayT = marray<ElementT, NumComponents>; | ||
|
|
||
| const double mStream = bench<BenchKind::Stream, MarrayT>(q); | ||
| const double mCompute = bench<BenchKind::Compute, MarrayT>(q); | ||
|
|
||
| if (mStream < 0 || mCompute < 0) { | ||
| std::cout << " " << std::left << std::setw(8) << typeName << " x" | ||
| << NumComponents | ||
| << " marray-only <skipped: allocation failed>\n"; | ||
| return; | ||
| } | ||
|
|
||
| std::cout << " " << std::left << std::setw(8) << typeName << "x" | ||
| << std::setw(3) << NumComponents << std::right << std::fixed | ||
| << std::setprecision(2) | ||
| << " marray-only stream(ns):" << std::setw(12) << mStream | ||
| << " compute(ns):" << std::setw(12) << mCompute << "\n"; | ||
| } | ||
|
|
||
| template <typename ElementT> | ||
| void benchAllSizes(queue &q, const char *typeName) { | ||
| // Sizes supported by both vec and marray: compared head to head. | ||
| compareVecAndMarray<ElementT, 2>(q, typeName); | ||
| compareVecAndMarray<ElementT, 3>(q, typeName); | ||
| compareVecAndMarray<ElementT, 4>(q, typeName); | ||
| compareVecAndMarray<ElementT, 8>(q, typeName); | ||
| compareVecAndMarray<ElementT, 16>(q, typeName); | ||
|
|
||
| // Sizes only marray supports (vec is limited to 1, 2, 3, 4, 8, 16): | ||
| // marray timings reported on their own. | ||
| benchMarrayOnly<ElementT, 5>(q, typeName); | ||
| benchMarrayOnly<ElementT, 6>(q, typeName); | ||
| benchMarrayOnly<ElementT, 7>(q, typeName); | ||
| benchMarrayOnly<ElementT, 32>(q, typeName); | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| int main() { | ||
| queue q{property::queue::enable_profiling{}}; | ||
| device dev = q.get_device(); | ||
|
|
||
| std::cout << "Device: " << dev.get_info<info::device::name>() << "\n"; | ||
| std::cout << "Highlight threshold: x" << threshold << "\n"; | ||
| std::cout << "Lower ns is better; marray/vec ratio > threshold is " | ||
| "highlighted.\n\n"; | ||
|
|
||
| benchAllSizes<int8_t>(q, "int8"); | ||
| benchAllSizes<int16_t>(q, "int16"); | ||
| benchAllSizes<int32_t>(q, "int32"); | ||
| benchAllSizes<int64_t>(q, "int64"); | ||
| if (dev.has(aspect::fp16)) | ||
| benchAllSizes<half>(q, "half"); | ||
| benchAllSizes<float>(q, "float"); | ||
| if (dev.has(aspect::fp64)) | ||
| benchAllSizes<double>(q, "double"); | ||
| benchAllSizes<sycl::ext::oneapi::bfloat16>(q, "bf16"); | ||
|
|
||
| return 0; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this breaks my heart. Do we have customers actually using the preview breaking changes flag?
I guess there is nothing to be done about it, and come 2027.0 we'll once again sweep up all this ABI breaking stuff in, but I wish we had a better approach. We work hard to get these improvements into the product, but then hide them where no one will find them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True, it's sad that we can't make this change default because it will break the ABI :(