Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
072463b
create empty slides for the release briefing 4/6
JBludau Mar 14, 2025
fd494c4
Update default target
masterleinad Mar 14, 2025
bfbff99
Add DualView changes
masterleinad Mar 14, 2025
686d0c7
added slides for graph
JBludau Apr 2, 2025
7d31172
added date of release briefing
JBludau Apr 2, 2025
05afa88
Remove changes related to skipping for host-accessible memory spaces
masterleinad Apr 2, 2025
95408b0
Add slide about kokkos_check
tpadioleau Apr 3, 2025
2e08cc0
General enhancements: add inclusive_scan an kokkos tools overhead
tretre91 Apr 3, 2025
f5a6848
Update general enhancements
tretre91 Apr 3, 2025
b0ca7b3
Update Content/ReleaseBriefings/4_6/Section_Organizational.tex
JBludau Apr 3, 2025
2fe15d8
Update Content/ReleaseBriefings/4_6/Section_Organizational.tex
JBludau Apr 3, 2025
0b6816e
Update Content/ReleaseBriefings/release-46.tex
JBludau Apr 3, 2025
2708b3b
Update Content/ReleaseBriefings/4_6/Section_Organizational.tex
JBludau Apr 3, 2025
1b218ec
Update Content/ReleaseBriefings/4_6/Section_Organizational.tex
JBludau Apr 3, 2025
8d740c6
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 3, 2025
3ad305b
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 3, 2025
7dc1c33
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 3, 2025
0c30e82
Add HIP Multi-GPU slides
tcclevenger Apr 3, 2025
ba8f0b3
incorporated Romin's comments
JBludau Apr 3, 2025
00a69c9
Update Content/ReleaseBriefings/release-46.tex
JBludau Apr 3, 2025
2a081bd
move dualView slide to deprecation section
JBludau Apr 3, 2025
a486258
changed to ornlid
JBludau Apr 3, 2025
cb0b62f
add breaking changes/deprecations
nmm0 Apr 7, 2025
4ca7485
add 4.6 release briefing bugfix slides
nmm0 Apr 7, 2025
61e0e07
Add slides to general enhancements section
ldh4 Apr 7, 2025
ebfa2bd
Remove comments
ldh4 Apr 7, 2025
169386d
Add Build System updates
diehlpk Apr 5, 2025
638b123
Add backend updates
diehlpk Apr 7, 2025
a42c9cf
adjusted style to match rest of the slides
JBludau Apr 7, 2025
5a257f2
Apply suggestions to general enhancements
tretre91 Apr 8, 2025
426ebd2
removed section header slides
JBludau Apr 8, 2025
fbbb074
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
f2146e4
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
fe9f55b
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
811701c
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
e51238d
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
ea9c4e5
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
44b6879
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
14927ea
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
8d3e202
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 8, 2025
7c182f7
Reword HIP multi-gpu bullet point
tcclevenger Apr 8, 2025
1dbcacf
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 8, 2025
9e0573b
Update Content/ReleaseBriefings/4_6/Section_GeneralEnhancements.tex
tpadioleau Apr 8, 2025
965e021
Add performance numbers for inclusive scan
tretre91 Apr 8, 2025
3be527b
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 8, 2025
ac34214
Update Content/ReleaseBriefings/4_6/Section_GeneralEnhancements.tex
JBludau Apr 8, 2025
9a044fd
Update Content/ReleaseBriefings/4_6/Section_BackendUpdates.tex
JBludau Apr 8, 2025
503ab7e
Update Content/ReleaseBriefings/4_6/Section_BackendUpdates.tex
JBludau Apr 8, 2025
01e3d19
Update Content/ReleaseBriefings/4_6/Section_BackendUpdates.tex
JBludau Apr 8, 2025
898724c
Update Content/ReleaseBriefings/4_6/Section_BackendUpdates.tex
JBludau Apr 8, 2025
59573be
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
8130980
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
53d065e
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
0f26f7d
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
c6aadf7
Update Content/ReleaseBriefings/4_6/Section_BugFixes.tex
JBludau Apr 8, 2025
c78ca02
Update Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
JBludau Apr 8, 2025
c38530d
remove hyperrefs to prs
JBludau Apr 8, 2025
2d37476
spell out signature of functor in then node
JBludau Apr 8, 2025
7896362
remove impl call from slides
JBludau Apr 8, 2025
bc29a03
add hint about no guarantees for print format
JBludau Apr 8, 2025
a8b181b
add missing escape
JBludau Apr 8, 2025
73385b2
make some space on print_config slide
JBludau Apr 8, 2025
c210bf9
Update Content/ReleaseBriefings/release-46.tex
JBludau Apr 8, 2025
c455d4c
Add a slide for spack and MI300A
cedricchevalier19 Apr 9, 2025
5ef6d7c
Update Content/ReleaseBriefings/release-46.tex
JBludau Apr 9, 2025
bc48a52
add KUG program to slides
JBludau Apr 9, 2025
befcc77
added bof and tea time slide
JBludau Apr 9, 2025
f2125d0
add scan perf results
JBludau Apr 9, 2025
2fafc0a
added perf data for tooling launch overhead
JBludau Apr 9, 2025
c923ca3
shortened print_config output
JBludau Apr 9, 2025
5439ed0
add perf results for search
JBludau Apr 9, 2025
ecb73e4
Update Content/ReleaseBriefings/4_6/Section_NewFeatures.tex
JBludau Apr 9, 2025
bc84851
Change wording in multi-GPU slide
tcclevenger Apr 9, 2025
7c76215
Some changes to interoptibility of graphs
tcclevenger Apr 9, 2025
284e0e3
Some changes to graph.then
tcclevenger Apr 9, 2025
c0d281d
put a suit on christian's plots
JBludau Apr 9, 2025
6cef403
switch order of bullet points so H100 relevant changes are close to t…
JBludau Apr 9, 2025
683b165
use default colors in the plots
JBludau Apr 9, 2025
a66044d
add a hint that the speedup is algorithm and hardware dependent
JBludau Apr 9, 2025
7b97e82
mark makefles as deprecated
JBludau Apr 9, 2025
23cb5ec
Fix error in code example
tcclevenger Apr 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions Content/ReleaseBriefings/4_6/Section_BackendUpdates.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
%==========================================================================

\begin{frame}[fragile]

{\Huge Backend Updates}

\vspace{10pt}

\end{frame}


%==========================================================================

% Examples

% note: always keep the [fragile] for your frames!

\begin{frame}[fragile]{CUDA, SYCL and Serial}
\begin{itemize}
\item SYCL: Improved sorting performance for non-contiguous views
\item Serial: Reduce fence overhead when using \texttt{Kokkos\_ENABLE\_ATOMICS\_BYPASS}
\item CUDA: Improved performance for \texttt{Kokkos::parallel\_reduce} on H100 and newer by removing limitations on the runtime thread configuration
\end{itemize}
\end{frame}

%==========================================================================
\begin{frame}[fragile]{Performance of \texttt{parallel\_reduce}}

\begin{center}
\begin{minipage}{.45\textwidth}
\begin{tikzpicture}
\begin{axis}[
title={Computationally \textbf{cheap} kernel},
ymin=0,
ymax=1.1,
xmin=-100000,
xmax=1200000,
ybar,
xtick={100000,1000000},
width=0.9\textwidth,
legend style={at={(0.3,0.75)},anchor=west},
xlabel=Num Elements,
ylabel=Runtime relative to 4.5]
\addplot coordinates {(100000,1.033333333) (1000000,0.876923077)};
\addplot coordinates {(100000,1.0) (1000000,0.6559139)};
\legend{V100,H100}
\end{axis}
\end{tikzpicture}
\end{minipage}
\begin{minipage}{.45\textwidth}
\begin{tikzpicture}
\begin{axis}[
title={Computationally \textbf{expensive} kernel},
ymin=0,
ymax=1.1,
xmin=-100000,
xmax=1200000,
ybar,
xtick={100000,1000000},
width=0.9\textwidth,
legend style={at={(0.3,0.75)},anchor=west},
xlabel=Num Elements,
% ylabel=Runtime relative to 4.5,
]
\addplot coordinates {(100000,0.7956778) (1000000,0.785453609)};
\addplot coordinates {(100000,0.7149638336) (1000000,0.6977690684)};
\legend{V100,H100}
\end{axis}
\end{tikzpicture}
\end{minipage}
\end{center}

\end{frame}
%==========================================================================

\begin{frame}[fragile]{HIP}
\begin{itemize}
\item Change block size deduction to prefer smaller blocks/teams if possible
\item Allocate memory with stream ordered semantics (\emph{i.e.}\ use \texttt{hipMallocAsync})
\item Fix a segfault when a virtual function called inside a kernel requires too many registers
\end{itemize}
\end{frame}

%==========================================================================


%==========================================================================

90 changes: 90 additions & 0 deletions Content/ReleaseBriefings/4_6/Section_BreakingChanges.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
%==========================================================================

\begin{frame}[fragile]

{\Huge Deprecations and other breaking changes}

\vspace{10pt}

\end{frame}


\begin{frame}[fragile]{Dropping support for Intel C++ Compiler Classic}
\begin{itemize}
\item Intel deprecated Intel Classic in 2022 and removed it from oneAPI 2024
\item In order to focus on newer compilers and reduce the maintenance burden, we have \textbf{removed} support for Intel Classic (oneAPI Intel/icpx is still supported of course!)
\end{itemize}
\end{frame}


\begin{frame}[fragile]{DualView changes}
\textbf{Deprecate} direct access to \texttt{d\_view} and \texttt{h\_view}
\begin{itemize}
\item Modifying the allocations in \texttt{d\_view} and \texttt{h\_view} directly is dangerous, especially if \texttt{modify} and \texttt{sync} are skipped
\item Use \texttt{view\_host()} and \texttt{view\_device()} instead
\item These two functions return by value with deprecated code enabled and by const reference otherwise. This might have performance implications if used extensively, e.g., in loop bounds.
\end{itemize}
\end{frame}


\begin{frame}[fragile]{Experimental SIMD changes}
\begin{itemize}
\item \texttt{native\_simd}, \texttt{native\_simd\_mask} \textbf{deprecated} to align with the C++26 standard
\item \textbf{Removed} obtaining a reference from SIMD \texttt{operator[]} to align with the C++26 standard
\item \textbf{Changed} the return type of SIMD \texttt{operator==} and \texttt{operator!=} to return SIMD masks instead of \texttt{bool}
\begin{itemize}
\item If you want the old behavior, use \texttt{all\_of(a == b)}
\end{itemize}
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Additional Deprecations and Removals}
\begin{itemize}
\item Already discussed deprecating the Makefile
\item StaticCrsGraph is \textbf{moved} to Kokkos Kernels and \textbf{deprecated} in Core
\begin{itemize}
\item See \url{https://github.com/kokkos/kokkos-kernels/pull/2419}
\item Symbol is in Kernels under \texttt{KokkosSparse::StaticCrsGraph}
\end{itemize}
\end{itemize}
\end{frame}
%==========================================================================

% Examples

% note: always keep the [fragile] for your frames!

%\begin{frame}[fragile]{Example list}
% \begin{itemize}
% \item Item 1
% \item Item 2 with some \texttt{code}
% \begin{itemize}
% \item Sub-item 2.1
% \item Sub-item 2.2
% \end{itemize}
% \end{itemize}
%\end{frame}

%\begin{frame}[fragile]{Example code}
% \begin{code}[keywords={std}]
% #include <iostream>
%
% int main() {
% std::cout << "hello world\n";
% }
% \end{code}
%\end{frame}

%\begin{frame}[fragile]{Example table}
% \begin{center}
% \begin{tabular}{l|l}
% a & b \\\hline
% c & d
% \end{tabular}
% \end{center}
%\end{frame}

%==========================================================================


%==========================================================================
105 changes: 105 additions & 0 deletions Content/ReleaseBriefings/4_6/Section_BugFixes.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
%==========================================================================


% Fix performance bug affecting atomic_fetch_{add,sub,min,max,and,or,xor} on integral types long and unsigned long with HIP #7816
% Performance bug in RangePolicy: construct error message if and only if the precondition is violated #7809
% Fix execution of ranges with more than 2B elements #7797
% Fix clean target when embedding Kokkos in another project #7557
% Build system: hint to ARCH_NATIVE if ARMv9 Grace arch is not explicitly supported by the compiler #7862
% Fix Zen3 flag for NVHPC #7558
% Use right arch for MI300A in makefiles #7786
% graph: nodes must be stored by the graph #7619
% Make sure lock arrays are on device before launching a graph #7685
% Cuda: fix incorrect iteration in MDRangePolicy of rank > 4 for high iteration counts #7724
% Cuda: ignore gcc assembler options in nvcc-wrapper #7492

% simd: fix a bug in scalar min/max #7813
% simd: fix a bug in non-masked reductions #7845
% Fix compiling BasicView on MSVC #7751


\begin{frame}[fragile]

{\Huge Bug Fixes}

\vspace{10pt}

\end{frame}

\begin{frame}[fragile]{General bug fixes}
\begin{itemize}
\item Fix execution of ranges with more than 2 billion elements
\item Graph:
\begin{itemize}
\item Fix graph node lifetime issues
\item Fix lock-based atomics failure when launching CUDA and HIP graphs
\end{itemize}
\item CUDA backend: Fix incorrect iteration in MDRangePolicy of rank $> 4$ for high iteration counts
\item SIMD:
\begin{itemize}
\item fix a bug in scalar min/max
\item fix a bug in non-masked reductions
\end{itemize}
\item View: fix MSVC compilation
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Build system fixes}
\begin{itemize}
\item Fix \texttt{clean} target when embedding Kokkos in another project
\item Stop generation if ARMv9 Grace arch is not explicitly supported by the compiler when \texttt{KOKKOS\_ARCH\_ARMV9\_GRACE} is specified
\begin{itemize}
\item Can still try to configure with \texttt{ARCH\_NATIVE}
\end{itemize}
\item Fix Zen3 flag for NVHPC
\item Use right arch for MI300A in makefiles
\item (CUDA) ignore gcc assembler options in \texttt{nvcc\_wrapper}
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Performance bugfixes}
\begin{itemize}
\item Fix performance bug affecting atomic\_fetch\_\{add,sub,min,max,and,or,xor\} on integral types long and unsigned long with HIP
\item Fix performance of \texttt{RangePolicy} where an error message is generated even if the precondition is not violated
\end{itemize}
\end{frame}
%==========================================================================

% Examples

% note: always keep the [fragile] for your frames!

%\begin{frame}[fragile]{Example list}
% \begin{itemize}
% \item Item 1
% \item Item 2 with some \texttt{code}
% \begin{itemize}
% \item Sub-item 2.1
% \item Sub-item 2.2
% \end{itemize}
% \end{itemize}
%\end{frame}

%\begin{frame}[fragile]{Example code}
% \begin{code}[keywords={std}]
% #include <iostream>
%
% int main() {
% std::cout << "hello world\n";
% }
% \end{code}
%\end{frame}

%\begin{frame}[fragile]{Example table}
% \begin{center}
% \begin{tabular}{l|l}
% a & b \\\hline
% c & d
% \end{tabular}
% \end{center}
%\end{frame}

%==========================================================================


%==========================================================================
43 changes: 43 additions & 0 deletions Content/ReleaseBriefings/4_6/Section_BuildSystemUpdates.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%==========================================================================

\begin{frame}[fragile]

{\Huge Build System Updates}

\vspace{10pt}

\end{frame}

%==========================================================================

% Examples

% note: always keep the [fragile] for your frames!

\begin{frame}[fragile]{New build system features}
\begin{itemize}
\item Add support for Zen 4 AMD microarchitecture (\texttt{Kokkos\_ARCH\_ZEN4})
\item Enable NVIDIA Grace architecture with NVHPC (\texttt{Kokkos\_ARCH\_ARMV9\_GRACE})
\item Support static library builds via \texttt{CMAKE\_CUDA\_RUNTIME\_LIBRARY=static} when using CUDA as CMake language
\end{itemize}

\end{frame}

%==========================================================================

\begin{frame}[fragile]{Spack support for MI300A}
\begin{itemize}
\item Spack \textit{develop} branch now supports MI300A with a new variant \textcolor{red}{\texttt{apu}}
(\href{https://github.com/spack/spack/pull/48609}{spack/spack\#48609})

\item To compile Kokkos for MI300A, forcing the APU mode, use the following command:
\texttt{spack install kokkos +rocm amdgpu\_target=gfx942 \textcolor{red}{+apu}}

% In pure CMake, this is equivalent to:
% cmake -DKokkos_ENABLE_ROCM=ON -DKokkos_ARCH_AMD_GFX942_APU=ON
\end{itemize}

\end{frame}


%==========================================================================
Loading