Skip to content

Commit 10fc972

Browse files
committed
First draft for section on atomics
1 parent df157ff commit 10fc972

File tree

7 files changed

+2035
-0
lines changed

7 files changed

+2035
-0
lines changed

courses/02_intermediate/main.tex

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,222 @@ \section{Subviews}
133133

134134
\section{Atomics}
135135

136+
\begin{frame}[fragile]{Atomics - Race condition}
137+
\begin{columns}
138+
\begin{column}{0.4\linewidth}
139+
\begin{minted}{C++}
140+
Kokkos::View<double*>
141+
v("v", 2);
142+
// Init v with
143+
// v(0) = 4
144+
// v(1) = 5
145+
146+
Kokkos::View<double> res;
147+
Kokkos::parallel_for(
148+
Kokkos::RangePolicy(0,2),
149+
KOKKOS_LAMBDA(int i) {
150+
res() = res() + v(i);
151+
});
152+
\end{minted}
153+
\end{column}
154+
\begin{column}{0.6\linewidth}
155+
Even simple instructions like '\texttt{+}' are decomposed into several smaller assembly instructions:
156+
157+
\includegraphics[width=1\textwidth]{race_condition1.png}
158+
\end{column}
159+
\end{columns}
160+
\end{frame}
161+
162+
% Trainees can play with the following program to check that it really presents a race condition:
163+
%#include <iostream>
164+
%#include <Kokkos_Core.hpp>
165+
%
166+
%int main(int argc, char *argv[]) {
167+
% Kokkos::initialize(argc, argv);
168+
% {
169+
% const int N = 10000;
170+
% Kokkos::View<double*> v("v", N);
171+
% Kokkos::deep_copy(v, 4);
172+
%
173+
% Kokkos::View<double> res("res");
174+
%
175+
% Kokkos::parallel_for(Kokkos::RangePolicy(0, N),
176+
% KOKKOS_LAMBDA(int i) {
177+
% //Kokkos::atomic_add(&res(), v(i));
178+
% res() = res() + v(i);
179+
% });
180+
%
181+
% double res_;
182+
%
183+
% deep_copy(res_, res);
184+
%
185+
% std::cout << "res_:" << res_ << std::endl;
186+
% std::cout << "4*N:" << 4*N << std::endl;
187+
% }
188+
% Kokkos::finalize();
189+
%}
190+
191+
\begin{frame}[fragile]{Atomics - Race condition}
192+
\begin{columns}
193+
\begin{column}{0.4\linewidth}
194+
\begin{minted}{C++}
195+
Kokkos::View<double*>
196+
v("v", 2);
197+
// Init v with
198+
// v(0) = 4
199+
// v(1) = 5
200+
201+
Kokkos::View<double> res;
202+
Kokkos::parallel_for(
203+
Kokkos::RangePolicy(0,2),
204+
KOKKOS_LAMBDA(int i) {
205+
res() = res() + v(i);
206+
});
207+
\end{minted}
208+
\end{column}
209+
\begin{column}{0.6\linewidth}
210+
When several cores run in parallel, instructions can interleave and generate \structure{race conditions}:
211+
212+
\includegraphics[width=1\textwidth]{race_condition2.png}
213+
\end{column}
214+
\end{columns}
215+
\end{frame}
216+
217+
\begin{frame}[fragile]{Atomic Operations}
218+
\begin{columns}
219+
\begin{column}{0.4\linewidth}
220+
\begin{minted}{C++}
221+
Kokkos::View<double*>
222+
v("v", 2);
223+
// Init v with
224+
// v(0) = 4
225+
// v(1) = 5
226+
227+
Kokkos::View<double> res;
228+
Kokkos::parallel_for(
229+
Kokkos::RangePolicy(0,2),
230+
KOKKOS_LAMBDA(int i) {
231+
// res() += v(i)
232+
Kokkos::atomic_add(
233+
&res(), v(i));
234+
});
235+
\end{minted}
236+
\end{column}
237+
\begin{column}{0.5\linewidth}
238+
\texttt{atomic\_add} executes the \texttt{LOAD}, \texttt{ADD} and \texttt{STORE} in a single atomic step,
239+
guaranteeing the absence of race conditions during the addition.
240+
\includegraphics[width=1\textwidth]{race_condition3.png}
241+
\end{column}
242+
\end{columns}
243+
\pause
244+
Note that for this example, it would be better to use a \texttt{parallel\_reduce}.
245+
\end{frame}
246+
247+
\begin{frame}[fragile]{Atomic operations}
248+
\begin{columns}
249+
\begin{column}{0.55\linewidth}
250+
\begin{tabular}{|l|c|}
251+
\hline
252+
Operation & Replaces \\
253+
\hline
254+
Kokkos::atomic\_add(\&x, y) & x += y \\
255+
Kokkos::atomic\_and(\&x, y) & x \&= y \\
256+
Kokkos::atomic\_dec(\&x) & x-{}- \\
257+
Kokkos::atomic\_inc(\&x) & x++ \\
258+
Kokkos::atomic\_lshift(\&x, y) & x = x \textless{}\textless{} y \\
259+
Kokkos::atomic\_max(\&x, y) & x = std::max(x, y) \\
260+
Kokkos::atomic\_min(\&x, y) & x = std::min(x, y) \\
261+
Kokkos::atomic\_mod(\&x, y) & x \%= y \\
262+
Kokkos::atomic\_nand(\&x, y) & x = \textasciitilde{}(x \& y) \\
263+
Kokkos::atomic\_or(\&x, y) & x \textbar{}= y \\
264+
Kokkos::atomic\_rshift(\&x, y) & x = x \textgreater{}\textgreater{} y \\
265+
Kokkos::atomic\_sub(\&x, y) & x -= y \\
266+
Kokkos::atomic\_store(\&x, y) & x = y \\
267+
Kokkos::atomic\_xor(\&x, y) & x \^{}= y \\
268+
\hline
269+
\end{tabular}
270+
\end{column}
271+
272+
\begin{column}{0.35\linewidth}
273+
Other common operations are available with the format \texttt{Kokkos::atomic\_[op]}.
274+
\end{column}
275+
\end{columns}
276+
\end{frame}
277+
278+
\begin{frame}[fragile]{Atomic operations - fetch}
279+
\begin{columns}
280+
\begin{column}{0.4\linewidth}
281+
\begin{minted}{C++}
282+
auto old_value =
283+
atomic_fetch_[op](
284+
ptr_to_value,
285+
update_value);
286+
287+
auto new_value =
288+
atomic_[op]_fetch(
289+
ptr_to_value,
290+
update_value);
291+
\end{minted}
292+
\end{column}
293+
\begin{column}{0.6\linewidth}
294+
\texttt{atomic\_fetch\_[op]}:
295+
\begin{itemize}
296+
\item atomically performs the operation \texttt{[op]} with the operands \texttt{*ptr\_to\_value} and \texttt{update\_value},
297+
\item returns the value \texttt{*ptr\_to\_value} had \structure{before} the operation.
298+
\end{itemize}
299+
\texttt{atomic\_[op]\_fetch}:
300+
\begin{itemize}
301+
\item atomically performs the operation \texttt{[op]} with the operands \texttt{*ptr\_to\_value} and \texttt{update\_value},
302+
\item returns the value \texttt{*ptr\_to\_value} has \structure{after} the operation.
303+
\end{itemize}
304+
\end{column}
305+
\end{columns}
306+
\end{frame}
307+
308+
\begin{frame}[fragile]{Atomic operations - Exchange}
309+
\begin{columns}
310+
\begin{column}{0.4\linewidth}
311+
\begin{minted}{C++}
312+
old_value =
313+
atomic_exchange(
314+
ptr_to_value,
315+
desired);
316+
317+
old_value =
318+
atomic_compare_exchange(
319+
ptr_to_value,
320+
expected,
321+
desired);
322+
\end{minted}
323+
\end{column}
324+
\begin{column}{0.6\linewidth}
325+
\texttt{atomic\_exchange}:
326+
\begin{itemize}
327+
\item atomically assigns the value \texttt{desired} to \texttt{*ptr\_to\_value},
328+
\item returns the value \texttt{*ptr\_to\_value} had before the call.
329+
\end{itemize}
330+
\texttt{atomic\_compare\_exchange}:
331+
\begin{itemize}
332+
\item atomically assigns the value \texttt{desired} to \texttt{*ptr\_to\_value}, \structure{if} the old value of \texttt{*ptr\_to\_value} was equal to \texttt{expected},
333+
\item returns the value \texttt{*ptr\_to\_value} had before the call, whether the exchange happened or not.
334+
\end{itemize}
335+
\end{column}
336+
\end{columns}
337+
\end{frame}
338+
339+
\begin{frame}[fragile]{Atomics - Performance}
340+
Atomics can have a huge impact on performance:
341+
\begin{itemize}
342+
\item the instruction itself is slower than the one it replaces,
343+
\item they may generate extra synchronisation points,
344+
\item they bypass and invalidate cache lines.
345+
\end{itemize}
346+
347+
$\Rightarrow$ Atomics should be used with care and only when strictly necessary.\linebreak
348+
349+
For some of your needs, more performant alternatives exist, like \texttt{parallel\_reduce} or \texttt{Kokkos::ScatterView}.
350+
\end{frame}
351+
136352
% _____________________________________________________________________________
137353

138354
\section{Layouts}

images/race_condition1.png

125 KB
Loading

0 commit comments

Comments
 (0)