@@ -133,6 +133,222 @@ \section{Subviews}
133133
134134\section {Atomics }
135135
136+ \begin {frame }[fragile]{Atomics - Race condition}
137+ \begin {columns }
138+ \begin {column }{0.4\linewidth }
139+ \begin {minted }{C++}
140+ Kokkos::View<double*>
141+ v("v", 2);
142+ // Init v with
143+ // v(0) = 4
144+ // v(1) = 5
145+
146+ Kokkos::View<double> res;
147+ Kokkos::parallel_for(
148+ Kokkos::RangePolicy(0,2),
149+ KOKKOS_LAMBDA(int i) {
150+ res() = res() + v(i);
151+ });
152+ \end {minted }
153+ \end {column }
154+ \begin {column }{0.6\linewidth }
155+ Even a simple operation like '\texttt{+}' is decomposed into several smaller assembly instructions:
156+
157+ \includegraphics {race_condition1.png}
158+ \end {column }
159+ \end {columns }
160+ \end {frame }
161+
162+ % Trainees can play with the following program to check that it really presents a race condition:
163+ % #include <iostream>
164+ % #include <Kokkos_Core.hpp>
165+ %
166+ % int main(int argc, char *argv[]) {
167+ % Kokkos::initialize(argc, argv);
168+ % {
169+ % const int N = 10000;
170+ % Kokkos::View<double*> v("v", N);
171+ % Kokkos::deep_copy(v, 4);
172+ %
173+ % Kokkos::View<double> res("res");
174+ %
175+ % Kokkos::parallel_for(Kokkos::RangePolicy(0, N),
176+ % KOKKOS_LAMBDA(int i) {
177+ % //Kokkos::atomic_add(&res(), v(i));
178+ % res() = res() + v(i);
179+ % });
180+ %
181+ % double res_;
182+ %
183+ % deep_copy(res_, res);
184+ %
185+ % std::cout << "res_:" << res_ << std::endl;
186+ % std::cout << "4*N:" << 4*N << std::endl;
187+ % }
188+ % Kokkos::finalize();
189+ % }
190+
191+ \begin {frame }[fragile]{Atomics - Race condition}
192+ \begin {columns }
193+ \begin {column }{0.4\linewidth }
194+ \begin {minted }{C++}
195+ Kokkos::View<double*>
196+ v("v", 2);
197+ // Init v with
198+ // v(0) = 4
199+ // v(1) = 5
200+
201+ Kokkos::View<double> res;
202+ Kokkos::parallel_for(
203+ Kokkos::RangePolicy(0,2),
204+ KOKKOS_LAMBDA(int i) {
205+ res() = res() + v(i);
206+ });
207+ \end {minted }
208+ \end {column }
209+ \begin {column }{0.6\linewidth }
210+ When several cores run in parallel, instructions can interleave and generate \structure {race conditions}:
211+
212+ \includegraphics [width=1\textwidth ]{race_condition2.png}
213+ \end {column }
214+ \end {columns }
215+ \end {frame }
216+
217+ \begin {frame }[fragile]{Atomic operations}
218+ \begin {columns }
219+ \begin {column }{0.4\linewidth }
220+ \begin {minted }{C++}
221+ Kokkos::View<double*>
222+ v("v", 2);
223+ // Init v with
224+ // v(0) = 4
225+ // v(1) = 5
226+
227+ Kokkos::View<double> res;
228+ Kokkos::parallel_for(
229+ Kokkos::RangePolicy(0,2),
230+ KOKKOS_LAMBDA(int i) {
231+ // res() += v(i)
232+ Kokkos::atomic_add(
233+ &res(), v(i));
234+ });
235+ \end {minted }
236+ \end {column }
237+ \begin {column }{0.5\linewidth }
238+ \texttt {atomic\_ add } executes the \texttt {LOAD }, \texttt {ADD } and \texttt {STORE } in a single atomic step,
239+ guaranteeing the absence of race conditions during the addition.
240+ \includegraphics [width=1\textwidth ]{race_condition3.png}
241+ \end {column }
242+ \end {columns }
243+ \pause
244+ Note that for this example, it would be better to use a \texttt {parallel\_ reduce }.
245+ \end {frame }
246+
247+ \begin {frame }[fragile]{Atomic operations}
248+ \begin {columns }
249+ \begin {column }{0.55\linewidth }
250+ \begin {tabular }{|l|c|}
251+ \hline
252+ Operation & Replaces \\
253+ \hline
254+ Kokkos::atomic\_ add(\& x, y) & x += y \\
255+ Kokkos::atomic\_ and(\& x, y) & x \& = y \\
256+ Kokkos::atomic\_ dec(\& x) & x-- \\
257+ Kokkos::atomic\_ inc(\& x) & x++ \\
258+ Kokkos::atomic\_ lshift(\& x, y) & x = x << y \\
259+ Kokkos::atomic\_ max(\& x, y) & x = std::max(x, y) \\
260+ Kokkos::atomic\_ min(\& x, y) & x = std::min(x, y) \\
261+ Kokkos::atomic\_ mod(\& x, y) & x \% = y \\
262+ Kokkos::atomic\_ nand(\& x, y) & x = \textasciitilde{}(x \& y) \\
263+ Kokkos::atomic\_ or(\& x, y) & x |= y \\
264+ Kokkos::atomic\_ rshift(\& x, y) & x = x >> y \\
265+ Kokkos::atomic\_ sub(\& x, y) & x -= y \\
266+ Kokkos::atomic\_ store(\& x, y) & x = y \\
267+ Kokkos::atomic\_ xor(\& x, y) & x \^ {}= y \\
268+ \hline
269+ \end {tabular }
270+ \end {column }
271+
272+ \begin {column }{0.35\linewidth }
273+ Other common operations are available with the format \texttt {Kokkos::atomic\_ [op] }.
274+ \end {column }
275+ \end {columns }
276+ \end {frame }
277+
278+ \begin {frame }[fragile]{Atomic operations - fetch}
279+ \begin {columns }
280+ \begin {column }{0.4\linewidth }
281+ \begin {minted }{C++}
282+ auto old_value =
283+ atomic_fetch_[op](
284+ ptr_to_value,
285+ update_value);
286+
287+ auto new_value =
288+ atomic_[op]_fetch(
289+ ptr_to_value,
290+ update_value);
291+ \end {minted }
292+ \end {column }
293+ \begin {column }{0.6\linewidth }
294+ \texttt {atomic\_ fetch\_ [op] }:
295+ \begin {itemize }
296+ \item atomically performs the operation \texttt {[op] } with the operands \texttt {*ptr\_ to\_ value } and \texttt {update\_ value },
297+ \item returns the value \texttt {*ptr\_ to\_ value } had \structure {before} the operation.
298+ \end {itemize }
299+ \texttt {atomic\_ [op]\_ fetch }:
300+ \begin {itemize }
301+ \item atomically performs the operation \texttt {[op] } with the operands \texttt {*ptr\_ to\_ value } and \texttt {update\_ value },
302+ \item returns the value \texttt {*ptr\_ to\_ value } has \structure {after} the operation.
303+ \end {itemize }
304+ \end {column }
305+ \end {columns }
306+ \end {frame }
307+
308+ \begin {frame }[fragile]{Atomic operations - Exchange}
309+ \begin {columns }
310+ \begin {column }{0.4\linewidth }
311+ \begin {minted }{C++}
312+ old_value =
313+ atomic_exchange(
314+ ptr_to_value,
315+ desired);
316+
317+ old_value =
318+ atomic_compare_exchange(
319+ ptr_to_value,
320+ expected,
321+ desired);
322+ \end {minted }
323+ \end {column }
324+ \begin {column }{0.6\linewidth }
325+ \texttt {atomic\_ exchange }:
326+ \begin {itemize }
327+ \item atomically assigns the value \texttt {desired } to \texttt {*ptr\_ to\_ value },
328+ \item returns the value \texttt {*ptr\_ to\_ value } had before the call.
329+ \end {itemize }
330+ \texttt {atomic\_ compare\_ exchange }:
331+ \begin {itemize }
332+ \item atomically assigns the value \texttt {desired } to \texttt {*ptr\_ to\_ value }, \structure {if} the old value of \texttt {*ptr\_ to\_ value } was equal to \texttt {expected },
333+ \item returns the value \texttt {*ptr\_ to\_ value } had before the call, whether the exchange happened or not.
334+ \end {itemize }
335+ \end {column }
336+ \end {columns }
337+ \end {frame }
338+
339+ \begin {frame }[fragile]{Atomics - Performance}
340+ Atomics can have a huge impact on performance:
341+ \begin {itemize }
342+ \item the instruction itself is slower than the one it replaces,
343+ \item they may generate extra synchronisation points,
344+ \item they bypass and invalidate cache lines.
345+ \end {itemize }
346+
347+ $\Rightarrow$ Atomics should be used with care and only when strictly necessary.\linebreak
348+
349+ For some use cases, more performant alternatives exist, such as \texttt {parallel\_ reduce } or \texttt {Kokkos::ScatterView }.
350+ \end {frame }
351+
136352% _____________________________________________________________________________
137353
138354\section {Layouts }
0 commit comments