@@ -15,13 +15,18 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/platform/collective_helper.h"
+#include "paddle/phi/kernels/activation_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
+#include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/math.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/funcs/softmax_impl.h"
+#include "paddle/phi/kernels/reduce_max_kernel.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
 #include "paddle/utils/string/string_helper.h"

@@ -213,36 +218,33 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});

-    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
-
     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max.Resize({N, 1});
     dev_ctx.template Alloc<T>(&logits_max);

-    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
-    Eigen::DSizes<int, 1> along_axis(1);
-    eigen_logits_max.device(*dev_ctx.eigen_device()) =
-        eigen_logits.maximum(along_axis);
+    phi::MaxKernel<T, phi::GPUContext>(
+        dev_ctx, logits_2d, {-1}, true, &logits_max);

     comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream);

     // step 2, obtain logit - logit_max
-    Eigen::DSizes<int, 2> batch_by_one(N, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, D);
-
-    eigen_softmax.device(*dev_ctx.eigen_device()) =
-        (eigen_logits -
-         eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class));
+    std::vector<const phi::DenseTensor*> inputs = {&logits_2d, &logits_max};
+    std::vector<phi::DenseTensor*> outputs = {&softmax_2d};
+    phi::funcs::BroadcastKernel<T>(
+        dev_ctx, inputs, &outputs, phi::funcs::SubtractFunctor<T>());

     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
     predicted_logits.Resize({N, 1});
     dev_ctx.template Alloc<T>(&predicted_logits);

-    auto t = phi::EigenVector<T>::Flatten(predicted_logits);
-    t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+    phi::FullKernel<T, phi::GPUContext>(
+        dev_ctx,
+        common::vectorize(predicted_logits.dims()),
+        0,
+        predicted_logits.dtype(),
+        &predicted_logits);

     const int64_t start_index = rank * D;
     const int64_t end_index = start_index + D;
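The hunk above keeps the same numerically stable recipe but routes it through phi's reusable kernels: MaxKernel with keep_dim=true yields the {N, 1} per-row maxima, the ncclMax all-reduce promotes the shard-local maxima to global ones, and BroadcastKernel pairs the {N, D} logits with the {N, 1} maxima, broadcasting along the class dimension just as the removed Eigen reshape/broadcast did; FullKernel replaces the Eigen constant-fill for zeroing predicted_logits. Below is a minimal sketch of the broadcast-subtract pattern, assuming the headers added in this PR; the SubtractRowMax helper is hypothetical and not part of the change:

```cpp
#include <vector>

#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"

// Hypothetical helper sketching step 2's broadcast subtract; the {N, 1}
// maxima are implicitly broadcast across the D class columns.
template <typename T>
void SubtractRowMax(const phi::GPUContext& dev_ctx,
                    const phi::DenseTensor& logits_2d,   // {N, D}
                    const phi::DenseTensor& logits_max,  // {N, 1}
                    phi::DenseTensor* out) {             // {N, D}
  std::vector<const phi::DenseTensor*> ins = {&logits_2d, &logits_max};
  std::vector<phi::DenseTensor*> outs = {out};
  phi::funcs::BroadcastKernel<T>(
      dev_ctx, ins, &outs, phi::funcs::SubtractFunctor<T>());
}
```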
@@ -309,7 +311,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream);

     // step 4, obtain exp(logit)
-    eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
+    phi::ExpKernel<T, phi::GPUContext>(dev_ctx, softmax_2d, &softmax_2d);

     // step 5, obtain sum_exp_logits
     phi::DenseTensor sum_exp_logits;
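ExpKernel computes the exponential in place over softmax_2d, matching the removed in-place Eigen assignment. Because step 2 already subtracted the global maximum, every input to the exponential is at most zero, so the results stay in (0, 1] and cannot overflow. A tiny host-side illustration of that bound:

```cpp
#include <cassert>
#include <cmath>

int main() {
  // Any shifted logit is (logit - global_max) <= 0 ...
  const float shifted = -3.5f;
  const float e = std::exp(shifted);
  // ... so its exponential lies in (0, 1] and cannot overflow.
  assert(e > 0.0f && e <= 1.0f);
  return 0;
}
```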
@@ -362,11 +364,13 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
       }
     }

-    auto eigen_sum_exp_logits =
-        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
-    eigen_softmax.device(*dev_ctx.eigen_device()) =
-        (eigen_softmax *
-         eigen_sum_exp_logits.inverse().broadcast(one_by_class));
+    phi::ReciprocalKernel<T, phi::GPUContext>(
+        dev_ctx, sum_exp_logits, &sum_exp_logits);
+
+    inputs = std::vector<const phi::DenseTensor*>{&softmax_2d, &sum_exp_logits};
+    outputs = std::vector<phi::DenseTensor*>{&softmax_2d};
+    phi::funcs::BroadcastKernel<T>(
+        dev_ctx, inputs, &outputs, phi::funcs::MultiplyFunctor<T>());
 #endif
   }
 };
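In this last hunk, ReciprocalKernel plus a broadcast MultiplyFunctor stand in for Eigen's inverse().broadcast() to finish the normalization. As a reference for what steps 1 through 5 compute per row, here is a single-rank, host-side sketch (illustration only; in the kernel the row maxima and the exponential sums are each all-reduced across ranks before use):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Single-rank reference of steps 1-5; not part of the PR.
std::vector<float> StableSoftmaxRow(const std::vector<float>& logits) {
  // step 1: row maximum (all-reduced with ncclMax in the kernel)
  const float row_max = *std::max_element(logits.begin(), logits.end());
  std::vector<float> out(logits.size());
  float sum_exp = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    out[i] = std::exp(logits[i] - row_max);  // steps 2 and 4
    sum_exp += out[i];  // step 5 (all-reduced with ncclSum in the kernel)
  }
  const float inv = 1.0f / sum_exp;  // matches ReciprocalKernel
  for (float& v : out) v *= inv;     // matches the broadcast multiply
  return out;
}
```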