Skip to content

Commit 82e84fb

Browse files
authored
[NPU] aclnn huber_loss (PaddlePaddle#1205)
1 parent 7cc0151 commit 82e84fb

File tree

1 file changed

+87
-17
lines changed

1 file changed

+87
-17
lines changed

backends/npu/kernels/huber_loss_kernel.cc

Lines changed: 87 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -17,16 +17,33 @@
1717

1818
namespace custom_kernel {
1919

20+
// Forward declarations of kernels implemented elsewhere in this backend.
// The huber_loss helpers below delegate to these instead of issuing raw
// aclop NpuOpRunner calls, so they pick up the aclnn fast path when available.

// out = x - y, elementwise.
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
                    const phi::DenseTensor& x,
                    const phi::DenseTensor& y,
                    phi::DenseTensor* out);

// out = x * scaling (scalar multiply; matches the NPU "Muls" op).
template <typename T, typename Context>
void MulsKernel(const Context& dev_ctx,
                const phi::DenseTensor& x,
                const float scaling,
                phi::DenseTensor* out);

// out = tensor with x's shape, filled with `val`, using element type `dtype`.
template <typename T, typename Context>
void FullLikeKernel(const Context& dev_ctx,
                    const phi::DenseTensor& x,
                    const phi::Scalar& val,
                    phi::DataType dtype,
                    phi::DenseTensor* out);
38+
2039
// Helper: z = x - y, elementwise. z is resized to x's shape here; output
// allocation is presumably done inside SubtractKernel (it was done locally
// before this refactor) — TODO(review): confirm SubtractKernel allocs z.
template <typename T, typename Context>
void HuberLossSub(const Context& dev_ctx,
                  const phi::DenseTensor* x,
                  const phi::DenseTensor* y,
                  phi::DenseTensor* z) {
  // Calculate z = x - y
  z->Resize(x->dims());
  custom_kernel::SubtractKernel<T, Context>(dev_ctx, *x, *y, z);
}
3148

3249
// Helper: y = x * scalar, elementwise scalar multiply via MulsKernel.
// NOTE(review): the pre-existing comment said "y = x + scale", but the
// delegate is MulsKernel (NPU "Muls" = multiply by scalar); behavior is
// unchanged, only the comment is corrected.
template <typename T, typename Context>
void HuberLossMuls(const Context& dev_ctx,
                   const phi::DenseTensor* x,
                   float scalar,
                   phi::DenseTensor* y) {
  // Calculate y = x * scalar
  y->Resize(x->dims());
  custom_kernel::MulsKernel<T, Context>(dev_ctx, *x, scalar, y);
}
4358

4459
// Helper: y = zeros with x's shape and dtype (FullLike with value 0).
template <typename T, typename Context>
void HuberLossZerosLike(const Context& dev_ctx,
                        const phi::DenseTensor* x,
                        phi::DenseTensor* y) {
  y->Resize(x->dims());
  // Zero of the kernel element type T; output keeps x's runtime dtype.
  phi::Scalar zeros = static_cast<T>(0);
  custom_kernel::FullLikeKernel<T, Context>(dev_ctx, *x, zeros, x->dtype(), y);
}
67+
68+
// Legacy aclop path for SmoothL1Loss forward, via NpuOpRunner. Kept as the
// fallback used by SmoothL1LossKernel's DO_COMPATIBILITY check.
template <typename T, typename Context>
void AclopSmoothL1LossKernel(const Context& dev_ctx,
                             const phi::DenseTensor* x,
                             const phi::DenseTensor* y,
                             float delta,
                             phi::DenseTensor* z) {
  dev_ctx.template Alloc<T>(z);
  // The huber `delta` threshold maps onto the op's "sigma" attribute.
  const auto& runner =
      NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}});
  runner.Run(dev_ctx.stream());
}
5379

80+
// SmoothL1Loss forward, preferring the aclnn API. DO_COMPATIBILITY dispatches
// to the aclop implementation (and returns) when aclnnSmoothL1Loss is not
// available in the runtime.
template <typename T, typename Context>
void SmoothL1LossKernel(const Context& dev_ctx,
                        const phi::DenseTensor* x,
                        const phi::DenseTensor* y,
                        float delta,
                        phi::DenseTensor* z) {
  DO_COMPATIBILITY(aclnnSmoothL1Loss,
                   (custom_kernel::AclopSmoothL1LossKernel<T, Context>(
                       dev_ctx, x, y, delta, z)));

  dev_ctx.template Alloc<T>(z);
  // reduction = 0 ("none"): keep the elementwise loss, no mean/sum collapse.
  int64_t reduction = 0;  // none
  EXEC_NPU_CMD(aclnnSmoothL1Loss, dev_ctx, *x, *y, reduction, delta, *z);
}
94+
5495
// Helper: z = SmoothL1Loss(x, y; delta), elementwise. Resizes z to x's shape
// and delegates to SmoothL1LossKernel (aclnn with aclop fallback).
template <typename T, typename Context>
void HuberLossSmoothL1Loss(const Context& dev_ctx,
                           const phi::DenseTensor* x,
                           const phi::DenseTensor* y,
                           float delta,
                           phi::DenseTensor* z) {
  z->Resize(x->dims());
  custom_kernel::SmoothL1LossKernel<T, Context>(dev_ctx, x, y, delta, z);
}
104+
105+
// Legacy aclop path for SmoothL1Loss backward, via NpuOpRunner. Kept as the
// fallback used by SmoothL1LossGrad's DO_COMPATIBILITY check.
// Inputs: pred (prediction), lab (label), dout (upstream gradient).
template <typename T, typename Context>
void AclopSmoothL1LossGrad(const Context& dev_ctx,
                           const phi::DenseTensor* pred,
                           const phi::DenseTensor* lab,
                           const phi::DenseTensor* dout,
                           float sigma,
                           phi::DenseTensor* grad) {
  dev_ctx.template Alloc<T>(grad);
  const auto& runner = NpuOpRunner(
      "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}});
  runner.Run(dev_ctx.stream());
}
66117

118+
// SmoothL1Loss backward, preferring the aclnn API. DO_COMPATIBILITY dispatches
// to the aclop implementation (and returns) when aclnnSmoothL1LossBackward is
// not available in the runtime.
template <typename T, typename Context>
void SmoothL1LossGrad(const Context& dev_ctx,
                      const phi::DenseTensor* pred,
                      const phi::DenseTensor* lab,
                      const phi::DenseTensor* dout,
                      float sigma,
                      phi::DenseTensor* grad) {
  DO_COMPATIBILITY(aclnnSmoothL1LossBackward,
                   (custom_kernel::AclopSmoothL1LossGrad<T, Context>(
                       dev_ctx, pred, lab, dout, sigma, grad)));

  dev_ctx.template Alloc<T>(grad);
  // reduction = 0 ("none"), matching the forward kernel.
  int64_t reduction = 0;  // none
  // NOTE: argument order differs from the aclop runner above — aclnn takes
  // the upstream gradient (dout) first, then prediction and label.
  EXEC_NPU_CMD(aclnnSmoothL1LossBackward,
               dev_ctx,
               *dout,
               *pred,
               *lab,
               reduction,
               sigma,
               *grad);
}
140+
67141
// Helper: grad = SmoothL1LossGrad(pred, lab, dout; sigma). Resizes grad to
// pred's shape and delegates to SmoothL1LossGrad (aclnn with aclop fallback).
// NOTE(review): the `lab`/`dout` parameters are hidden behind the diff hunk
// header here and are reconstructed from the call site — confirm against the
// full file.
template <typename T, typename Context>
void HuberLossSmoothL1LossGrad(const Context& dev_ctx,
                               const phi::DenseTensor* pred,
                               const phi::DenseTensor* lab,
                               const phi::DenseTensor* dout,
                               float sigma,
                               phi::DenseTensor* grad) {
  grad->Resize(pred->dims());
  custom_kernel::SmoothL1LossGrad<T, Context>(
      dev_ctx, pred, lab, dout, sigma, grad);
}
80152

81153
template <typename T, typename Context>
@@ -99,8 +171,6 @@ void HuberLossGradKernel(const Context& dev_ctx,
99171
float delta,
100172
phi::DenseTensor* dx,
101173
phi::DenseTensor* dy) {
102-
auto stream = dev_ctx.stream();
103-
104174
phi::DenseTensor t_grad_rd;
105175
if (dx || dy) {
106176
phi::DenseTensor t_zero;

0 commit comments

Comments (0)