17
17
18
18
namespace custom_kernel {
19
19
20
+ template <typename T, typename Context>
21
+ void SubtractKernel (const Context& dev_ctx,
22
+ const phi::DenseTensor& x,
23
+ const phi::DenseTensor& y,
24
+ phi::DenseTensor* out);
25
+
26
+ template <typename T, typename Context>
27
+ void MulsKernel (const Context& dev_ctx,
28
+ const phi::DenseTensor& x,
29
+ const float scaling,
30
+ phi::DenseTensor* out);
31
+
32
+ template <typename T, typename Context>
33
+ void FullLikeKernel (const Context& dev_ctx,
34
+ const phi::DenseTensor& x,
35
+ const phi::Scalar& val,
36
+ phi::DataType dtype,
37
+ phi::DenseTensor* out);
38
+
20
39
template <typename T, typename Context>
21
40
void HuberLossSub (const Context& dev_ctx,
22
41
const phi::DenseTensor* x,
23
42
const phi::DenseTensor* y,
24
43
phi::DenseTensor* z) {
25
44
// Calculate z = x - y
26
45
z->Resize (x->dims ());
27
- dev_ctx.template Alloc <T>(z);
28
- const auto & runner = NpuOpRunner (" Sub" , {*x, *y}, {*z}, {});
29
- runner.Run (dev_ctx.stream ());
46
+ custom_kernel::SubtractKernel<T, Context>(dev_ctx, *x, *y, z);
30
47
}
31
48
32
49
template <typename T, typename Context>
@@ -36,34 +53,91 @@ void HuberLossMuls(const Context& dev_ctx,
36
53
phi::DenseTensor* y) {
37
54
// Calculate y = x * scalar
38
55
y->Resize (x->dims ());
39
- dev_ctx.template Alloc <T>(y);
40
- const auto & runner = NpuOpRunner (" Muls" , {*x}, {*y}, {{" value" , scalar}});
41
- runner.Run (dev_ctx.stream ());
56
+ custom_kernel::MulsKernel<T, Context>(dev_ctx, *x, scalar, y);
42
57
}
43
58
44
59
template <typename T, typename Context>
45
60
void HuberLossZerosLike (const Context& dev_ctx,
46
61
const phi::DenseTensor* x,
47
62
phi::DenseTensor* y) {
48
63
y->Resize (x->dims ());
49
- dev_ctx.template Alloc <T>(y);
50
- const auto & runner = NpuOpRunner (" ZerosLike" , {*x}, {*y}, {});
64
+ phi::Scalar zeros = static_cast <T>(0 );
65
+ custom_kernel::FullLikeKernel<T, Context>(dev_ctx, *x, zeros, x->dtype (), y);
66
+ }
67
+
68
+ template <typename T, typename Context>
69
+ void AclopSmoothL1LossKernel (const Context& dev_ctx,
70
+ const phi::DenseTensor* x,
71
+ const phi::DenseTensor* y,
72
+ float delta,
73
+ phi::DenseTensor* z) {
74
+ dev_ctx.template Alloc <T>(z);
75
+ const auto & runner =
76
+ NpuOpRunner (" SmoothL1Loss" , {*x, *y}, {*z}, {{" sigma" , delta}});
51
77
runner.Run (dev_ctx.stream ());
52
78
}
53
79
80
+ template <typename T, typename Context>
81
+ void SmoothL1LossKernel (const Context& dev_ctx,
82
+ const phi::DenseTensor* x,
83
+ const phi::DenseTensor* y,
84
+ float delta,
85
+ phi::DenseTensor* z) {
86
+ DO_COMPATIBILITY (aclnnSmoothL1Loss,
87
+ (custom_kernel::AclopSmoothL1LossKernel<T, Context>(
88
+ dev_ctx, x, y, delta, z)));
89
+
90
+ dev_ctx.template Alloc <T>(z);
91
+ int64_t reduction = 0 ; // none
92
+ EXEC_NPU_CMD (aclnnSmoothL1Loss, dev_ctx, *x, *y, reduction, delta, *z);
93
+ }
94
+
54
95
template <typename T, typename Context>
55
96
void HuberLossSmoothL1Loss (const Context& dev_ctx,
56
97
const phi::DenseTensor* x,
57
98
const phi::DenseTensor* y,
58
99
float delta,
59
100
phi::DenseTensor* z) {
60
101
z->Resize (x->dims ());
61
- dev_ctx.template Alloc <T>(z);
62
- const auto & runner =
63
- NpuOpRunner (" SmoothL1Loss" , {*x, *y}, {*z}, {{" sigma" , delta}});
102
+ custom_kernel::SmoothL1LossKernel<T, Context>(dev_ctx, x, y, delta, z);
103
+ }
104
+
105
+ template <typename T, typename Context>
106
+ void AclopSmoothL1LossGrad (const Context& dev_ctx,
107
+ const phi::DenseTensor* pred,
108
+ const phi::DenseTensor* lab,
109
+ const phi::DenseTensor* dout,
110
+ float sigma,
111
+ phi::DenseTensor* grad) {
112
+ dev_ctx.template Alloc <T>(grad);
113
+ const auto & runner = NpuOpRunner (
114
+ " SmoothL1LossGrad" , {*pred, *lab, *dout}, {*grad}, {{" sigma" , sigma}});
64
115
runner.Run (dev_ctx.stream ());
65
116
}
66
117
118
+ template <typename T, typename Context>
119
+ void SmoothL1LossGrad (const Context& dev_ctx,
120
+ const phi::DenseTensor* pred,
121
+ const phi::DenseTensor* lab,
122
+ const phi::DenseTensor* dout,
123
+ float sigma,
124
+ phi::DenseTensor* grad) {
125
+ DO_COMPATIBILITY (aclnnSmoothL1LossBackward,
126
+ (custom_kernel::AclopSmoothL1LossGrad<T, Context>(
127
+ dev_ctx, pred, lab, dout, sigma, grad)));
128
+
129
+ dev_ctx.template Alloc <T>(grad);
130
+ int64_t reduction = 0 ; // none
131
+ EXEC_NPU_CMD (aclnnSmoothL1LossBackward,
132
+ dev_ctx,
133
+ *dout,
134
+ *pred,
135
+ *lab,
136
+ reduction,
137
+ sigma,
138
+ *grad);
139
+ }
140
+
67
141
template <typename T, typename Context>
68
142
void HuberLossSmoothL1LossGrad (const Context& dev_ctx,
69
143
const phi::DenseTensor* pred,
@@ -72,10 +146,8 @@ void HuberLossSmoothL1LossGrad(const Context& dev_ctx,
72
146
float sigma,
73
147
phi::DenseTensor* grad) {
74
148
grad->Resize (pred->dims ());
75
- dev_ctx.template Alloc <T>(grad);
76
- const auto & runner = NpuOpRunner (
77
- " SmoothL1LossGrad" , {*pred, *lab, *dout}, {*grad}, {{" sigma" , sigma}});
78
- runner.Run (dev_ctx.stream ());
149
+ custom_kernel::SmoothL1LossGrad<T, Context>(
150
+ dev_ctx, pred, lab, dout, sigma, grad);
79
151
}
80
152
81
153
template <typename T, typename Context>
@@ -99,8 +171,6 @@ void HuberLossGradKernel(const Context& dev_ctx,
99
171
float delta,
100
172
phi::DenseTensor* dx,
101
173
phi::DenseTensor* dy) {
102
- auto stream = dev_ctx.stream ();
103
-
104
174
phi::DenseTensor t_grad_rd;
105
175
if (dx || dy) {
106
176
phi::DenseTensor t_zero;
0 commit comments