
Commit 1ce9293

[npu] add box coder
1 parent ec2f68e commit 1ce9293

File tree

3 files changed
+633 -1 lines changed

paddle/fluid/operators/detection/CMakeLists.txt

+6 -1
@@ -15,8 +15,13 @@ function(detection_library TARGET_NAME)
       PARENT_SCOPE)
 endfunction()
 
+if (WITH_ASCEND_CL)
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc)
+else()
+  detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+endif()
+
 detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
-detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
 detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
 detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
 detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu)

paddle/fluid/operators/detection/box_coder_op_npu.cc
+375
@@ -0,0 +1,375 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/detection/box_coder_op.h"
#include <string>
#include <vector>
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

// Helpers that wrap single NPU ops (Adds, Mul, MatMul, ...) behind small
// Tensor-in/Tensor-out methods, all launched through NpuOpRunner on the
// current NPU stream.
template <typename T>
struct BoxCoderFunction {
 public:
  explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
    place = ctx.GetPlace();
    stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
                 .stream();
  }
  Tensor Adds(const Tensor& x, float scalar) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}});
    runner.Run(stream);
    return y;
  }
  Tensor Muls(const Tensor& x, float scalar) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}});
    runner.Run(stream);
    return y;
  }
  Tensor Mul(const Tensor& x, const Tensor& y) {
    Tensor z;
    z.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {});
    runner.Run(stream);
    return z;
  }
  Tensor SubWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    z.mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {});
    runner.Run(stream);
    return z;
  }
  void DivWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor DivWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    DivWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  void MulWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor MulWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    MulWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  void AddWithBroadCastVoid(const Tensor& x, const Tensor& y,
                            const framework::DDim& shape, Tensor* z) {
    z->mutable_data<T>(shape, place);
    const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {});
    runner.Run(stream);
  }
  Tensor AddWithBroadCast(const Tensor& x, const Tensor& y,
                          const framework::DDim& shape) {
    Tensor z;
    AddWithBroadCastVoid(x, y, shape, &z);
    return z;
  }
  Tensor Abs(const Tensor& x) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Abs", {x}, {y}, {});
    runner.Run(stream);
    return y;
  }
  // Computes log(x) as Log1p(x - 1), i.e. log(1 + (x - 1)).
  Tensor Log(const Tensor& x) {
    Tensor t_x_m1 = Adds(x, -1);
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {});
    runner.Run(stream);
    return y;
  }
  Tensor Exp(const Tensor& x) {
    Tensor y;
    y.mutable_data<T>(x.dims(), place);
    const auto& runner = NpuOpRunner("Exp", {x}, {y}, {});
    runner.Run(stream);
    return y;
  }
  // 2-D matrix multiply: [m, k] x [k, n] -> [m, n].
  Tensor Dot(const Tensor& x, const Tensor& y) {
    auto dim_x = x.dims();
    auto dim_y = y.dims();
    PADDLE_ENFORCE_EQ(
        dim_x.size(), 2,
        platform::errors::InvalidArgument(
            "x should be a 2-dim tensor, but got %d-dim.", dim_x.size()));
    PADDLE_ENFORCE_EQ(
        dim_y.size(), 2,
        platform::errors::InvalidArgument(
            "y should be a 2-dim tensor, but got %d-dim.", dim_y.size()));
    PADDLE_ENFORCE_EQ(
        dim_x[1], dim_y[0],
        platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but "
                                          "got dim_x[1] = %d, dim_y[0] = %d.",
                                          dim_x[1], dim_y[0]));
    Tensor z;
    z.mutable_data<T>({dim_x[0], dim_y[1]}, place);
    const auto& runner =
        NpuOpRunner("MatMul", {x, y}, {z},
                    {{"transpose_x1", false}, {"transpose_x2", false}});
    runner.Run(stream);
    return z;
  }
  void ConcatVoid(const std::vector<Tensor>& inputs,
                  const framework::DDim& shape_out, int axis, Tensor* output) {
    output->mutable_data<T>(shape_out, place);
    std::vector<std::string> names;
    for (size_t i = 0; i < inputs.size(); i++) {
      names.push_back("x" + std::to_string(i));
    }
    NpuOpRunner runner{
        "ConcatD",
        {inputs},
        {*output},
        {{"concat_dim", axis}, {"N", static_cast<int>(inputs.size())}}};
    runner.AddInputNames(names);
    runner.Run(stream);
  }
  Tensor Concat(const std::vector<Tensor>& inputs,
                const framework::DDim& shape_out, int axis) {
    Tensor output;
    ConcatVoid(inputs, shape_out, axis, &output);
    return output;
  }
  Tensor Slice(const Tensor& x, const std::vector<int>& offsets,
               const std::vector<int>& size, const framework::DDim& shape) {
    Tensor y;
    y.mutable_data<T>(shape, place);
    const auto& runner =
        NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}});
    runner.Run(stream);
    return y;
  }

 private:
  platform::Place place;
  aclrtStream stream;
  const framework::ExecutionContext& ctx;
};

template <typename T>
void Vector2Tensor(const framework::ExecutionContext& ctx,
                   const std::vector<T>& vec, const framework::DDim& ddim,
                   Tensor* tsr) {
  // Copy a small host vector to the device and reshape it; the Wait() makes
  // sure the copy has finished before the tensor feeds later ops.
  framework::TensorFromVector<T>(vec, ctx.device_context(), tsr);
  ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
  tsr->Resize(ddim);
}
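
// Note on the construction shared by BoxCoderEnc and BoxCoderDec below: both
// turn corner-form boxes [xmin, ymin, xmax, ymax] into center/size form by
// right-multiplying the [n, 4] box matrix with two constant 4x2 matrices
// (m_aver yields centers, m_diff yields sizes); a standalone sketch after
// this diff works through one example.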

template <typename T>
void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb,
                 const Tensor* pb, const Tensor* pbv, const bool norm,
                 const std::vector<float>& variance, Tensor* out) {
  auto M = pb->dims()[0];
  auto N = tb->dims()[0];
  auto shape_0 = framework::make_ddim({4, 2});
  Tensor m_diff;
  Tensor m_aver;
  std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(-1),
                             static_cast<T>(1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(1)};
  std::vector<T> vec_aver = {static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5),
                             static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5)};
  Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
  Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);

  BoxCoderFunction<T> F(ctx);
  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
  Tensor tb_xy = F.Dot(*tb, m_aver);
  Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1));

  pb_xy.Resize({1, M, 2});
  pb_wh.Resize({1, M, 2});
  tb_xy.Resize({N, 1, 2});
  tb_wh.Resize({N, 1, 2});

  auto shape_half = framework::make_ddim({N, M, 2});
  auto shape_full = framework::make_ddim({N, M, 4});

  Tensor out_xy_0 = F.DivWithBroadCast(
      F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half);
  Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half)));
  Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2);

  if (pbv) {
    F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out);
  } else {
    Tensor t_var;
    std::vector<T> vec_var(4);
    for (auto i = 0; i < 4; i++) {
      vec_var[i] = static_cast<T>(variance[i]);
    }
    Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var);
    F.DivWithBroadCastVoid(out_0, t_var, shape_full, out);
  }
}
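
// Recap of the encode arithmetic above (restated from the code): with prior
// center/size (p_xy, p_wh), target center/size (t_xy, t_wh) and variance v,
//   out_xy = ((t_xy - p_xy) / p_wh) / v_xy
//   out_wh = log(|t_wh / p_wh|) / v_wh
// When boxes are not normalized, 0.5 is added to the prior center and 1 to
// both sizes before encoding.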

template <typename T>
void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb,
                 const Tensor* pb, const Tensor* pbv, const bool norm,
                 const std::vector<float>& variance, int axis, Tensor* out) {
  auto shape_0 = framework::make_ddim({4, 2});
  Tensor m_diff;
  Tensor m_aver;
  std::vector<T> vec_diff = {static_cast<T>(-1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(-1),
                             static_cast<T>(1), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(1)};
  std::vector<T> vec_aver = {static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5),
                             static_cast<T>(0.5), static_cast<T>(0),
                             static_cast<T>(0), static_cast<T>(0.5)};
  Vector2Tensor<T>(ctx, vec_diff, shape_0, &m_diff);
  Vector2Tensor<T>(ctx, vec_aver, shape_0, &m_aver);

  BoxCoderFunction<T> F(ctx);
  Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5));
  Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1));
  auto pb_resize_shape = axis == 0
                             ? framework::make_ddim({1, pb->dims()[0], 2})
                             : framework::make_ddim({pb->dims()[0], 1, 2});
  pb_xy.Resize(pb_resize_shape);
  pb_wh.Resize(pb_resize_shape);

  auto tbox_slice_shape =
      framework::make_ddim({tb->dims()[0], tb->dims()[1], 2});
  std::vector<int> tbox_slice_size = {static_cast<int>(tb->dims()[0]),
                                      static_cast<int>(tb->dims()[1]), 2};
  Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape);
  Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape);

  Tensor tb_xy;
  Tensor tb_wh;
  if (pbv) {
    auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2});
    auto pbvt_resize_shape = axis == 0
                                 ? framework::make_ddim({1, pbv->dims()[0], 2})
                                 : framework::make_ddim({pbv->dims()[0], 1, 2});
    std::vector<int> pbvt_slice_size = {static_cast<int>(pbv->dims()[0]), 2};
    Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape);
    Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape);
    pbv_t01.Resize(pbvt_resize_shape);
    pbv_t23.Resize(pbvt_resize_shape);

    F.AddWithBroadCastVoid(
        F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape),
        pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(
        F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh,
        tbox_slice_shape, &tb_wh);
  } else if (variance.empty()) {
    F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape),
                           pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh);
  } else {
    Tensor t_var01, t_var23;
    auto t_var_shape = framework::make_ddim({1, 1, 2});
    std::vector<T> vec_var01 = {static_cast<T>(variance[0]),
                                static_cast<T>(variance[1])};
    std::vector<T> vec_var23 = {static_cast<T>(variance[2]),
                                static_cast<T>(variance[3])};
    Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01);
    Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23);
    F.AddWithBroadCastVoid(
        F.MulWithBroadCast(tbox01,
                           F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape),
                           tbox_slice_shape),
        pb_xy, tbox_slice_shape, &tb_xy);
    F.MulWithBroadCastVoid(
        F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh,
        tbox_slice_shape, &tb_wh);
  }
  Tensor obox01 =
      F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape);
  Tensor obox23 =
      F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape),
             (norm ? 0 : -1));
  F.ConcatVoid({obox01, obox23}, out->dims(), 2, out);
}
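
// Recap of the decode arithmetic above: with target deltas (d_xy, d_wh),
// prior center/size (p_xy, p_wh) and variance v (per-prior PriorBoxVar, the
// 4-element attribute, or implicitly 1 when neither is given),
//   t_xy = d_xy * v_xy * p_wh + p_xy
//   t_wh = exp(d_wh * v_wh) * p_wh
// and the output corners are t_xy -/+ t_wh / 2, minus 1 on the max corner
// when boxes are not normalized.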

template <typename T>
class BoxCoderNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* prior_box = ctx.Input<Tensor>("PriorBox");
    auto* prior_box_var = ctx.Input<Tensor>("PriorBoxVar");
    auto* target_box = ctx.Input<framework::LoDTensor>("TargetBox");
    auto* output_box = ctx.Output<Tensor>("OutputBox");
    std::vector<float> variance = ctx.Attr<std::vector<float>>("variance");
    const int axis = ctx.Attr<int>("axis");

    if (prior_box_var) {
      PADDLE_ENFORCE_EQ(variance.empty(), true,
                        platform::errors::InvalidArgument(
                            "Input 'PriorBoxVar' and attribute 'variance'"
                            " of BoxCoder operator should not be used at the "
                            "same time."));
    }
    if (!(variance.empty())) {
      PADDLE_ENFORCE_EQ(static_cast<int>(variance.size()), 4,
                        platform::errors::InvalidArgument(
                            "Size of attribute 'variance' in BoxCoder operator"
                            " should be 4. But received size is %d",
                            variance.size()));
    }

    if (target_box->lod().size()) {
      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                        platform::errors::InvalidArgument(
                            "Input 'TargetBox' of BoxCoder operator only"
                            " supports LoD with one level."));
    }

    auto code_type = GetBoxCodeType(ctx.Attr<std::string>("code_type"));
    bool normalized = ctx.Attr<bool>("box_normalized");

    if (code_type == BoxCodeType::kEncodeCenterSize) {
      BoxCoderEnc<T>(ctx, target_box, prior_box, prior_box_var, normalized,
                     variance, output_box);
    } else {
      BoxCoderDec<T>(ctx, target_box, prior_box, prior_box_var, normalized,
                     variance, axis, output_box);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel<float>,
                       ops::BoxCoderNPUKernel<plat::float16>);
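
To make the m_aver / m_diff construction concrete, here is a minimal standalone sketch of the corner-to-center/size conversion the kernel performs with MatMul. It is plain C++, independent of Paddle and the NPU runtime, and the box values are made up for illustration:

// Standalone illustration of F.Dot(box, m_aver) and F.Dot(box, m_diff)
// for a single [xmin, ymin, xmax, ymax] row.
#include <cstdio>

int main() {
  const float box[4] = {2.f, 3.f, 6.f, 11.f};  // xmin, ymin, xmax, ymax
  // The 4x2 matrices built from vec_aver and vec_diff in the kernel.
  const float m_aver[4][2] = {
      {0.5f, 0.f}, {0.f, 0.5f}, {0.5f, 0.f}, {0.f, 0.5f}};
  const float m_diff[4][2] = {
      {-1.f, 0.f}, {0.f, -1.f}, {1.f, 0.f}, {0.f, 1.f}};
  float center[2] = {0.f, 0.f};
  float size[2] = {0.f, 0.f};
  for (int j = 0; j < 2; ++j) {
    for (int i = 0; i < 4; ++i) {
      center[j] += box[i] * m_aver[i][j];  // (xmin+xmax)/2, (ymin+ymax)/2
      size[j] += box[i] * m_diff[i][j];    // xmax-xmin, ymax-ymin
    }
  }
  // Prints center=(4, 7) size=(4, 8).
  printf("center=(%g, %g) size=(%g, %g)\n", center[0], center[1], size[0],
         size[1]);
  return 0;
}

Batching n boxes as an [n, 4] matrix turns both conversions into a single MatMul each, which is why the kernel materializes the two matrices once via Vector2Tensor instead of slicing and recombining individual coordinates.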
