Skip to content

Commit 7cfddf2

Browse files
liujianhang-design authored and luotao1 committed
Optimize bilinear interpolate op with OpenMP (#17800)
Refactor the code to be OpenMP friendly test=develop
1 parent d6d33fd commit 7cfddf2

File tree

1 file changed

+56
-16
lines changed

1 file changed

+56
-16
lines changed

paddle/fluid/operators/interpolate_op.h

Lines changed: 56 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
#pragma once
1313
#include <string>
14+
#include <vector>
1415
#include "paddle/fluid/framework/op_registry.h"
1516
#include "paddle/fluid/operators/math/math_function.h"
1617

@@ -57,32 +58,71 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output,
5758
auto input_t = EigenTensor<T, 4>::From(input);
5859
auto output_t = EigenTensor<T, 4>::From(*output);
5960
bool align_flag = (align_mode == 0 && !align_corners);
60-
for (int k = 0; k < out_h; k++) { // loop for images
61+
62+
std::vector<int> vy_n, vy_s;
63+
std::vector<float> vd_n, vd_s;
64+
vy_n.reserve(out_h);
65+
vy_s.reserve(out_h);
66+
vd_n.reserve(out_h);
67+
vd_s.reserve(out_h);
68+
#ifdef PADDLE_WITH_MKLML
69+
#pragma omp parallel for
70+
#endif
71+
for (int k = 0; k < out_h; k++) {
6172
int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
6273
: static_cast<int>(ratio_h * k);
6374
y_n = (y_n > 0) ? y_n : 0;
6475
int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
6576
float d_n =
6677
align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n;
6778
float d_s = 1.f - d_n;
79+
{
80+
vy_n[k] = y_n;
81+
vy_s[k] = y_s;
82+
vd_n[k] = d_n;
83+
vd_s[k] = d_s;
84+
}
85+
}
6886

69-
for (int l = 0; l < out_w; l++) {
70-
int x_w = (align_mode == 0 && !align_corners)
71-
? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
72-
: static_cast<int>(ratio_w * l);
73-
x_w = (x_w > 0) ? x_w : 0;
74-
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
75-
float d_w =
76-
align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
77-
float d_e = 1.f - d_w;
87+
std::vector<int> vx_w, vx_e;
88+
std::vector<float> vd_w, vd_e;
89+
vx_w.reserve(out_w);
90+
vx_e.reserve(out_w);
91+
vd_w.reserve(out_w);
92+
vd_e.reserve(out_w);
93+
#ifdef PADDLE_WITH_MKLML
94+
#pragma omp parallel for
95+
#endif
96+
for (int l = 0; l < out_w; l++) {
97+
int x_w = (align_mode == 0 && !align_corners)
98+
? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
99+
: static_cast<int>(ratio_w * l);
100+
x_w = (x_w > 0) ? x_w : 0;
101+
int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
102+
float d_w =
103+
align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w;
104+
float d_e = 1.f - d_w;
105+
{
106+
vx_w[l] = x_w;
107+
vx_e[l] = x_e;
108+
vd_w[l] = d_w;
109+
vd_e[l] = d_e;
110+
}
111+
}
78112

79-
for (int i = 0; i < n; i++) { // loop for batches
80-
for (int j = 0; j < c; j++) { // loop for channels
113+
#ifdef PADDLE_WITH_MKLML
114+
#pragma omp parallel for collapse(4)
115+
#endif
116+
for (int i = 0; i < n; i++) { // loop for batches
117+
for (int j = 0; j < c; j++) { // loop for channels
118+
for (int k = 0; k < out_h; k++) { // loop for images
119+
for (int l = 0; l < out_w; l++) {
81120
// bilinear interpolation
82-
output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e +
83-
input_t(i, j, y_s, x_w) * d_n * d_e +
84-
input_t(i, j, y_n, x_e) * d_s * d_w +
85-
input_t(i, j, y_s, x_e) * d_n * d_w;
121+
T out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
122+
input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
123+
input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
124+
input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
125+
output_t(i, j, k, l) = out_t;
86126
}
87127
}
88128
}

0 commit comments

Comments
 (0)