
Commit 1fb8214

cjldqingqing01 authored and committed

Optimization of kernels related to DeepLabv3+ (#13534)

* refine reduce by cub
* optimize KernelDepthwiseConvFilterGrad
* optimize depthwise conv and reduce mean and reduce sum
* fix bug: dilation
* cuda arch and cuda 8 compatible test=release/1.0.0
1 parent 8d16de7 commit 1fb8214

File tree

9 files changed (+817 -188 lines)


paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
     op_library(layer_norm_op DEPS cub)
+    op_library(reduce_mean_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()

paddle/fluid/operators/conv_op.h

Lines changed: 4 additions & 3 deletions
@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                  output);
   }
 };
 
@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, input_grad, static_cast<T>(0));
       depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, input_grad);
+                             paddings, dilations, input_grad);
     }
 
     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
       depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
     }
   }
 };
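The call sites above show the interface change: each depthwise-conv math functor now takes the dilations vector between paddings and the output (or gradient) tensor. For reference, a minimal sketch of the forward functor's updated signature, inferred from these call sites only; the actual declarations are updated in the math library, one of the nine changed files not shown in this excerpt.

// Sketch only: operator() signature inferred from the call sites in this diff.
template <typename DeviceContext, typename T>
class DepthwiseConvFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations,  // newly added argument
                  framework::Tensor* output);
};

The input-gradient and filter-gradient functors gain the same dilations parameter, as the DepthwiseConvGradKernel hunk above shows.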

paddle/fluid/operators/conv_transpose_op.h

Lines changed: 4 additions & 3 deletions
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     math::DepthwiseConvInputGradFunctor<DeviceContext, T>
         depthwiseConvInputGrad;
     depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           output);
+                           dilations, output);
   }
 };
 
@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
 
     if (input_grad) {
       math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
                     input_grad);
     }
 
@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
       math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
          depthwiseConvFilterGrad;
       depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
     }
   }
 };

paddle/fluid/operators/cub_reduce.h

Lines changed: 322 additions & 0 deletions (new file)
@@ -0,0 +1,322 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>

#include <cub/cub.cuh>  // NOLINT
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {

namespace detail {
template <typename T, size_t ElementCount>
struct Array {
 public:
  HOSTDEVICE inline Array() {}

  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }

  HOSTDEVICE inline const T& operator[](size_t index) const {
    return data_[index];
  }

  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }

  template <typename VectorLikeType>
  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
    size_t n = static_cast<size_t>(vec.size());
    Array<T, ElementCount> ret;
    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
    return ret;
  }

 private:
  T data_[ElementCount];
};

// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                               TransformOp transformer, Ty init,
                               int reduce_num) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  int idx_x = blockIdx.x * reduce_num;
  int idx_y = threadIdx.x;
  Ty reduce_var = init;
  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                             TransformOp transformer, Ty init, int reduce_num,
                             Array<int, Rank> x_strides,
                             Array<int, ReduceRank> reduce_dim,
                             Array<int, ReduceRank> reduce_strides,
                             Array<int, Rank - ReduceRank> left_dim,
                             Array<int, Rank - ReduceRank> left_strides) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  Array<int, Rank> sub_index;
  int left_idx = blockIdx.x;
  for (int i = 0; i < Rank - ReduceRank; ++i) {
    sub_index[left_dim[i]] = left_idx / left_strides[i];
    left_idx %= left_strides[i];
  }

  int reduce_idx = threadIdx.x;
  for (int j = 0; j < ReduceRank; ++j) {
    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
    reduce_idx %= reduce_strides[j];
  }

  int idx_x = 0;
  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));

  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
    int reduce_idx = i;
    for (int j = 0; j < ReduceRank; ++j) {
      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
      reduce_idx %= reduce_strides[j];
    }

    int idx_x = 0;
    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
  }

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
  int n = static_cast<int>(dims.size());
  if (n == 0) return std::vector<int>();
  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims,
                                          const std::vector<int>& idx) {
  int n = static_cast<int>(idx.size());
  if (n == 0) return std::vector<int>();
  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[idx[i + 1]];
  }
  return strides;
}

constexpr int kMaxBlockDim = 512;

static inline int GetDesiredBlockDim(int block_dim) {
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << static_cast<int>(std::log2(block_dim)));
}

template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
          typename TransformOp>
static void TensorReduceImpl(
    const Tx* x_data, Ty* y_data, const platform::Place& place,
    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
    int left_num, int reduce_num, const std::vector<int>& x_strides,
    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
    cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...)             \
  case i: {                               \
    constexpr auto kRank = i;             \
    switch (reduce_rank) { __VA_ARGS__; } \
  } break

#define CUB_REDUCE_RANK_CASE(i, ...)                              \
  case i: {                                                       \
    constexpr auto kReduceRank = i;                               \
    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
        x_data, y_data, reducer, transformer, init, reduce_num,   \
        Array<int, kRank>::From(x_strides),                       \
        Array<int, kReduceRank>::From(reduce_dim),                \
        Array<int, kReduceRank>::From(reduce_strides),            \
        Array<int, kRank - kReduceRank>::From(left_dim),          \
        Array<int, kRank - kReduceRank>::From(left_strides));     \
  } break

  int rank = x_strides.size();
  int reduce_rank = reduce_strides.size();
  if (rank == reduce_rank) {
    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
        x_data, transformer);
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    framework::Tensor tmp;
    auto* temp_storage = tmp.mutable_data<uint8_t>(
        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
        place);
    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    return;
  }
  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
        x_data, y_data, reducer, transformer, init, reduce_num);
    return;
  }
  /*
  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
    // Currently, it is handled by code below, but inefficient
    return;
  }
  */

  switch (rank) {
    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););

    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););

    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3););

    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););

    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5););

    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
  }

#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}

}  // namespace detail

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
                  std::vector<int> origin_reduce_dims, const Ty& init,
                  const ReduceOp& reducer, const TransformOp& transformer,
                  cudaStream_t stream) {
  auto x_dim = framework::vectorize2int(x.dims());
  std::vector<int> new_x_dim, new_reduce_dims;
  int is_reduced = 0;
  for (auto e : origin_reduce_dims) {
    auto pos = e >= 0 ? e : e + x_dim.size();
    is_reduced |= 1 << e;
  }
  for (int i = 0; i < x_dim.size(); i++) {
    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
      new_x_dim.push_back(x_dim[i]);
      if ((is_reduced >> i) & 1)
        new_reduce_dims.push_back(new_x_dim.size() - 1);
    } else {
      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
    }
  }
  x_dim = new_x_dim;
  origin_reduce_dims = new_reduce_dims;
  int x_rank = static_cast<int>(x_dim.size());
  std::set<int> left_set, reduce_set;
  for (int i = 0; i < x_rank; ++i) left_set.insert(i);

  for (auto e : origin_reduce_dims) {
    left_set.erase(e);
    reduce_set.insert(e);
  }

  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
  std::vector<int> left_dim(left_set.begin(), left_set.end());

  std::vector<int> x_strides = detail::GetStrides(x_dim);
  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
  int left_num = 1;
  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];

  std::vector<int> y_dim(left_dim.size());
  for (int i = 0; i < left_dim.size(); ++i) {
    y_dim[i] = x_dim[left_dim[i]];
  }
  auto x_data = x.data<Tx>();
  auto y_data = y->mutable_data<Ty>(x.place());
  if (reduce_num == 1) return;

#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
  case block_dim: {                                                      \
    constexpr auto kBlockDim = block_dim;                                \
    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
        left_strides, stream);                                           \
  } break

  switch (detail::GetDesiredBlockDim(reduce_num)) {
    CUB_BLOCK_DIM_CASE(512);
    CUB_BLOCK_DIM_CASE(256);
    CUB_BLOCK_DIM_CASE(128);
    CUB_BLOCK_DIM_CASE(64);
    CUB_BLOCK_DIM_CASE(32);
    CUB_BLOCK_DIM_CASE(16);
    CUB_BLOCK_DIM_CASE(8);
    CUB_BLOCK_DIM_CASE(4);
    CUB_BLOCK_DIM_CASE(2);
  }
#undef CUB_BLOCK_DIM_CASE
}

}  // namespace operators
}  // namespace paddle
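For reference, a minimal sketch of how a CUDA kernel could call the new TensorReduce helper, here summing over selected axes with cub::Sum and an identity transform. The IdentityFunctor and SumReduce names are illustrative only; the actual reduce_sum/reduce_mean CUDA kernels that use this header live in the other changed files, which are not part of this excerpt. Note that the output tensor's shape must already be set, since TensorReduce only allocates it via mutable_data.

#include <vector>
#include "paddle/fluid/operators/cub_reduce.h"

// Illustrative transform functor: passes each element through unchanged.
// A mean reduction would instead divide by the number of reduced elements.
template <typename T>
struct IdentityFunctor {
  HOSTDEVICE inline T operator()(const T& x) const { return x; }
};

// Illustrative helper: sum-reduce `x` over `reduce_dims` into `y` on `stream`.
// `y` must already have its output shape set before the call.
template <typename T>
void SumReduce(const paddle::framework::Tensor& x,
               paddle::framework::Tensor* y, std::vector<int> reduce_dims,
               cudaStream_t stream) {
  paddle::operators::TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
      x, y, reduce_dims, static_cast<T>(0), cub::Sum(), IdentityFunctor<T>(),
      stream);
}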
