From 768cf55f38f97fbc47682a84662d7bd376c28c6c Mon Sep 17 00:00:00 2001
From: superjomn
Date: Tue, 29 May 2018 17:33:40 +0800
Subject: [PATCH 1/7] init

---
 .../fluid/inference/tensorrt/convert/fc_op.cc | 44 +++++++++++++++++++
 .../inference/tensorrt/convert/mul_op.cc      |  2 +-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/fc_op.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 00000000000000..a949faa4b20919
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class FcOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op) override {
+    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr, nullptr);
+
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+
+    // Create weights
+    TensorRTEngine::Weight(nvinfer1::DataType::kFLOAT, )
+
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, FullyConnected, *const_cast<nvinfer1::ITensor*>(input1),
+)
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index ed09f54bde00d1..3c1536bb240441 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
  public:
   MulOpConverter() {}
   void operator()(const framework::proto::OpDesc& op) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
     framework::OpDesc op_desc(op, nullptr, nullptr);
 
     // Declare inputs
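This first commit is a deliberate work-in-progress skeleton: the fc converter's weight handling and layer creation are left unfinished and get completed over the following patches. The shape a finished converter takes is worth spelling out once. A minimal sketch follows, using a hypothetical `relu` converter (`ReluOpConverter` is illustrative only and not part of this series; `DeclareOutput` enters the flow later, in patch 4):

    // Sketch: a hypothetical converter for a fluid `relu` op, following the
    // same pattern as FcOpConverter above.
    class ReluOpConverter : public OpConverter {
     public:
      void operator()(const framework::proto::OpDesc& op) override {
        framework::OpDesc op_desc(op, nullptr, nullptr);
        // Fetch the TRT tensor produced by an upstream layer (or the input).
        auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
        // Activation is a stock TRT layer type; kRELU selects the ReLU mode.
        auto* layer = TRT_ENGINE_ADD_LAYER(
            engine_, Activation, *const_cast<nvinfer1::ITensor*>(input),
            nvinfer1::ActivationType::kRELU);
        // Bind the layer's output back to the fluid variable name.
        engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]);
      }
    };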
From 7fdbd92ac22b319a9f67ebe8a030dbbd7daf196e Mon Sep 17 00:00:00 2001
From: superjomn
Date: Tue, 29 May 2018 21:14:12 +0800
Subject: [PATCH 2/7] init

---
 .../inference/tensorrt/convert/conv2d_op.cc   |  3 +-
 .../fluid/inference/tensorrt/convert/fc_op.cc | 39 +++++++++++-----
 .../inference/tensorrt/convert/mul_op.cc      |  3 +-
 .../inference/tensorrt/convert/op_converter.h | 38 +++++++++++-----
 .../inference/tensorrt/convert/test_fc_op.cc  | 44 +++++++++++++++++++
 .../inference/tensorrt/convert/test_mul_op.cc |  3 +-
 .../tensorrt/convert/test_op_converter.cc     |  7 ++-
 .../inference/tensorrt/convert/ut_helper.h    | 10 +++--
 8 files changed, 116 insertions(+), 31 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_fc_op.cc

diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 209936c3bafb0d..668d344f1bba1c 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,7 +21,8 @@ namespace tensorrt {
 class Conv2dOpConverter : public OpConverter {
  public:
   Conv2dOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
     LOG(INFO)
         << "convert a fluid conv2d op to tensorrt conv layer without bias";
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index a949faa4b20919..a86483059bccd8 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -12,33 +12,50 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+/*
+ * The FC converter converts a MUL op in Fluid to an FC layer in TRT.
+ */
 class FcOpConverter : public OpConverter {
  public:
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
     VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
 
     framework::OpDesc op_desc(op, nullptr, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
 
     // Declare inputs
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
-
-    // Create weights
-    TensorRTEngine::Weight(nvinfer1::DataType::kFLOAT, )
-
-
-    auto* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, FullyConnected, *const_cast<nvinfer1::ITensor*>(input1),
-)
+    auto* X = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    auto* weight_data = Y_t->mutable_data<float>(platform::CUDAPlace());
+
+    PADDLE_ENFORCE_EQ(Y_t->ddim().size(), 2UL);
+    size_t n_output = Y_t->ddim()[0];
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size()};
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
   }
 };
 
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3c1536bb240441..4248356869298d 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -24,7 +24,8 @@ namespace tensorrt {
 class MulOpConverter : public OpConverter {
  public:
   MulOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op) override {
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope) override {
     VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
     framework::OpDesc op_desc(op, nullptr, nullptr);
 
     // Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1cd3ed9a00acea..252f4e863d4d8d 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -31,27 +31,41 @@ namespace tensorrt {
 class OpConverter {
  public:
   OpConverter() {}
-  virtual void operator()(const framework::proto::OpDesc& op) {}
 
-  void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
-    std::string type = op.type();
-    auto* it = Registry<OpConverter>::Lookup(type);
+  // Converter logic for an op.
+  virtual void operator()(const framework::proto::OpDesc& op,
+                          const framework::Scope& scope) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  void ConvertOp(const framework::proto::OpDesc& op,
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine) {
+    framework::OpDesc op_desc(op, nullptr, nullptr);
+
+    OpConverter* it{nullptr};
+
+    if (op_desc.Type() == "mul") {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      auto& Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        it = Registry<OpConverter>::Lookup("fc");
+      }
+    }
+    if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+    }
     PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
     it->SetEngine(engine);
-    (*it)(op);
-  }
-
-  // convert fluid op to tensorrt layer
-  void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
-    OpConverter::Run(op, engine);
+    (*it)(op, scope);
   }
 
   // convert fluid block to tensorrt network
   void ConvertBlock(const framework::proto::BlockDesc& block,
-                    TensorRTEngine* engine) {
+                    const std::unordered_set<std::string>& parameters,
+                    const framework::Scope& scope, TensorRTEngine* engine) {
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
-      OpConverter::Run(op, engine);
+      ConvertOp(op, parameters, scope, engine);
     }
   }
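The special case in ConvertOp above is the heart of the fc support: a `mul` whose `Y` operand appears in `parameters` (i.e. it is a persistable weight rather than a runtime activation) is rerouted to the `fc` converter, while every other op is looked up by its own type name. From the caller's side, whole-block translation then reads roughly as follows (a sketch; `block_proto`, `scope`, and `engine` are assumed to be the block's proto, the scope holding the weights, and a live TensorRTEngine*, and the parameter name "fc.w_0" is only an example):

    std::unordered_set<std::string> parameters{"fc.w_0"};  // persistable weights
    OpConverter converter;
    converter.ConvertBlock(block_proto, parameters, scope, engine);
    engine->FreezeNetwork();  // seal the TRT network once all ops are converted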
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 00000000000000..a54b7a5192141e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+  std::unordered_set<std::string> parameters({"mul-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+
+  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  validator.SetOp(*desc.Proto());
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index d8b61d5f08ffd0..09b0c12543ef9f 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -21,7 +21,8 @@ namespace inference {
 namespace tensorrt {
 
 TEST(MulOpConverter, main) {
-  TRTConvertValidation validator(10, 1000);
+  framework::Scope scope;
+  TRTConvertValidation validator(10, {}, scope, 1000);
   validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
   validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9ae7de9cbfa656..1d3f5eabb2f839 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 
 namespace paddle {
 namespace inference {
@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
   conv2d_op->SetType("conv2d");
 
   OpConverter converter;
-  converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 37fcb5c50309db..1a5e4c20a91a91 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -58,7 +58,10 @@ class TRTConvertValidation {
  public:
   TRTConvertValidation() = delete;
 
-  TRTConvertValidation(int batch_size, int workspace_size = 1 << 10) {
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+                       framework::Scope& scope, int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
     // create engine.
     engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
     engine_->InitNetwork();
@@ -96,7 +99,7 @@ class TRTConvertValidation {
     op_ = framework::OpRegistry::CreateOp(desc);
 
     OpConverter op_converter;
-    op_converter.ConvertOp(desc, engine_.get());
+    op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
 
     engine_->FreezeNetwork();
 
@@ -146,9 +149,10 @@ class TRTConvertValidation {
  private:
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
-  framework::Scope scope_;
+  framework::Scope &scope_;
   std::unique_ptr<framework::OperatorBase> op_;
   std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
 };
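TRTConvertValidation now owns both halves of the comparison: SetOp creates the fluid operator and, through ConvertOp, the mirrored TRT layer; Execute (filled in during patch 4) runs both and checks the outputs element-wise. The byte count handed to SetInputFromCPU is simply the element count times `sizeof(float)`; `analysis::AccuDims` computes the product that the following sketch spells out by hand (assuming `dims` is the declared `nvinfer1::Dims`):

    // For a variable declared as Dims2(10, 6): 10 * 6 = 60 floats.
    size_t num_elements = 1;
    for (int i = 0; i < dims.nbDims; ++i) num_elements *= dims.d[i];
    size_t num_bytes = num_elements * sizeof(float);  // here, 240 bytes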
From 8df90968604266550414f029100e366a4f08542b Mon Sep 17 00:00:00 2001
From: superjomn
Date: Tue, 29 May 2018 21:51:35 +0800
Subject: [PATCH 3/7] init

---
 .../fluid/inference/tensorrt/convert/CMakeLists.txt | 2 ++
 paddle/fluid/inference/tensorrt/convert/fc_op.cc    | 11 ++++++-----
 .../fluid/inference/tensorrt/convert/op_converter.h | 2 +-
 paddle/fluid/inference/tensorrt/convert/ut_helper.h | 2 +-
 paddle/fluid/inference/tensorrt/engine.h            | 2 +-
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 5ada1d63126920..23ca8bfac84f35 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
 nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index a86483059bccd8..86228bf6bb578d 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/place.h"
@@ -40,17 +41,17 @@ class FcOpConverter : public OpConverter {
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
     auto* weight_data = Y_t->mutable_data<float>(platform::CUDAPlace());
 
-    PADDLE_ENFORCE_EQ(Y_t->ddim().size(), 2UL);
-    size_t n_output = Y_t->ddim()[0];
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);
+    size_t n_output = Y_t->dims()[0];
 
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size()};
     TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
 
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
-                                       *const_cast<nvinfer1::ITensor*>(X),
-                                       n_output, weight.get(), bias.get());
+    TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                         *const_cast<nvinfer1::ITensor*>(X), n_output,
+                         weight.get(), bias.get());
   }
 };
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 252f4e863d4d8d..b678fb865990fb 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -54,7 +54,7 @@ class OpConverter {
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_desc.Type());
     it->SetEngine(engine);
     (*it)(op, scope);
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 1a5e4c20a91a91..366b91cd9b319e 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -149,10 +149,10 @@ class TRTConvertValidation {
  private:
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
-  framework::Scope &scope_;
   std::unique_ptr<framework::OperatorBase> op_;
   std::unique_ptr<framework::OpDesc> op_desc_;
   const std::unordered_set<std::string>& parameters_;
+  framework::Scope &scope_;
 };
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b8298c6059e864..c7cb1e06632c7a 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -37,7 +37,7 @@ class TensorRTEngine : public EngineBase {
   // Weight is model parameter.
   class Weight {
    public:
-    Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
       w_.type = dtype;
       w_.values = value;
       w_.count = num_elem;
From 96fe6a1bc16297990617c717586fdced318faa08 Mon Sep 17 00:00:00 2001
From: superjomn
Date: Thu, 31 May 2018 08:29:33 +0800
Subject: [PATCH 4/7] fix compile error

---
 .../fluid/inference/tensorrt/convert/fc_op.cc | 43 ++++++++++++++-----
 .../inference/tensorrt/convert/op_converter.h |  5 ++-
 .../inference/tensorrt/convert/test_fc_op.cc  | 10 +++--
 .../inference/tensorrt/convert/test_mul_op.cc |  3 +-
 .../inference/tensorrt/convert/ut_helper.h    | 34 ++++++++++-----
 paddle/fluid/inference/tensorrt/engine.cc     |  2 +
 6 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 86228bf6bb578d..6335e9ce88ec27 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,26 +33,44 @@ class FcOpConverter : public OpConverter {
 
     framework::OpDesc op_desc(op, nullptr, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);     // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);  // Y is a weight
 
     // Declare inputs
-    auto* X = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+    // Declare weights
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    auto* weight_data = Y_t->mutable_data<float>(platform::CUDAPlace());
-
-    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);
-    size_t n_output = Y_t->dims()[0];
+    // This may trigger a CPU->GPU copy.
+    // TODO(Superjomn) use some smarter mutable_data.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
 
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
-                                  Y_t->memory_size()};
-    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+                                  Y_t->memory_size() / sizeof(float)};
+
+    // Currently, the framework can only handle one fluid op -> one TRT layer,
+    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+    // handle `mul`, leave `add` as another layer.
+    // DEBUG
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr,
+                                0};
 
-    TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
-                         *const_cast<nvinfer1::ITensor*>(X), n_output,
-                         weight.get(), bias.get());
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->DeclareOutput(layer, 0, output_name);
+    auto* output = engine_->GetITensor(output_name);
+    LOG(INFO) << "output dim";
+    for (int i = 0; i < output->getDimensions().nbDims; i++) {
+      LOG(INFO) << output->getDimensions().d[i];
+    }
   }
 };
@@ -60,3 +79,5 @@ REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index b678fb865990fb..4d21e241c0fe0a 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -46,7 +46,7 @@ class OpConverter {
 
     if (op_desc.Type() == "mul") {
       PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
-      auto& Y = op_desc.Input("Y")[0];
+      std::string Y = op_desc.Input("Y")[0];
       if (parameters.count(Y)) {
         it = Registry<OpConverter>::Lookup("fc");
       }
@@ -54,7 +54,8 @@ class OpConverter {
     if (!it) {
       it = Registry<OpConverter>::Lookup(op_desc.Type());
     }
-    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_desc.Type());
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
     it->SetEngine(engine);
     (*it)(op, scope);
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index a54b7a5192141e..9c4984598c12e1 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -23,11 +23,11 @@ namespace tensorrt {
 TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
-  TRTConvertValidation validator(10, parameters, scope, 1000);
+  TRTConvertValidation validator(20, parameters, scope, 1000);
 
-  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
-  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(12, 6, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(6, 3));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(12, 3));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -37,6 +37,8 @@ TEST(fc_op, test) {
   desc.SetOutput("Out", {"mul-Out"});
 
   validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
 }
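The reshaped fc test makes the converter's contract concrete: `mul-Out = mul-X * mul-Y`, with `n_output` now read from `Y.dims()[1]`. `DeclParamVar` registers the weight in the scope only; because "mul-Y" is listed in `parameters`, it is baked into the network rather than declared as a TRT input. `mul-X` gains trailing singleton dimensions because TRT's FullyConnected layer expects its per-sample input to carry trailing spatial dimensions. The shape arithmetic being validated, written out (plain math, not code from the patch):

    X: 12 x 6,  Y: 6 x 3
    Out[i][j] = sum over k of X[i][k] * Y[k][j]   ->   Out: 12 x 3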
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index 09b0c12543ef9f..1ce1130e5d660d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -22,7 +22,8 @@ namespace tensorrt {
 
 TEST(MulOpConverter, main) {
   framework::Scope scope;
-  TRTConvertValidation validator(10, {}, scope, 1000);
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
   validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
   validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 366b91cd9b319e..a338c37717f551 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -76,19 +76,22 @@ class TRTConvertValidation {
     engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
   }
 
+  // Declare a parameter variable in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
     DeclVar(name, dims);
   }
 
+  // Declare a variable in a fluid Scope.
   void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
 
     // Init Fluid tensor.
-    std::vector<int64_t> dim_vec(dims.nbDims);
-    for (int i = 0; i < dims.nbDims; i++) {
-      dim_vec[i] = dims.d[i];
-    }
+    std::vector<int64_t> dim_vec(dims.d, dims.d + dims.nbDims);
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -108,11 +111,17 @@ class TRTConvertValidation {
 
     // Set Inputs.
     for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
       auto* var = scope_.FindVar(input);
       PADDLE_ENFORCE(var);
       auto tensor = var->GetMutable<framework::LoDTensor>();
+      LOG(INFO) << "set input for TRT " << input;
+      LOG(INFO) << tensor->data<float>()[0];
+      LOG(INFO) << tensor->data<float>()[1];
+      LOG(INFO) << "set data size " << analysis::AccuDims(tensor->dims(), tensor->dims().size());
+
       engine_->SetInputFromCPU(
           input, static_cast<void*>(tensor->data<float>()),
           sizeof(float) *
               analysis::AccuDims(tensor->dims(), tensor->dims().size()));
     }
 
   void Execute(int batch_size) {
     // Execute Fluid Op
-    // Execute TRT
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
-    engine_->Execute(batch_size);
-
     op_->Run(scope_, place);
 
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
+
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(200);
+      std::vector<float> trt_out(200, 2008.);
+      LOG(INFO) << "get TRT output " << output;
       engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
 
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
       for (size_t i = 0; i < fluid_out.size(); i++) {
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
+        LOG(INFO) << fluid_out[i] << " " << trt_out[i];
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
       }
     }
   }
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index fb27c8394c1f94..44a3c714a13795 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
                     name);
   auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
   infer_network_->markOutput(*output);
@@ -136,6 +137,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   // determine data size
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
+  LOG(INFO) << "output size " << it->second;
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
   auto& buf = buffer(name);
From 5871e042e9aa241f586592e6ac79aa25a1375ac2 Mon Sep 17 00:00:00 2001
From: superjomn
Date: Thu, 31 May 2018 20:37:57 +0800
Subject: [PATCH 5/7] fix ut

---
 .../fluid/inference/tensorrt/convert/fc_op.cc | 45 ++++++++++++++---
 .../inference/tensorrt/convert/test_fc_op.cc  |  6 +--
 .../inference/tensorrt/convert/ut_helper.h    |  6 ---
 paddle/fluid/inference/tensorrt/engine.h      |  2 +
 4 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 6335e9ce88ec27..0a73d68e0fc980 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/place.h"
@@ -22,6 +23,29 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * istrides.h() + w * istrides.w()];
+    }
+  }
+}
+
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+                   TensorRTEngine::Weight* oweights) {
+  int c = iweights.dims[0];
+  int k = iweights.dims[1];
+  oweights->dims.assign({k, c});
+  nvinfer1::DimsHW istrides = {1, k};
+  nvinfer1::DimsHW ostrides = {c, 1};
+  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
+           static_cast<float*>(const_cast<void*>(oweights->get().values)),
+           ostrides);
+}
+
 /*
  * The FC converter converts a MUL op in Fluid to an FC layer in TRT.
  */
@@ -49,16 +73,28 @@ class FcOpConverter : public OpConverter {
     auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
     PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
     size_t n_output = Y_t->dims()[1];
 
+    framework::LoDTensor tmp;
+    tmp.Resize(Y_t->dims());
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
+
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+    tmp_weight.dims = weight.dims;
+
+    TensorRTEngine::Weight transposed = weight;
+    ReorderCKtoKC(tmp_weight, &weight);
 
     // Currently, the framework can only handle one fluid op -> one TRT layer,
     // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
     // handle `mul`, leave `add` as another layer.
     // DEBUG
-    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr,
-                                0};
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
 
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
                                        *const_cast<nvinfer1::ITensor*>(X),
                                        n_output, weight.get(), bias.get());
 
     auto output_name = op_desc.Output("Out").front();
     engine_->DeclareOutput(layer, 0, output_name);
-    auto* output = engine_->GetITensor(output_name);
-    LOG(INFO) << "output dim";
-    for (int i = 0; i < output->getDimensions().nbDims; i++) {
-      LOG(INFO) << output->getDimensions().d[i];
-    }
   }
 };
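ReorderCKtoKC exists because fluid's `mul` stores its weight as a row-major C x K matrix (input dimension first), while TRT's FullyConnected kernel consumes K x C. The stride pairs {1, k} and {c, 1} express exactly a transpose. The same index arithmetic on plain arrays, as a self-contained illustration (not code from the patch):

    // CK layout, row-major, C=2 input channels, K=3 output channels:
    float ck[6] = {1, 2, 3,
                   4, 5, 6};
    float kc[6];  // will hold the KC (transposed) layout
    const int c = 2, k = 3;
    for (int i = 0; i < k; ++i)      // output-channel index
      for (int j = 0; j < c; ++j)    // input-channel index
        kc[i * c + j] = ck[j * k + i];
    // kc == {1, 4, 2, 5, 3, 6}, i.e. the 3x2 transpose of ck.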
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 9c4984598c12e1..a30253072ac581 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -25,9 +25,9 @@ TEST(fc_op, test) {
   framework::Scope scope;
   TRTConvertValidation validator(20, parameters, scope, 1000);
 
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(12, 6, 1, 1));
-  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(6, 3));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(12, 3));
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
 
   // Prepare Op description
   framework::OpDesc desc;
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index a338c37717f551..f473ff8ecba509 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -115,10 +115,6 @@ class TRTConvertValidation {
       auto* var = scope_.FindVar(input);
       PADDLE_ENFORCE(var);
       auto tensor = var->GetMutable<framework::LoDTensor>();
-      LOG(INFO) << "set input for TRT " << input;
-      LOG(INFO) << tensor->data<float>()[0];
-      LOG(INFO) << tensor->data<float>()[1];
-      LOG(INFO) << "set data size " << analysis::AccuDims(tensor->dims(), tensor->dims().size());
 
       engine_->SetInputFromCPU(
           input, static_cast<void*>(tensor->data<float>()),
@@ -140,7 +136,6 @@ class TRTConvertValidation {
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(200, 2008.);
-      LOG(INFO) << "get TRT output " << output;
       engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
       cudaStreamSynchronize(*engine_->stream());
 
@@ -150,7 +145,6 @@ class TRTConvertValidation {
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
       for (size_t i = 0; i < fluid_out.size(); i++) {
-        LOG(INFO) << fluid_out[i] << " " << trt_out[i];
         EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
       }
     }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index c7cb1e06632c7a..c1b0949950d276 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -44,6 +44,8 @@ class TensorRTEngine : public EngineBase {
     }
     const nvinfer1::Weights& get() { return w_; }
 
+    std::vector<int64_t> dims;
+
    private:
     nvinfer1::Weights w_;
   };
From 5d24919f60aa76167e0e063fa8083ee365b970db Mon Sep 17 00:00:00 2001
From: superjomn
Date: Thu, 31 May 2018 21:06:46 +0800
Subject: [PATCH 6/7] fix compile

---
 paddle/fluid/operators/tensorrt_engine_op.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 83e768b4dc9c60..4aea3776c3f8a9 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -31,8 +31,10 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
   auto max_workspace = context.Attr<int>("max_workspace");
   engine_.reset(new inference::tensorrt::TensorRTEngine(
       max_batch_, max_workspace, nullptr));
+  // TODO(Superjomn) parameters should be passed be analysised and passed from
+  // outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block, engine_.get());
+      block, {}, context.scope(), engine_.get());
   engine_->FreezeNetwork();
 }
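The TODO above is load-bearing: the engine op hands ConvertBlock an empty parameter set, so a `mul` with a persistable weight is not yet routed to the fc converter when running through TensorRTEngineKernel; only the unit tests exercise that path so far. One plausible shape for the eventual wiring, collecting persistable variables from the block's proto (hypothetical, not part of this series):

    // Hypothetical parameter collection for the engine op's Prepare().
    std::unordered_set<std::string> parameters;
    for (const auto& var : block.vars()) {
      if (var.persistable()) parameters.insert(var.name());
    }
    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlock(block, parameters, context.scope(), engine_.get());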
From d5bd249134ecd445a5f8cb9ee1aabf61fd206de1 Mon Sep 17 00:00:00 2001
From: superjomn
Date: Fri, 1 Jun 2018 13:56:05 +0800
Subject: [PATCH 7/7] fix follow review

---
 paddle/fluid/inference/tensorrt/convert/fc_op.cc  | 15 ++++++++++-----
 .../fluid/inference/tensorrt/convert/ut_helper.h  |  6 ++++--
 paddle/fluid/inference/tensorrt/engine.cc         |  1 -
 paddle/fluid/operators/tensorrt_engine_op.cc      |  3 +--
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 0a73d68e0fc980..bd05608d7620ee 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -23,6 +23,9 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
 template <typename T>
 void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
               T* odata, nvinfer1::DimsHW ostrides) {
@@ -34,6 +37,7 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
   }
 }
 
+// Reorder the data layout from CK to KC.
 void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
                    TensorRTEngine::Weight* oweights) {
   int c = iweights.dims[0];
@@ -57,8 +61,8 @@ class FcOpConverter : public OpConverter {
 
     framework::OpDesc op_desc(op, nullptr, nullptr);
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
-    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);     // Y is a weight
-    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
 
     // Declare inputs
     auto* X = engine_->GetITensor(op_desc.Input("X").front());
@@ -67,8 +71,8 @@ class FcOpConverter : public OpConverter {
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
     PADDLE_ENFORCE_NOT_NULL(Y_v);
     auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    // This may trigger a CPU->GPU copy.
-    // TODO(Superjomn) use some smarter mutable_data.
+    // This may trigger a GPU->CPU copy, because TRT's weight can only be
+    // assigned from CPU memory; that can't be avoided.
     auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
     PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
     size_t n_output = Y_t->dims()[1];
@@ -87,7 +91,8 @@ class FcOpConverter : public OpConverter {
     weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
     tmp_weight.dims = weight.dims;
 
-    TensorRTEngine::Weight transposed = weight;
+    // The data layout of TRT FC layer's weight is different from fluid's FC,
+    // need to reorder the elements.
     ReorderCKtoKC(tmp_weight, &weight);
 
     // Currently, the framework can only handle one fluid op -> one TRT layer,
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index f473ff8ecba509..6105d1d4056d4b 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -133,10 +133,12 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    const size_t output_space_size = 200;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(200, 2008.);
-      engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+      std::vector<float> trt_out(output_space_size);
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
       cudaStreamSynchronize(*engine_->stream());
 
       auto* var = scope_.FindVar(output);
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 3bcde2f2c3dcd5..3d75fefc1a7351 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -151,7 +151,6 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
   // determine data size
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
-  LOG(INFO) << "output size " << it->second;
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
   auto& buf = buffer(name);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 4aea3776c3f8a9..855157e7c4c5c4 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -31,8 +31,7 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
   auto max_workspace = context.Attr<int>("max_workspace");
   engine_.reset(new inference::tensorrt::TensorRTEngine(
       max_batch_, max_workspace, nullptr));
-  // TODO(Superjomn) parameters should be passed be analysised and passed from
-  // outside.
+  // TODO(Superjomn) parameters should be analyzed and passed in from outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
       block, {}, context.scope(), engine_.get());
   engine_->FreezeNetwork();