Commit bad882c

Merge pull request PaddlePaddle#858 from gongel/ft_decoder_op

[Feat] Add custom op for FasterTransformer decoder

Author: gongenlei
Parents: 15b3dad + 8019078

10 files changed: +1359 −4 lines (only a subset of the changed files is shown below)

paddlenlp/ops/CMakeLists.txt (+7 −2)
@@ -26,6 +26,7 @@ option(USE_TENSORRT "Compile with TensorRT."
 option(WITH_TRANSFORMER "Compile with Transformer" ON)
 option(WITH_GPT "Compile with GPT" OFF)
 option(WITH_UNIFIED "Compile with Unified Transformer" ON)
+option(WITH_DECODER "Compile with Transformer Decoder" ON)
 
 if(NOT WITH_GPU)
   message(FATAL_ERROR "Faster transformer custom op doesn't support CPU. Please add the flag -DWITH_GPU=ON to use GPU. ")
@@ -43,8 +44,12 @@ if(WITH_UNIFIED)
   list(APPEND decoding_op_files fusion_unified_decoding_op.cc fusion_unified_decoding_op.cu)
 endif()
 
-if(NOT WITH_TRANSFORMER AND NOT WITH_GPT)
-  message(FATAL_ERROR "-DWITH_TRANSFORMER=ON or/and -DWITH_GPT=ON must be set to use FasterTransformer. ")
+if(WITH_DECODER)
+  list(APPEND decoder_op_files fusion_decoder_op.cc fusion_decoder_op.cu)
+endif()
+
+if(NOT WITH_TRANSFORMER AND NOT WITH_GPT AND NOT WITH_DECODER)
+  message(FATAL_ERROR "-DWITH_TRANSFORMER=ON or/and -DWITH_GPT=ON or/and -DWITH_DECODER=ON must be set to use FasterTransformer. ")
 endif()
 
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})

paddlenlp/ops/__init__.py (+1 −0)
@@ -14,6 +14,7 @@
 
 from .faster_transformer.transformer.decoding import *
 from .faster_transformer.transformer.faster_transformer import *
+from .faster_transformer.transformer.decoder import *
 from .einsum import *
 from .distributed import *
 from . import optimizer
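
With this re-export in place, FasterDecoder becomes importable directly from paddlenlp.ops. A minimal construction sketch (not part of the commit), using the hyperparameter values from the sample config added below; the decoder_lib path is the samples' default and assumes the op has been built with -DWITH_DECODER=ON:

    import paddle
    from paddlenlp.ops import FasterDecoder  # newly re-exported by this change

    paddle.set_device("gpu")  # the fused decoder op is GPU-only (see CMakeLists above)

    # Values mirror the decoder sample config below.
    model = FasterDecoder(
        src_vocab_size=38512,
        trg_vocab_size=38512,
        max_length=257,  # max_length + 1, as in the sample scripts
        num_encoder_layers=6,
        num_decoder_layers=6,
        n_head=8,
        d_model=512,
        d_inner_hid=2048,
        dropout=0.1,
        weight_sharing=True,
        bos_id=0,
        eos_id=1,
        max_out_len=256,
        decoder_lib="../../build/lib/libdecoder_op.so",
        use_fp16_decoder=False)
    model.eval()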
New file (+39) — path not shown in this view; the sample scripts below load it as ./config/decoder.sample.yaml
@@ -0,0 +1,39 @@
+# Batch size during inference.
+infer_batch_size: 8
+max_out_len: 256
+
+# Hyperparams for the model:
+# The following five vocabulary-related configurations are set
+# automatically according to the passed vocabulary path and special tokens.
+# Size of source word dictionary.
+src_vocab_size: 38512
+# Size of target word dictionary.
+trg_vocab_size: 38512
+# Index for <bos> token
+bos_idx: 0
+# Index for <eos> token
+eos_idx: 1
+# Index for <unk> token
+unk_idx: 2
+# Max length of sequences, deciding the size of the position encoding table.
+max_length: 256
+# The dimension of word embeddings, which is also the last dimension of
+# the input and output of multi-head attention, the position-wise
+# feed-forward networks, and the encoder and decoder.
+d_model: 512
+# Size of the hidden layer in the position-wise feed-forward networks.
+d_inner_hid: 2048
+# Number of heads used in multi-head attention.
+n_head: 8
+# Number of sub-layers to be stacked in the encoder.
+num_encoder_layers: 6
+# Number of sub-layers to be stacked in the decoder.
+num_decoder_layers: 6
+# Dropout rate.
+dropout: 0.1
+# The flag indicating whether to share embedding and softmax weights.
+# Source and target vocabularies must be the same for weight sharing.
+weight_sharing: True
+
+# Path of trained parameters, used for prediction.
+init_from_params: base_trained_models/step_final
New file (+126) — path not shown in this view; a sample script that benchmarks the fused decoder layer directly
@@ -0,0 +1,126 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from attrdict import AttrDict
+import argparse
+import time
+
+import yaml
+from pprint import pprint
+import paddle
+
+from paddlenlp.ops import FasterDecoder
+from paddlenlp.utils.log import logger
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        default="./config/decoder.sample.yaml",
+        type=str,
+        help="Path of the config file. ")
+    parser.add_argument(
+        "--decoder_lib",
+        default="../../build/lib/libdecoder_op.so",
+        type=str,
+        help="Path of libdecoder_op.so. ")
+    parser.add_argument(
+        "--use_fp16_decoder",
+        action="store_true",
+        help="Whether to use the fp16 decoder to predict. ")
+    args = parser.parse_args()
+    return args
+
+
+def do_predict(args):
+    place = "gpu"
+    paddle.set_device(place)
+
+    # Define the model.
+    transformer = FasterDecoder(
+        src_vocab_size=args.src_vocab_size,
+        trg_vocab_size=args.trg_vocab_size,
+        max_length=args.max_length + 1,
+        num_encoder_layers=args.num_encoder_layers,
+        num_decoder_layers=args.num_decoder_layers,
+        n_head=args.n_head,
+        d_model=args.d_model,
+        d_inner_hid=args.d_inner_hid,
+        dropout=args.dropout,
+        weight_sharing=args.weight_sharing,
+        bos_id=args.bos_idx,
+        eos_id=args.eos_idx,
+        max_out_len=args.max_out_len,
+        decoder_lib=args.decoder_lib,
+        use_fp16_decoder=args.use_fp16_decoder)
+
+    # Load the checkpoint.
+    transformer.load(
+        os.path.join(args.init_from_params, "transformer.pdparams"))
+    # Set evaluate mode.
+    transformer.eval()
+
+    # Generate input data randomly.
+    dec_input = paddle.randn(
+        shape=[args.infer_batch_size, 1, args.d_model], dtype='float32')
+    enc_output = paddle.randn(
+        shape=[args.infer_batch_size, args.max_length, args.d_model],
+        dtype='float32')
+    mem_seq_lens = paddle.full(
+        shape=[args.infer_batch_size, 1],
+        fill_value=args.max_length,
+        dtype='int32')
+    dtype = 'float32'
+    if args.use_fp16_decoder:
+        dtype = 'float16'
+        dec_input = paddle.cast(dec_input, dtype=dtype)
+        enc_output = paddle.cast(enc_output, dtype=dtype)
+    self_cache = paddle.zeros(
+        shape=[
+            args.num_decoder_layers, 2, 0, args.infer_batch_size, args.d_model
+        ],
+        dtype=dtype)
+    mem_cache = paddle.zeros(
+        shape=[
+            args.num_decoder_layers, 2, args.infer_batch_size, args.max_length,
+            args.d_model
+        ],
+        dtype=dtype)
+
+    with paddle.no_grad():
+        for i in range(100):
+            # Start timing after 50 warmup iterations.
+            if i == 50:
+                start = time.time()
+            dec_output, self_cache, mem_cache = transformer.decoder(
+                from_tensor=dec_input,
+                memory_tensor=enc_output,
+                mem_seq_len=mem_seq_lens,
+                self_cache=self_cache,
+                mem_cache=mem_cache)
+        logger.info("Average test time for decoder is %f ms" % (
+            (time.time() - start) / 50 * 1000))
+
+
+if __name__ == "__main__":
+    ARGS = parse_args()
+    yaml_file = ARGS.config
+    with open(yaml_file, 'rt') as f:
+        args = AttrDict(yaml.safe_load(f))
+    args.decoder_lib = ARGS.decoder_lib
+    args.use_fp16_decoder = ARGS.use_fp16_decoder
+    pprint(args)
+
+    do_predict(args)
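
A hedged reading of the call pattern above, inferred from the tensor shapes in this sample and not documented in the diff itself: self_cache holds per-layer self-attention keys and values, and its third (time) dimension starts at 0 and appears to grow by one entry per step, while mem_cache holds fixed-size keys/values projected from the encoder output. A sketch of multi-step use, feeding the caches back in; producing the next dec_input from dec_output (argmax plus embedding lookup) is model-specific and omitted here:

    import paddle

    # Assumes transformer, dec_input, enc_output, mem_seq_lens, self_cache and
    # mem_cache are set up exactly as in the sample above.
    with paddle.no_grad():
        for step in range(max_out_len):
            dec_output, self_cache, mem_cache = transformer.decoder(
                from_tensor=dec_input,     # [batch, 1, d_model]: one step at a time
                memory_tensor=enc_output,  # [batch, src_len, d_model]
                mem_seq_len=mem_seq_lens,  # [batch, 1] true source lengths
                self_cache=self_cache,     # grows along dim 2 each step
                mem_cache=mem_cache)       # filled on the first step, reused after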
New file (+100) — path not shown in this view; a sample script that benchmarks the full FasterDecoder forward pass
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from attrdict import AttrDict
+import argparse
+import time
+
+import yaml
+from pprint import pprint
+import paddle
+from paddlenlp.ops import FasterDecoder
+from paddlenlp.utils.log import logger
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        default="./config/decoder.sample.yaml",
+        type=str,
+        help="Path of the config file. ")
+    parser.add_argument(
+        "--decoder_lib",
+        default="../../build/lib/libdecoder_op.so",
+        type=str,
+        help="Path of libdecoder_op.so. ")
+    parser.add_argument(
+        "--use_fp16_decoder",
+        action="store_true",
+        help="Whether to use the fp16 decoder to predict. ")
+    args = parser.parse_args()
+    return args
+
+
+def do_predict(args):
+    place = "gpu"
+    paddle.set_device(place)
+
+    # Define the model.
+    transformer = FasterDecoder(
+        src_vocab_size=args.src_vocab_size,
+        trg_vocab_size=args.trg_vocab_size,
+        max_length=args.max_length + 1,
+        num_encoder_layers=args.num_encoder_layers,
+        num_decoder_layers=args.num_decoder_layers,
+        n_head=args.n_head,
+        d_model=args.d_model,
+        d_inner_hid=args.d_inner_hid,
+        dropout=args.dropout,
+        weight_sharing=args.weight_sharing,
+        bos_id=args.bos_idx,
+        eos_id=args.eos_idx,
+        max_out_len=args.max_out_len,
+        decoder_lib=args.decoder_lib,
+        use_fp16_decoder=args.use_fp16_decoder)
+
+    # Load the checkpoint.
+    transformer.load(
+        os.path.join(args.init_from_params, "transformer.pdparams"))
+    # Set evaluate mode.
+    transformer.eval()
+
+    # Generate src_word randomly.
+    src_word = paddle.randint(
+        0,
+        args.src_vocab_size,
+        shape=[args.infer_batch_size, args.max_length],
+        dtype='int64')
+
+    with paddle.no_grad():
+        for i in range(100):
+            # Start timing after 50 warmup iterations.
+            if i == 50:
+                start = time.time()
+            finished_seq, finished_scores = transformer(src_word=src_word)
+        logger.info("Average test time for decoder is %f ms" % (
+            (time.time() - start) / 50 * 1000))
+
+
+if __name__ == "__main__":
+    ARGS = parse_args()
+    yaml_file = ARGS.config
+    with open(yaml_file, 'rt') as f:
+        args = AttrDict(yaml.safe_load(f))
+    args.decoder_lib = ARGS.decoder_lib
+    args.use_fp16_decoder = ARGS.use_fp16_decoder
+    pprint(args)
+
+    do_predict(args)

paddlenlp/ops/faster_transformer/src/CMakeLists.txt (+6 −2)
@@ -145,7 +145,7 @@ if(ON_INFER)
     set(DEPS ${DEPS} shlwapi.lib)
   endif(NOT WIN32)
 
-  cuda_add_library(pd_infer_custom_op ${decoding_op_files} SHARED)
+  cuda_add_library(pd_infer_custom_op ${decoding_op_files} ${decoder_op_files} SHARED)
   add_dependencies(pd_infer_custom_op extern_${THIRD_PARTY_NAME})
   string(REPLACE "/" ";" DEMO_PATH ${DEMO})
 
@@ -269,4 +269,8 @@ else(ON_INFER)
   add_library(decoding_op SHARED ${decoding_op_files})
   add_dependencies(decoding_op extern_${THIRD_PARTY_NAME} boost)
   target_link_libraries(decoding_op PRIVATE -lcublas -lcudart ${lib_link} ${ft_lib_link})
-endif()
+
+  add_library(decoder_op SHARED ${decoder_op_files})
+  add_dependencies(decoder_op extern_${THIRD_PARTY_NAME} boost)
+  target_link_libraries(decoder_op PRIVATE -lcublas -lcudart -ldecoder ${lib_link})
+endif()
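
For orientation (not part of the diff): in the non-ON_INFER branch, the new decoder_op target links FasterTransformer's decoder library (-ldecoder) and produces the libdecoder_op.so that the sample scripts expect. A minimal sketch, assuming the samples' default out-of-tree build layout, of locating and forwarding that artifact:

    import os

    # Default path used by the sample scripts above; adjust to your build tree.
    decoder_lib = os.path.abspath("../../build/lib/libdecoder_op.so")
    if not os.path.isfile(decoder_lib):
        raise FileNotFoundError(
            "libdecoder_op.so not found; build with -DWITH_GPU=ON -DWITH_DECODER=ON first")

    # The samples forward this path via FasterDecoder(decoder_lib=...), which
    # registers the fused decoder custom op before running.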
