
Commit fa1fa75

Authored by FrostML, ZeyuChen, and guoshengCS
Transformer decoding support fuse qkv (#1455)
* decoding support fuseqkv
* update force version
* fp16
* alternative
* force decoding support global cublashandle and cublaslthandle
* update
* update
* update
* rm ref

Co-authored-by: Zeyu Chen <chenzeyu01@baidu.com>
Co-authored-by: Guo Sheng <whucsgs@163.com>
1 parent 5f2862a commit fa1fa75

9 files changed: +141 −68 lines
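Note: the fused-QKV path concatenates each decoder layer's query, key, and value projection weights along the output dimension so one GEMM produces all three projections per layer instead of three separate ones. A minimal NumPy sketch of the idea (shapes and variable names are illustrative, not part of this PR):

import numpy as np

# Fuse Q/K/V projection weights along the output (last) axis.
d_model = 512
w_q = np.random.rand(d_model, d_model).astype("float32")
w_k = np.random.rand(d_model, d_model).astype("float32")
w_v = np.random.rand(d_model, d_model).astype("float32")
w_qkv = np.concatenate((w_q, w_k, w_v), axis=-1)   # [d_model, 3 * d_model]

x = np.random.rand(4, d_model).astype("float32")   # 4 decoder tokens
q, k, v = np.split(x @ w_qkv, 3, axis=-1)          # one GEMM instead of three

assert np.allclose(q, x @ w_q)
assert np.allclose(k, x @ w_k)
assert np.allclose(v, x @ w_v)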

paddlenlp/ops/faster_transformer/sample/decoding_sample.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--config",
-        default="./sample/config/decoding.sample.yaml",
+        default="./faster_transformer/sample/config/decoding.sample.yaml",
         type=str,
         help="Path of the config file. ")
     parser.add_argument(

paddlenlp/ops/faster_transformer/src/cublas_handle.cc

Lines changed: 1 addition & 1 deletion
@@ -25,4 +25,4 @@ CublasHandle* CublasHandle::GetInstance() {
 CublasHandle::~CublasHandle() {
   cublasDestroy(cublas_handle_);
   cublasLtDestroy(cublaslt_handle_);
-}
+}

paddlenlp/ops/faster_transformer/src/cublas_handle.h

Lines changed: 1 addition & 1 deletion
@@ -55,4 +55,4 @@ class CublasHandle {
   cublasLtHandle_t cublaslt_handle_;
 
   ~CublasHandle();
-};
+};

paddlenlp/ops/faster_transformer/src/fusion_decoding_op.cu

Lines changed: 9 additions & 4 deletions
@@ -21,7 +21,6 @@ limitations under the License. */
 #include <sstream>
 #include <vector>
 
-#include "cublas_handle.h"
 #include "fastertransformer/cuda/cub/cub.cuh"
 #include "fusion_decoding_op.h"
 #include "pd_traits.h"
@@ -125,6 +124,10 @@ std::vector<paddle::Tensor> decoding_kernel(
   DecoderInitParam<DataType_>* params =
       new DecoderInitParam<DataType_>[num_layer_];
 
+  auto q_weight_shape = self_attn_query_weight[0].shape();
+  auto k_weight_shape = self_attn_key_weight[0].shape();
+  bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? false : true;
+
   for (int i = 0; i < num_layer_; i++) {
     params[i].stream = stream;
     params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_;
@@ -261,7 +264,8 @@ std::vector<paddle::Tensor> decoding_kernel(
         start_id_,
         end_id_,
         beam_search_diversity_rate_,
-        true);  // is_fuse_topk_softMax
+        true,  // is_fuse_topk_softMax
+        fuse_qkv);
 
     decoding_beam_search_->forward(params, decoding_params);
 
@@ -283,7 +287,7 @@ std::vector<paddle::Tensor> decoding_kernel(
         end_id_,
         beam_search_diversity_rate_,
         true,  // is_fuse_topk_softMax
-        false,  // is_fuse_qkv
+        fuse_qkv,
         true,  // keep_alive_beam
         alpha);
 
@@ -307,7 +311,8 @@ std::vector<paddle::Tensor> decoding_kernel(
        start_id_,
        end_id_,
        candidate_num_,
-       probability_threshold_);
+       probability_threshold_,
+       fuse_qkv);
 
    decoding_sampling_->forward(params, decoding_params);
 
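Note: the op does not receive an explicit fuse_qkv flag; it infers it from the parameter shapes. When the Python layer passes the fused [d_model, 3 * d_model] parameter in the query-weight slot, its second dimension no longer matches the key weight's, and fuse_qkv flips to true. A small Python sketch of that check under the same assumption (shapes are illustrative):

def infer_fuse_qkv(q_weight_shape, k_weight_shape):
    # A fused Q/K/V weight is three projections wide; an unfused one matches K.
    return q_weight_shape[1] != k_weight_shape[1]

assert infer_fuse_qkv([512, 1536], [512, 512])      # fused QKV parameter
assert not infer_fuse_qkv([512, 512], [512, 512])   # separate Q/K/V weights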
paddlenlp/ops/faster_transformer/src/fusion_decoding_op.h

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "cublas_handle.h"
+
 #include "fastertransformer/decoding_beamsearch.h"
 #include "fastertransformer/decoding_sampling.h"
 #include "fastertransformer/open_decoder.h"

paddlenlp/ops/faster_transformer/src/fusion_force_decoding_op.cu

Lines changed: 39 additions & 44 deletions
@@ -83,20 +83,18 @@ std::vector<paddle::Tensor> decoding_kernel(
     paddle::Tensor& output_ids,
     paddle::Tensor& parent_ids,
     paddle::Tensor& sequence_length,
-    std::string decoding_strategy,
-    int beam_size,
-    int topk,
-    float topp,
-    int head_num_,
-    int size_per_head_,
-    int num_layer_,
-    int start_id_,
-    int end_id_,
-    int64_t max_seq_len_,
-    float beam_search_diversity_rate_,
-    float alpha,
-    cublasHandle_t cublas_handle_,
-    cublasLtHandle_t cublaslt_handle_,
+    const std::string& decoding_strategy,
+    const int beam_size,
+    const int topk,
+    const float topp,
+    const int head_num_,
+    const int size_per_head_,
+    const int num_layer_,
+    const int start_id_,
+    const int end_id_,
+    const int64_t max_seq_len_,
+    const float beam_search_diversity_rate_,
+    const float alpha,
     cudaStream_t stream) {
   int beam_width_ = (decoding_strategy == "beam_search" ||
                      decoding_strategy == "beam_search_v2")
@@ -119,8 +117,9 @@ std::vector<paddle::Tensor> decoding_kernel(
   typedef typename traits_::data_t data_t_;
 
   DecodingInitParam<DataType_> decoding_params;
-  decoding_params.cublas_handle = cublas_handle_;
-  decoding_params.cublaslt_handle = cublaslt_handle_;
+  decoding_params.cublas_handle = CublasHandle::GetInstance()->cublas_handle_;
+  decoding_params.cublaslt_handle =
+      CublasHandle::GetInstance()->cublaslt_handle_;
 
   decoding_params.output_ids = output_ids.mutable_data<int>(input.place());
   decoding_params.parent_ids = parent_ids.mutable_data<int>(input.place());
@@ -156,10 +155,14 @@ std::vector<paddle::Tensor> decoding_kernel(
   DecoderInitParam<DataType_>* params =
       new DecoderInitParam<DataType_>[num_layer_];
 
+  auto q_weight_shape = self_attn_query_weight[0].shape();
+  auto k_weight_shape = self_attn_key_weight[0].shape();
+  bool fuse_qkv = (q_weight_shape[1] == k_weight_shape[1]) ? false : true;
+
   for (int i = 0; i < num_layer_; i++) {
     params[i].stream = stream;
-    params[i].cublas_handle = cublas_handle_;
-    params[i].cublaslt_handle = cublaslt_handle_;
+    params[i].cublas_handle = CublasHandle::GetInstance()->cublas_handle_;
+    params[i].cublaslt_handle = CublasHandle::GetInstance()->cublaslt_handle_;
 
     if (decoding_strategy == "beam_search" ||
         decoding_strategy == "beam_search_v2") {
@@ -292,7 +295,8 @@ std::vector<paddle::Tensor> decoding_kernel(
         start_id_,
         end_id_,
         beam_search_diversity_rate_,
-        true);  // is_fuse_topk_softMax
+        true,  // is_fuse_topk_softMax
+        fuse_qkv);  // is_fuse_qkv
 
     decoding_beam_search_->forward(params, decoding_params);
 
@@ -314,7 +318,7 @@ std::vector<paddle::Tensor> decoding_kernel(
        end_id_,
        beam_search_diversity_rate_,
        true,  // is_fuse_topk_softMax
-       false,  // is_fuse_qkv
+       fuse_qkv,  // is_fuse_qkv
        true,  // keep_alive_beam
        alpha);
 
@@ -338,7 +342,8 @@ std::vector<paddle::Tensor> decoding_kernel(
        start_id_,
        end_id_,
        candidate_num_,
-       probability_threshold_);
+       probability_threshold_,
+       fuse_qkv);
 
    decoding_sampling_->forward(params, decoding_params);
 
@@ -392,24 +397,20 @@ std::vector<paddle::Tensor> DecodingCUDAForward(
     paddle::Tensor& output_ids,
     paddle::Tensor& parent_ids,
     paddle::Tensor& sequence_length,
-    std::string decoding_strategy,
-    int beam_size,
-    int topk,
-    float topp,
-    int n_head,
-    int size_per_head,
-    int num_layer,
-    int bos_id,
-    int eos_id,
-    int64_t max_len,
-    float beam_search_diversity_rate,
-    float alpha) {
+    const std::string& decoding_strategy,
+    const int beam_size,
+    const int topk,
+    const float topp,
+    const int n_head,
+    const int size_per_head,
+    const int num_layer,
+    const int bos_id,
+    const int eos_id,
+    const int64_t max_len,
+    const float beam_search_diversity_rate,
+    const float alpha) {
   auto stream = input.stream();
-  cublasHandle_t cublas_handle_;
-  cublasCreate(&cublas_handle_);
-  cublasLtHandle_t cublaslt_handle_;
-  cublasLtCreate(&cublaslt_handle_);
-  cublasSetStream(cublas_handle_, stream);
+  cublasSetStream(CublasHandle::GetInstance()->cublas_handle_, stream);
 
   std::vector<paddle::Tensor> ret;
 
@@ -466,8 +467,6 @@ std::vector<paddle::Tensor> DecodingCUDAForward(
           max_len,
           beam_search_diversity_rate,
           alpha,
-          cublas_handle_,
-          cublaslt_handle_,
           stream);
       break;
     }
@@ -523,8 +522,6 @@ std::vector<paddle::Tensor> DecodingCUDAForward(
          max_len,
          beam_search_diversity_rate,
          alpha,
-         cublas_handle_,
-         cublaslt_handle_,
          stream);
      break;
    }
@@ -536,7 +533,5 @@ std::vector<paddle::Tensor> DecodingCUDAForward(
     }
   }
 
-  cublasDestroy(cublas_handle_);
-  cublasLtDestroy(cublaslt_handle_);
   return ret;
 }

paddlenlp/ops/faster_transformer/src/fusion_force_decoding_op.h

Lines changed: 14 additions & 12 deletions
@@ -16,6 +16,8 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "cublas_handle.h"
+
 #include "fastertransformer/decoding_beamsearch.h"
 #include "fastertransformer/decoding_sampling.h"
 #include "fastertransformer/open_decoder.h"
@@ -67,15 +69,15 @@ std::vector<paddle::Tensor> DecodingCUDAForward(
     paddle::Tensor& output_ids,
     paddle::Tensor& parent_ids,
     paddle::Tensor& sequence_length,
-    std::string decoding_strategy,
-    int beam_size,
-    int topk,
-    float topp,
-    int n_head,
-    int size_per_head,
-    int num_layer,
-    int bos_id,
-    int eos_id,
-    int64_t max_len,
-    float beam_search_diversity_rate,
-    float alpha);
+    const std::string& decoding_strategy,
+    const int beam_size,
+    const int topk,
+    const float topp,
+    const int n_head,
+    const int size_per_head,
+    const int num_layer,
+    const int bos_id,
+    const int eos_id,
+    const int64_t max_len,
+    const float beam_search_diversity_rate,
+    const float alpha);

paddlenlp/ops/faster_transformer/transformer/decoding.py

Lines changed: 40 additions & 3 deletions
@@ -619,6 +619,13 @@ def __init__(self,
         )
         load("FasterTransformer", verbose=True)
 
+        size_per_head = d_model / n_head
+        # fuse_qkv can only support size_per_head is one of [32, 64, 128].
+        if size_per_head in [32, 64, 128]:
+            self._fuse_qkv = True
+        else:
+            self._fuse_qkv = False
+
         super(InferTransformerDecoding, self).__init__()
         for arg, value in locals().items():
             if arg not in [
@@ -715,11 +722,41 @@ def __init__(self,
         self.ffn_out_weight = []
         self.ffn_out_bias = []
 
-        for mod in decoder.layers:
+        for i, mod in enumerate(decoder.layers):
             self.slf_ln_weight.append(mod.norm1.weight)
             self.slf_ln_bias.append(mod.norm1.bias)
-            self.slf_q_weight.append(mod.self_attn.q_proj.weight)
-            self.slf_q_bias.append(mod.self_attn.q_proj.bias)
+
+            if self._fuse_qkv:
+                q_weight_shape = mod.self_attn.q_proj.weight.shape
+                k_weight_shape = mod.self_attn.k_proj.weight.shape
+                v_weight_shape = mod.self_attn.v_proj.weight.shape
+
+                q_weights = self.create_parameter(
+                    shape=[
+                        q_weight_shape[0], q_weight_shape[1] + k_weight_shape[1]
+                        + v_weight_shape[1]
+                    ],
+                    dtype="float16" if use_fp16_decoding else "float32")
+                setattr(self, "slf_q_weight_" + str(i), q_weights)
+                self.slf_q_weight.append(
+                    getattr(self, "slf_q_weight_" + str(i)))
+
+                q_bias_shape = mod.self_attn.q_proj.bias.shape
+                k_bias_shape = mod.self_attn.k_proj.bias.shape
+                v_bias_shape = mod.self_attn.v_proj.bias.shape
+
+                q_biases = self.create_parameter(
+                    shape=[
+                        q_bias_shape[0] + k_bias_shape[0] + v_bias_shape[0]
+                    ],
+                    dtype="float16" if use_fp16_decoding else "float32",
+                    is_bias=True)
+                setattr(self, "slf_q_bias_" + str(i), q_biases)
+                self.slf_q_bias.append(getattr(self, "slf_q_bias_" + str(i)))
+            else:
+                self.slf_q_weight.append(mod.self_attn.q_proj.weight)
+                self.slf_q_bias.append(mod.self_attn.q_proj.bias)
+
             self.slf_k_weight.append(mod.self_attn.k_proj.weight)
             self.slf_k_bias.append(mod.self_attn.k_proj.bias)
             self.slf_v_weight.append(mod.self_attn.v_proj.weight)
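Note: for the fused path, each fused parameter is registered as a named attribute (slf_q_weight_<i>, slf_q_bias_<i>) via create_parameter + setattr, so it shows up in the layer's state dict and can be filled from the trained Q/K/V projections at load time. A minimal standalone Paddle sketch of that registration pattern (class name and sizes are made up for illustration):

import paddle

class FusedQKVDemo(paddle.nn.Layer):
    def __init__(self, num_layers=2, d_model=8):
        super(FusedQKVDemo, self).__init__()
        self.slf_q_weight = []
        for i in range(num_layers):
            # Empty fused weight of width 3 * d_model; values are filled later
            # by concatenating the trained q/k/v projection weights.
            w = self.create_parameter(
                shape=[d_model, 3 * d_model], dtype="float32")
            setattr(self, "slf_q_weight_" + str(i), w)
            self.slf_q_weight.append(getattr(self, "slf_q_weight_" + str(i)))

print(list(FusedQKVDemo().state_dict().keys()))
# ['slf_q_weight_0', 'slf_q_weight_1']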
paddlenlp/ops/faster_transformer/transformer/faster_transformer.py

Lines changed: 34 additions & 2 deletions
@@ -272,9 +272,25 @@ def load(self, init_from_params):
         model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
             self.max_length, self.d_model)
 
+        if self.decoding._fuse_qkv:
+            for item in self.state_dict():
+                if "decoder" in item and "self_attn.q_proj" in item:
+                    num_layer = item.split(".")[3]
+                    param_type = item.split(".")[-1]
+
+                    model_dict["decoding.slf_q_" + param_type + "_" +
+                               num_layer] = np.concatenate(
+                                   (model_dict[item], model_dict[
+                                       "transformer.decoder.layers." + num_layer
+                                       + ".self_attn.k_proj." + param_type],
+                                    model_dict["transformer.decoder.layers." +
+                                               num_layer + ".self_attn.v_proj."
+                                               + param_type]),
+                                   axis=-1)
+
         if self.use_fp16_decoding:
             for item in self.state_dict():
-                if "decoder" in item:
+                if "decoder" in item or "decoding.slf" in item:
                     model_dict[item] = np.float16(model_dict[item])
             model_dict["decoding_linear.weight"] = np.float16(model_dict[
                 "decoding_linear.weight"])
@@ -377,9 +393,25 @@ def export_params(self, init_from_params, place):
         model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
             self.max_length, self.d_model)
 
+        if self.decoding._fuse_qkv:
+            for item in self.state_dict():
+                if "decoder" in item and "self_attn.q_proj" in item:
+                    num_layer = item.split(".")[3]
+                    param_type = item.split(".")[-1]
+
+                    model_dict["decoding.slf_q_" + param_type + "_" +
+                               num_layer] = np.concatenate(
+                                   (model_dict[item], model_dict[
+                                       "transformer.decoder.layers." + num_layer
+                                       + ".self_attn.k_proj." + param_type],
+                                    model_dict["transformer.decoder.layers." +
+                                               num_layer + ".self_attn.v_proj."
+                                               + param_type]),
+                                   axis=-1)
+
         if self.use_fp16_decoding:
             for item in self.state_dict():
-                if "decoder" in item:
+                if "decoder" in item or "decoding.slf" in item:
                     model_dict[item] = np.float16(model_dict[item])
             model_dict["decoding_linear.weight"] = np.float16(model_dict[
                 "decoding_linear.weight"])
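Note: at load/export time the fused parameters are produced by plain NumPy concatenation of the trained q/k/v projections along the last axis, and the fp16 cast afterwards also has to cover the new "decoding.slf*" keys. A self-contained sketch of that state-dict rewrite, mirroring the key layout above (toy tensors, illustrative only):

import numpy as np

def fuse_layer_qkv(model_dict, layer, param_type="weight"):
    # Concatenate q/k/v along the last axis into the fused decoding parameter.
    prefix = "transformer.decoder.layers." + layer + ".self_attn."
    fused = np.concatenate(
        (model_dict[prefix + "q_proj." + param_type],
         model_dict[prefix + "k_proj." + param_type],
         model_dict[prefix + "v_proj." + param_type]),
        axis=-1)
    model_dict["decoding.slf_q_" + param_type + "_" + layer] = fused
    return fused

d = 4
toy = {
    "transformer.decoder.layers.0.self_attn." + p + "_proj.weight":
    np.ones((d, d), dtype="float32")
    for p in ("q", "k", "v")
}
assert fuse_layer_qkv(toy, "0").shape == (d, 3 * d)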