PaddlePaddle · Xreki · Feb 26, 2023 · Mar 17, 2022 · Sep 23, 2022 · Sep 23, 2022
diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h
@@ -67,14 +67,10 @@ class AutoTuneBase {
            const AlgorithmType& algo,
            const size_t key,
            Args&&... args) {
-    PADDLE_ENFORCE_GT(
-        kernels_.size(),
-        0,
-        phi::errors::InvalidArgument(
-            "kernel num must be greater than 0, now is %d", kernels_.size()));
     is_init_ = true;
-
+    CheckKernelSize();
     auto& cache = AutoTuneCache::Instance().Get(algo);
+
     if (cache.Find(key)) {
       auto best_idx = cache.Get(key);
       kernels_[best_idx].Run(args...);
@@ -91,19 +87,22 @@ class AutoTuneBase {
     }
   }
 
- private:
+ protected:
   bool is_init_{false};
   std::vector<KernelType> kernels_;
   mutable std::mutex mutex_;
 
-  template <typename Context, typename... Args>
-  size_t PickBestKernel(const Context& ctx, Args&&... args) {
-    std::lock_guard<std::mutex> lock(mutex_);
+  void CheckKernelSize() {
     PADDLE_ENFORCE_GT(
         kernels_.size(),
         0,
         phi::errors::InvalidArgument(
             "kernel num must be greater than 0, now is %d", kernels_.size()));
+  }
+
+  template <typename Context, typename... Args>
+  size_t PickBestKernel(const Context& ctx, Args&&... args) {
+    std::lock_guard<std::mutex> lock(mutex_);
     size_t best_idx = 0;
     float min_time = std::numeric_limits<float>::max();
 
@@ -143,36 +142,71 @@ class AutoTuneBase {
   }
 };
 
-template <typename T, typename ReturnType, typename... Args>
-static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> MakeAutoTuner(
-    ReturnType (*func)(Args...)) {
-  auto obj = MakeCallback<T>(func);
-  return AutoTuneBase<T, decltype(obj)>(obj);
-}
-
 template <typename T, typename ReturnType, typename... Args>
 class TransposeAutoTuner
     : public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> {
  public:
-  static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>* Instance(
+  static TransposeAutoTuner<T, ReturnType, Args...>* Instance(
       ReturnType (*func)(Args...)) {
     static std::once_flag transpose_init_flag_;
-    static std::unique_ptr<
-        AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>>
+    static std::unique_ptr<TransposeAutoTuner<T, ReturnType, Args...>>
         instance_;
     std::call_once(transpose_init_flag_, [&] {
       auto obj = MakeCallback<T>(func);
-      instance_.reset(new AutoTuneBase<T, decltype(obj)>(obj));
+      instance_.reset(new TransposeAutoTuner<T, ReturnType, Args...>);
+      instance_->AddCallBack(func);
+    });
+    return instance_.get();
+  }
+};
+
+template <typename T, typename ReturnType, typename... Args>
+class MatmulAutoTuner
+    : public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> {
+ public:
+  static MatmulAutoTuner<T, ReturnType, Args...>* Instance(
+      ReturnType (*func)(Args...)) {
+    static std::once_flag matmul_init_flag_;
+    static std::unique_ptr<MatmulAutoTuner<T, ReturnType, Args...>> instance_;
+    std::call_once(matmul_init_flag_, [&] {
+      // One-time setup: create the singleton and register func with it.
+      instance_.reset(new MatmulAutoTuner<T, ReturnType, Args...>);
+      instance_->AddCallBack(func);
     });
     return instance_.get();
   }
+
+  template <typename Context>
+  void RunMatmul(const Context& ctx, const size_t key, Args... args) {
+    this->is_init_ = true;
+    this->CheckKernelSize();  // enforces kernels_.size() > 0
+    auto& cache = AutoTuneCache::Instance().GetMatmul();
+    if (cache.Find(key)) {
+      auto best_idx = cache.Get(key);  // reuse the cached choice
+      this->kernels_[best_idx].Run(args...);
+    } else {
+      bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
+      if (use_autotune) {
+        auto best_idx = this->PickBestKernel(ctx, args...);
+        cache.Set(key, best_idx);  // remember the pick for this key
+      } else {
+        this->kernels_[0].Run(args...);  // tuning off: run kernel 0
+      }
+    }
+  }
 };
 
 template <typename T, typename ReturnType, typename... Args>
-static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>*
-MakeTransposeTuner(ReturnType (*func)(Args...)) {
+static TransposeAutoTuner<T, ReturnType, Args...>* MakeTransposeTuner(
+    ReturnType (*func)(Args...)) {
   return TransposeAutoTuner<T, ReturnType, Args...>::Instance(func);
 }
 
+template <typename T, typename ReturnType, typename... Args>
+static MatmulAutoTuner<T, ReturnType, Args...>* MakeMatmulTuner(
+    ReturnType (*func)(Args...)) {  // thin wrapper over Instance(func)
+  return MatmulAutoTuner<T, ReturnType, Args...>::Instance(func);
+}
+
 }  // namespace autotune
 }  // namespace phi
diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
@@ -44,28 +44,33 @@ enum class AlgorithmType {
   kConvBackwardData = 2,
   kConvBackwardFilter = 3,
   kTranspose = 4,
-#ifdef PADDLE_WITH_CUDNN_FRONTEND
-  kConvForwardV8 = 5,
-  kConvBackwardDataV8 = 6,
-  kConvBackwardFilterV8 = 7,
-  kAlgorithmCount = 8
+  kMatmul = 5,
+#if !defined(PADDLE_WITH_CUDNN_FRONTEND)
+  kAlgorithmCount = 6
 #else
-  kAlgorithmCount = 5
+  kConvForwardV8 = 6,
+  kConvBackwardDataV8 = 7,
+  kConvBackwardFilterV8 = 8,
+  kAlgorithmCount = 9
 #endif
 };
 
 // AlgorithmsConfigKey -> AlgorithmsID
-// (todo. hong) use cudnnConvolutionFwdAlgo_t
-using AlgorithmsCacheMap = AlgorithmsCache<size_t, int64_t>;
 // AlgorithmType -> AlgorithmsCache
+using AlgorithmsCacheMap = AlgorithmsCache<size_t, int64_t>;
 using AlgorithmsTypeMap = std::unordered_map<int64_t, AlgorithmsCacheMap>;
+
+// (todo. hong) use cudnnConvolutionFwdAlgo_t
 using ConvAlgorithmsCacheMap = ConvAlgorithmsCache<ConvAutoTuneResult>;
 using ConvAlgorithmsTypeMap =
     std::unordered_map<int64_t, ConvAlgorithmsCacheMap>;
+
+using MatmulAlgorithmsCacheMap = MatmulAlgorithmsCache<size_t, int64_t>;
 #ifdef PADDLE_WITH_CUDNN_FRONTEND
 using CudnnV8AlgorithmsTypeMap =
     std::unordered_map<int64_t, CudnnFrontendPlanCache>;
 #endif
+
 class AutoTuneCache {
  public:
   static AutoTuneCache& Instance() {
@@ -77,6 +82,8 @@ class AutoTuneCache {
     return auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
 
+  MatmulAlgorithmsCacheMap& GetMatmul() { return matmul_auto_tune_map_; }
+
   ConvAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
     return conv_auto_tune_map_[static_cast<int64_t>(algo_type)];
   }
@@ -87,8 +94,6 @@ class AutoTuneCache {
   }
 #endif
 
-  AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
-
   void Clean() {
     for (auto& v : auto_tune_map_) {
       v.second.Clean();
@@ -162,6 +167,7 @@ class AutoTuneCache {
 
   AlgorithmsTypeMap auto_tune_map_;
   ConvAlgorithmsTypeMap conv_auto_tune_map_;
+  MatmulAlgorithmsCacheMap matmul_auto_tune_map_;
 #ifdef PADDLE_WITH_CUDNN_FRONTEND
   CudnnV8AlgorithmsTypeMap cudnn_v8_auto_tune_map_;
 #endif

diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h
@@ -60,6 +60,45 @@ size_t GetKey(Args&&... args) {
   return seed;
 }
 
+struct MatmulHashValueType {
+  uint64_t data[8];  // fixed block of eight 64-bit words
+};
+
+struct MatmulCacheKey {
+  // Cache key for matmul: hash of x/y dims, transpose flags and dtype.
+  MatmulCacheKey() {}
+  MatmulCacheKey(const std::vector<int64_t>& x_dims,
+                 const std::vector<int64_t>& y_dims,
+                 const bool trans_x,
+                 const bool trans_y,
+                 phi::DataType dtype)
+      : x_dims_(x_dims),
+        y_dims_(y_dims),
+        trans_x_(trans_x),
+        trans_y_(trans_y),
+        dtype_(dtype) {
+    key_ = GetKey(x_dims_,
+                  y_dims_,
+                  static_cast<int64_t>(trans_x_),
+                  static_cast<int64_t>(trans_y_),
+                  static_cast<int64_t>(dtype_));
+  }
+
+  size_t QueryKey() const { return key_; }
+  size_t GetSize() const { return x_dims_.size(); }
+  size_t GetSubKey(int64_t idx) { return GetKey(key_, idx); }
+
+ private:
+  int size_{0};  // not read anywhere in this struct
+  size_t key_{0};  // stays 0 when default-constructed
+  std::vector<int64_t> x_dims_;
+  std::vector<int64_t> y_dims_;
+  bool trans_x_{false};
+  bool trans_y_{false};
+  int best_algo_{0};  // not read anywhere in this struct
+  phi::DataType dtype_{};  // value-initialized until set by the ctor
+};
+
 struct ConvCacheKey {
   ConvCacheKey() {}
   ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
@@ -213,5 +252,34 @@ class ConvAlgorithmsCache : public AlgorithmsCache<ConvCacheKey,
   }
 };
 
+template <typename KeyT, typename AlgorithmT>
+class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> {
+ public:
+  MatmulAlgorithmsCache() : AlgorithmsCache<KeyT, AlgorithmT>() {}
+
+  bool FindSubKey(const KeyT& sub_key) {
+    std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
+    return sub_hash_.find(sub_key) != sub_hash_.end();
+  }
+
+  void SetSubKey(const KeyT& sub_key, const MatmulHashValueType* algo) {
+    std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
+    sub_hash_[sub_key] = *algo;  // copies *algo; algo must be non-null
+  }
+
+  MatmulHashValueType* GetSubKey(const KeyT& sub_key) {
+    std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
+    auto iter = sub_hash_.find(sub_key);  // look up once, reuse below
+    PADDLE_ENFORCE_NE(
+        iter,
+        sub_hash_.end(),
+        phi::errors::PreconditionNotMet("The key does not exist."));
+    return &(iter->second);  // pointer into sub_hash_
+  }
+
+ private:
+  std::unordered_map<KeyT, MatmulHashValueType> sub_hash_;
+};
+
 }  // namespace autotune
 }  // namespace phi