Skip to content

Commit d4217fc

Browse files
JamesLim-sy, zhangbopd, Xreki
authored
Matmul performance optimization with cuBlasLt (#46431)
* implement of matmul using cublasLt instead of cublas * Update matmul_kernel_impl_via_blasLt.h --------- Co-authored-by: zhangbopd <1299246947@qq.com> Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Co-authored-by: Liu Yiqun <liuyiqun01@baidu.com>
1 parent 57f6a46 commit d4217fc

File tree

5 files changed

+992
-33
lines changed

5 files changed

+992
-33
lines changed

paddle/phi/kernels/autotune/auto_tune_base.h

+42-4
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,43 @@ class AutoTuneBase {
141141
}
142142
};
143143

144-
// To init the auto_tuner object.
144+
template <typename T, typename ReturnType, typename... Args>
145+
class MatmulAutoTuner
146+
: public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> {
147+
public:
148+
static MatmulAutoTuner<T, ReturnType, Args...>* Instance(
149+
ReturnType (*func)(Args...)) {
150+
static std::once_flag matmul_init_flag;
151+
static std::unique_ptr<MatmulAutoTuner<T, ReturnType, Args...>> instance;
152+
std::call_once(matmul_init_flag, [&] {
153+
auto obj = MakeCallback<T>(func);
154+
instance.reset(new MatmulAutoTuner<T, ReturnType, Args...>);
155+
instance->AddCallBack(func);
156+
});
157+
return instance.get();
158+
}
159+
160+
template <typename Context>
161+
void Run(const Context& ctx, const size_t key, Args... args) {
162+
this->is_init_ = true;
163+
this->CheckKernelSize();
164+
auto& cache = AutoTuneCache::Instance().GetMatmul();
165+
if (cache.Find(key)) {
166+
auto best_idx = cache.Get(key);
167+
this->kernels_[best_idx].Run(args...);
168+
} else {
169+
bool use_autotune = AutoTuneStatus::Instance().UseAutoTune();
170+
if (use_autotune) {
171+
auto best_idx = this->PickBestKernel(ctx, args...);
172+
cache.Set(key, best_idx);
173+
} else {
174+
this->kernels_[0].Run(args...);
175+
}
176+
}
177+
}
178+
};
179+
180+
// Define the auto_tuner initial object.
145181
#define DEFINE_AUTOTUNER_COMMON_OBJ(name) \
146182
template <typename T, typename ReturnType, typename... Args> \
147183
class name##AutoTuner \
@@ -161,18 +197,20 @@ class AutoTuneBase {
161197
} \
162198
};
163199

164-
// To init auto_tuner inital function.
200+
// Define the auto_tuner initial function.
165201
#define DEFINE_AUTOTUNER_FN(name) \
166202
template <typename T, typename ReturnType, typename... Args> \
167203
static name##AutoTuner<T, ReturnType, Args...>* Make##name##Tuner( \
168204
ReturnType (*func)(Args...)) { \
169205
return name##AutoTuner<T, ReturnType, Args...>::Instance(func); \
170206
}
171207

172-
#define DEFINE_AUTOTUNER(name) \
173-
DEFINE_AUTOTUNER_COMMON_OBJ(name) DEFINE_AUTOTUNER_FN(name)
208+
#define DEFINE_AUTOTUNER(name) \
209+
DEFINE_AUTOTUNER_COMMON_OBJ(name) \
210+
DEFINE_AUTOTUNER_FN(name)
174211

175212
DEFINE_AUTOTUNER(Transpose)
213+
DEFINE_AUTOTUNER_FN(Matmul)
176214

177215
#undef DEFINE_AUTOTUNER_COMMON_OBJECT
178216
#undef DEFINE_AUTOTUNER_FN

paddle/phi/kernels/autotune/cache.h

+16-10
Original file line numberDiff line numberDiff line change
@@ -44,28 +44,33 @@ enum class AlgorithmType {
4444
kConvBackwardData = 2,
4545
kConvBackwardFilter = 3,
4646
kTranspose = 4,
47-
#ifdef PADDLE_WITH_CUDNN_FRONTEND
48-
kConvForwardV8 = 5,
49-
kConvBackwardDataV8 = 6,
50-
kConvBackwardFilterV8 = 7,
51-
kAlgorithmCount = 8
47+
kMatmul = 5,
48+
#if !defined(PADDLE_WITH_CUDNN_FRONTEND)
49+
kAlgorithmCount = 6
5250
#else
53-
kAlgorithmCount = 5
51+
kConvForwardV8 = 6,
52+
kConvBackwardDataV8 = 7,
53+
kConvBackwardFilterV8 = 8,
54+
kAlgorithmCount = 9
5455
#endif
5556
};
5657

5758
// AlgorithmsConfigKey -> AlgorithmsID
58-
// (todo. hong) use cudnnConvolutionFwdAlgo_t
59-
using AlgorithmsCacheMap = AlgorithmsCache<size_t, int64_t>;
6059
// AlgorithmType -> AlgorithmsCache
60+
using AlgorithmsCacheMap = AlgorithmsCache<size_t, int64_t>;
6161
using AlgorithmsTypeMap = std::unordered_map<int64_t, AlgorithmsCacheMap>;
62+
63+
// (todo. hong) use cudnnConvolutionFwdAlgo_t
6264
using ConvAlgorithmsCacheMap = ConvAlgorithmsCache<ConvAutoTuneResult>;
6365
using ConvAlgorithmsTypeMap =
6466
std::unordered_map<int64_t, ConvAlgorithmsCacheMap>;
67+
68+
using MatmulAlgorithmsCacheMap = MatmulAlgorithmsCache<size_t, int64_t>;
6569
#ifdef PADDLE_WITH_CUDNN_FRONTEND
6670
using CudnnV8AlgorithmsTypeMap =
6771
std::unordered_map<int64_t, CudnnFrontendPlanCache>;
6872
#endif
73+
6974
class AutoTuneCache {
7075
public:
7176
static AutoTuneCache& Instance() {
@@ -77,6 +82,8 @@ class AutoTuneCache {
7782
return auto_tune_map_[static_cast<int64_t>(algo_type)];
7883
}
7984

85+
MatmulAlgorithmsCacheMap& GetMatmul() { return matmul_auto_tune_map_; }
86+
8087
ConvAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
8188
return conv_auto_tune_map_[static_cast<int64_t>(algo_type)];
8289
}
@@ -87,8 +94,6 @@ class AutoTuneCache {
8794
}
8895
#endif
8996

90-
AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
91-
9297
void Clean() {
9398
for (auto& v : auto_tune_map_) {
9499
v.second.Clean();
@@ -162,6 +167,7 @@ class AutoTuneCache {
162167

163168
AlgorithmsTypeMap auto_tune_map_;
164169
ConvAlgorithmsTypeMap conv_auto_tune_map_;
170+
MatmulAlgorithmsCacheMap matmul_auto_tune_map_;
165171
#ifdef PADDLE_WITH_CUDNN_FRONTEND
166172
CudnnV8AlgorithmsTypeMap cudnn_v8_auto_tune_map_;
167173
#endif

paddle/phi/kernels/autotune/cache_base.h

+54
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,31 @@ size_t GenKey(Args&&... args) {
6060
return seed;
6161
}
6262

63+
// Fixed-size POD payload (8 x uint64_t = 64 bytes) stored per matmul
// sub-key in the autotune cache.
// NOTE(review): presumably holds a serialized algorithm descriptor for the
// cuBLASLt matmul path added by this commit — confirm against the kernel.
struct MatmulHashValueType {
  uint64_t data[8];
};
66+
67+
struct MatmulCacheKey {
68+
public:
69+
MatmulCacheKey() {}
70+
MatmulCacheKey(const std::vector<int64_t>& x_dims,
71+
const std::vector<int64_t>& y_dims,
72+
const bool trans_x,
73+
const bool trans_y,
74+
phi::DataType dtype) {
75+
key = GenKey(x_dims,
76+
y_dims,
77+
static_cast<int64_t>(trans_x),
78+
static_cast<int64_t>(trans_y),
79+
static_cast<int64_t>(dtype));
80+
}
81+
size_t GetKey() const { return key; }
82+
size_t GetSubKey(int64_t idx) const { return GenKey(key, idx); }
83+
84+
private:
85+
size_t key;
86+
};
87+
6388
struct ConvCacheKey {
6489
ConvCacheKey() {}
6590
ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
@@ -213,5 +238,34 @@ class ConvAlgorithmsCache : public AlgorithmsCache<ConvCacheKey,
213238
}
214239
};
215240

241+
template <typename KeyT, typename AlgorithmT>
242+
class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> {
243+
public:
244+
MatmulAlgorithmsCache() : AlgorithmsCache<KeyT, AlgorithmT>() {}
245+
246+
bool FindSubKey(const KeyT& sub_key) {
247+
std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
248+
bool ret = (sub_hash_.find(sub_key) != sub_hash_.end()) ? true : false;
249+
return ret;
250+
}
251+
252+
void SetSubKey(const KeyT& sub_key, const MatmulHashValueType* algo) {
253+
std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
254+
sub_hash_[sub_key] = *algo;
255+
}
256+
257+
MatmulHashValueType* GetSubKey(const KeyT& sub_key) {
258+
std::lock_guard<std::mutex> lock(*(this->cache_mutex_));
259+
PADDLE_ENFORCE_NE(
260+
sub_hash_.find(sub_key),
261+
sub_hash_.end(),
262+
phi::errors::PreconditionNotMet("The key does not exist."));
263+
return &(sub_hash_[sub_key]);
264+
}
265+
266+
private:
267+
std::unordered_map<KeyT, MatmulHashValueType> sub_hash_;
268+
};
269+
216270
} // namespace autotune
217271
} // namespace phi

0 commit comments

Comments
 (0)