-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Matmul performance optimization with cuBlasLt #46431
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 62 commits
7f42952
1dba1a6
96cf58a
07677e3
ee801c3
67bf57c
64ee6d7
de873b4
3aa505a
c6c5ca2
4187e28
14180cc
2c6eaa5
1eaf75f
5f8c72b
444b1c4
fbb8361
19de67a
427e98c
cbf1f3d
c6dbe30
2a9ef0a
ba99367
d326b58
2ccb0ea
2a14bdb
ff38003
3c7e544
66475ea
e3fd59b
9c2b658
218990e
40d66f9
f49b23d
8e8dda6
75e83bb
192a1a8
e636886
5783696
11bf150
8eb3aa8
9405067
c780a94
307c89e
3dae0f9
e0c40bc
2e8a684
65a77d9
d876e7e
faaa937
5285192
f02d2e9
adec3bd
4372c0d
6791ff5
7f5b526
80fafc5
8fe0afe
fbda72c
6ec9106
ad58d06
c4a540d
fea6614
335c134
cb7c608
9cdfa2c
e2ed925
c1a7448
eef4555
9044737
16864be
cc539d7
cf85133
c35bdea
e863cbe
febeb01
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,14 +67,10 @@ class AutoTuneBase { | |
const AlgorithmType& algo, | ||
const size_t key, | ||
Args&&... args) { | ||
PADDLE_ENFORCE_GT( | ||
kernels_.size(), | ||
0, | ||
phi::errors::InvalidArgument( | ||
"kernel num must be greater than 0, now is %d", kernels_.size())); | ||
is_init_ = true; | ||
|
||
CheckKernelSize(); | ||
auto& cache = AutoTuneCache::Instance().Get(algo); | ||
|
||
if (cache.Find(key)) { | ||
auto best_idx = cache.Get(key); | ||
kernels_[best_idx].Run(args...); | ||
|
@@ -91,19 +87,22 @@ class AutoTuneBase { | |
} | ||
} | ||
|
||
private: | ||
protected: | ||
bool is_init_{false}; | ||
std::vector<KernelType> kernels_; | ||
mutable std::mutex mutex_; | ||
|
||
template <typename Context, typename... Args> | ||
size_t PickBestKernel(const Context& ctx, Args&&... args) { | ||
std::lock_guard<std::mutex> lock(mutex_); | ||
// Guard shared by the tuner entry points: checks via PADDLE_ENFORCE_GT that | ||
// kernels_ holds at least one kernel, reporting an InvalidArgument error | ||
// that includes the current kernel count when it does not. | ||
void CheckKernelSize() { | ||
PADDLE_ENFORCE_GT( | ||
kernels_.size(), | ||
0, | ||
phi::errors::InvalidArgument( | ||
"kernel num must be greater than 0, now is %d", kernels_.size())); | ||
} | ||
|
||
template <typename Context, typename... Args> | ||
size_t PickBestKernel(const Context& ctx, Args&&... args) { | ||
std::lock_guard<std::mutex> lock(mutex_); | ||
size_t best_idx = 0; | ||
float min_time = std::numeric_limits<float>::max(); | ||
|
||
|
@@ -143,36 +142,71 @@ class AutoTuneBase { | |
} | ||
}; | ||
|
||
// Wraps the function pointer `func` with MakeCallback<T> and returns, by | ||
// value, an AutoTuneBase constructed from that callback. | ||
template <typename T, typename ReturnType, typename... Args> | ||
static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> MakeAutoTuner( | ||
ReturnType (*func)(Args...)) { | ||
auto obj = MakeCallback<T>(func); | ||
return AutoTuneBase<T, decltype(obj)>(obj); | ||
} | ||
|
||
// Auto-tuner for transpose kernels, handed out through Instance() as a | ||
// function-local static object that is created once with std::call_once. | ||
// NOTE(review): the rows below appear to list both the removed and the added | ||
// side of the diff (two Instance signatures, two unique_ptr declarations, | ||
// two reset calls) — confirm which rows survive in the final file. | ||
template <typename T, typename ReturnType, typename... Args> | ||
class TransposeAutoTuner | ||
: public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> { | ||
public: | ||
static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>* Instance( | ||
static TransposeAutoTuner<T, ReturnType, Args...>* Instance( | ||
ReturnType (*func)(Args...)) { | ||
static std::once_flag transpose_init_flag_; | ||
static std::unique_ptr< | ||
AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>> | ||
static std::unique_ptr<TransposeAutoTuner<T, ReturnType, Args...>> | ||
instance_; | ||
// The lambda runs only on the first call, so only that call's `func` is | ||
// registered; later calls return the existing instance unchanged. | ||
std::call_once(transpose_init_flag_, [&] { | ||
auto obj = MakeCallback<T>(func); | ||
instance_.reset(new AutoTuneBase<T, decltype(obj)>(obj)); | ||
instance_.reset(new TransposeAutoTuner<T, ReturnType, Args...>); | ||
instance_->AddCallBack(func); | ||
}); | ||
return instance_.get(); | ||
} | ||
}; | ||
|
||
// Auto-tuner for matmul kernels. Instance() hands out a function-local | ||
// static object created once with std::call_once; RunMatmul() picks which | ||
// registered kernel to run for a given cache key. | ||
template <typename T, typename ReturnType, typename... Args> | ||
class MatmulAutoTuner | ||
: public AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>> { | ||
public: | ||
// Returns the shared tuner. Only the first call registers `func` (through | ||
// AddCallBack); later calls return the existing instance unchanged. | ||
static MatmulAutoTuner<T, ReturnType, Args...>* Instance( | ||
ReturnType (*func)(Args...)) { | ||
static std::once_flag matmul_init_flag_; | ||
static std::unique_ptr<MatmulAutoTuner<T, ReturnType, Args...>> instance_; | ||
std::call_once(matmul_init_flag_, [&] { | ||
// NOTE(review): `obj` is not referenced again in this lambda — it looks | ||
// unused now that AddCallBack(func) is called directly; confirm. | ||
auto obj = MakeCallback<T>(func); | ||
instance_.reset(new MatmulAutoTuner<T, ReturnType, Args...>); | ||
instance_->AddCallBack(func); | ||
}); | ||
return instance_.get(); | ||
} | ||
|
||
// Runs a matmul kernel for `key`: | ||
//   - key found in the matmul cache: run the kernel at the cached index; | ||
//   - key missing and auto-tune enabled: call PickBestKernel and store the | ||
//     index it returns in the cache; | ||
//   - key missing and auto-tune disabled: run kernels_[0]. | ||
// NOTE(review): PickBestKernel's body is not visible here; presumably it | ||
// executes the candidate kernels while timing them — confirm. | ||
template <typename Context> | ||
void RunMatmul(const Context& ctx, const size_t key, Args... args) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 不如在基类里面封装一个 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 已根据建议修改,仅仅在外部的 |
||
this->is_init_ = true; | ||
this->CheckKernelSize(); | ||
auto& cache = AutoTuneCache::Instance().GetMatmul(); | ||
if (cache.Find(key)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 似乎,没有开启AutoTune功能的时候,这里会多1次查cache的开销。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这块比较难避免,AutoTune关闭的状态存在于调优功能开启的之前,和之后,这里的操作逻辑与 |
||
auto best_idx = cache.Get(key); | ||
this->kernels_[best_idx].Run(args...); | ||
} else { | ||
bool use_autotune = AutoTuneStatus::Instance().UseAutoTune(); | ||
if (use_autotune) { | ||
auto best_idx = this->PickBestKernel(ctx, args...); | ||
cache.Set(key, best_idx); | ||
} else { | ||
this->kernels_[0].Run(args...); | ||
} | ||
} | ||
} | ||
}; | ||
|
||
// Convenience wrapper: forwards `func` to TransposeAutoTuner::Instance and | ||
// returns the pointer it yields. | ||
// NOTE(review): two signature variants appear below (old and new diff rows); | ||
// confirm which one is kept. | ||
template <typename T, typename ReturnType, typename... Args> | ||
static AutoTuneBase<T, KernelCallback<T, ReturnType, Args...>>* | ||
MakeTransposeTuner(ReturnType (*func)(Args...)) { | ||
static TransposeAutoTuner<T, ReturnType, Args...>* MakeTransposeTuner( | ||
ReturnType (*func)(Args...)) { | ||
return TransposeAutoTuner<T, ReturnType, Args...>::Instance(func); | ||
} | ||
|
||
// Convenience wrapper: forwards `func` to MatmulAutoTuner::Instance and | ||
// returns the pointer it yields (template arguments are deduced from the | ||
// function pointer, apart from T). | ||
template <typename T, typename ReturnType, typename... Args> | ||
static MatmulAutoTuner<T, ReturnType, Args...>* MakeMatmulTuner( | ||
ReturnType (*func)(Args...)) { | ||
return MatmulAutoTuner<T, ReturnType, Args...>::Instance(func); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 定义个宏吧, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 根据建议已修改 |
||
|
||
} // namespace autotune | ||
} // namespace phi |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,6 +60,45 @@ size_t GetKey(Args&&... args) { | |
return seed; | ||
} | ||
|
||
// Fixed-size payload of eight 64-bit words. | ||
// NOTE(review): the layout/meaning of the eight words is not shown here; | ||
// presumably an opaque serialized matmul algorithm descriptor — confirm. | ||
struct MatmulHashValueType { | ||
uint64_t data[8]; | ||
}; | ||
|
||
// Cache key for one matmul configuration. The main constructor stores the | ||
// x/y dims, both transpose flags and the dtype, then hashes them with GetKey | ||
// into key_. | ||
struct MatmulCacheKey { | ||
public: | ||
// NOTE(review): the default constructor leaves key_, trans_x_, trans_y_, | ||
// dtype_, size_ and best_algo_ uninitialized. | ||
MatmulCacheKey() {} | ||
MatmulCacheKey(const std::vector<int64_t>& x_dims, | ||
const std::vector<int64_t>& y_dims, | ||
const bool trans_x, | ||
const bool trans_y, | ||
phi::DataType dtype) | ||
: x_dims_(x_dims), | ||
y_dims_(y_dims), | ||
trans_x_(trans_x), | ||
trans_y_(trans_y), | ||
dtype_(dtype) { | ||
// The flags and dtype are widened to int64_t before being hashed. | ||
key_ = GetKey(x_dims_, | ||
y_dims_, | ||
static_cast<int64_t>(trans_x_), | ||
static_cast<int64_t>(trans_y_), | ||
static_cast<int64_t>(dtype_)); | ||
} | ||
|
||
// Returns the hash computed in the constructor. | ||
const size_t QueryKey() const { return key_; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
// Returns the number of entries in x_dims_ (not the size_ member). | ||
const size_t GetSize() { return x_dims_.size(); } | ||
// Derives a secondary key by hashing key_ together with `idx`. | ||
const size_t GetSubKey(int64_t idx) { return GetKey(key_, idx); } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 输入 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes,就是GenSubKey的含义 |
||
|
||
private: | ||
// NOTE(review): size_ is never written or read in the visible code. | ||
int size_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. struct成员不用加 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 根据建议修改. |
||
size_t key_; | ||
std::vector<int64_t> x_dims_; | ||
std::vector<int64_t> y_dims_; | ||
bool trans_x_; | ||
bool trans_y_; | ||
// NOTE(review): best_algo_ is never written or read in the visible code. | ||
int best_algo_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个成员变量也访问不到。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 已删除,并额外删除 |
||
phi::DataType dtype_; | ||
}; | ||
|
||
struct ConvCacheKey { | ||
ConvCacheKey() {} | ||
ConvCacheKey(const std::vector<int64_t>& arg_x_dims, | ||
|
@@ -213,5 +252,34 @@ class ConvAlgorithmsCache : public AlgorithmsCache<ConvCacheKey, | |
} | ||
}; | ||
|
||
// Extends AlgorithmsCache with a second table, sub_hash_, that maps a | ||
// sub-key to a MatmulHashValueType payload. Every accessor takes the | ||
// inherited cache_mutex_ for the duration of the call. | ||
template <typename KeyT, typename AlgorithmT> | ||
class MatmulAlgorithmsCache : public AlgorithmsCache<KeyT, AlgorithmT> { | ||
public: | ||
MatmulAlgorithmsCache() : AlgorithmsCache<KeyT, AlgorithmT>() {} | ||
|
||
// Returns true when `sub_key` is present in sub_hash_. | ||
bool FindSubKey(const KeyT& sub_key) { | ||
std::lock_guard<std::mutex> lock(*(this->cache_mutex_)); | ||
bool ret = (sub_hash_.find(sub_key) != sub_hash_.end()) ? true : false; | ||
return ret; | ||
} | ||
|
||
// Stores a copy of `*algo` under `sub_key`, overwriting any existing entry. | ||
// `algo` is dereferenced unconditionally, so it must not be null. | ||
void SetSubKey(const KeyT& sub_key, const MatmulHashValueType* algo) { | ||
std::lock_guard<std::mutex> lock(*(this->cache_mutex_)); | ||
sub_hash_[sub_key] = *algo; | ||
} | ||
|
||
// Returns a pointer to the payload stored for `sub_key`; enforces with a | ||
// PreconditionNotMet error that the key exists. | ||
// NOTE(review): the lock is released when this function returns, so the | ||
// caller reads through the pointer without holding cache_mutex_ — a | ||
// concurrent SetSubKey on the same key would not be synchronized with it. | ||
MatmulHashValueType* GetSubKey(const KeyT& sub_key) { | ||
std::lock_guard<std::mutex> lock(*(this->cache_mutex_)); | ||
PADDLE_ENFORCE_NE( | ||
sub_hash_.find(sub_key), | ||
sub_hash_.end(), | ||
phi::errors::PreconditionNotMet("The key does not exist.")); | ||
return &(sub_hash_[sub_key]); | ||
} | ||
|
||
private: | ||
std::unordered_map<KeyT, MatmulHashValueType> sub_hash_; | ||
}; | ||
|
||
} // namespace autotune | ||
} // namespace phi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
作为函数内部的局部变量,变量名不要加 `_` 后缀。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
已修改