Skip to content

Commit 1cd7e68

Browse files
authored
optimize conv algo cache (#41891)
* optimize conv algo speed * code polish * remove useless code * fix compile error * fix cpu compile error * not use cudnn algo t * add search cache max number * polish code * fix cache test bug * add groups data format to conv args * fix cache test bug * fix cudnn_deterministic bug * fix test switch auto tune bug * fix test switch autotune bug; * fix conv cache bug * fix cache test error * fix cache test bug * fix windows mac compile error * fix workspace search error * update cudnn cache * fix cache test bug; test=develop * fix autotune switch test error * polish code * polish code
1 parent f2f3f6e commit 1cd7e68

File tree

12 files changed

+347
-111
lines changed

12 files changed

+347
-111
lines changed

paddle/fluid/operators/conv_base_helper.h

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,13 @@ struct SearchAlgorithm {};
4444
template <typename AlgoT>
4545
struct SearchResult {
4646
SearchResult() {}
47+
explicit SearchResult(const phi::autotune::DnnNode& node)
48+
: algo(static_cast<AlgoT>(node.algo)),
49+
workspace_size(node.workspace_size) {}
50+
4751
explicit SearchResult(AlgoT a) : algo(a) {}
52+
explicit SearchResult(AlgoT a, float t, size_t size)
53+
: algo(a), time(t), workspace_size(size) {}
4854

4955
AlgoT algo = static_cast<AlgoT>(0);
5056
float time = -1.f;
@@ -76,28 +82,50 @@ struct ConvArgsBase {
7682
// dilations
7783
std::vector<int> d;
7884

85+
// groups
86+
int group;
87+
88+
// data format
89+
DataLayout data_layout;
90+
7991
ConvArgsBase(const framework::Tensor* x,
8092
const framework::Tensor* w,
8193
const framework::Tensor* o,
8294
const std::vector<int> s,
8395
const std::vector<int> p,
8496
const std::vector<int> d,
85-
DataT dtype)
86-
: x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
97+
DataT dtype,
98+
int g,
99+
DataLayout layout)
100+
: x(x),
101+
w(w),
102+
o(o),
103+
s(s),
104+
p(p),
105+
d(d),
106+
cudnn_dtype(dtype),
107+
group(g),
108+
data_layout(layout) {}
87109

88110
template <typename T>
89-
size_t GetCacheKey() const {
111+
phi::autotune::ConvCacheKey Convert2ConvCacheKey() const {
90112
auto x_shape = phi::vectorize(x->dims());
91113
auto w_shape = phi::vectorize(w->dims());
92114
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
93-
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
94-
return phi::autotune::ConvKey(
115+
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d
116+
<< ",data= " << paddle::experimental::CppTypeToDataType<T>::Type()
117+
<< ", group=" << group
118+
<< ", data layout=" << static_cast<int64_t>(data_layout);
119+
120+
return phi::autotune::ConvCacheKey(
95121
x_shape,
96122
w_shape,
97123
p,
98124
s,
99125
d,
100-
paddle::experimental::CppTypeToDataType<T>::Type());
126+
paddle::experimental::CppTypeToDataType<T>::Type(),
127+
group,
128+
static_cast<int64_t>(data_layout));
101129
}
102130
};
103131

paddle/fluid/operators/conv_cudnn_helper.h

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -191,32 +191,36 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
191191
SetConvMathType(ctx, dtype, args.cdesc);
192192

193193
if (deterministic) {
194-
result = FindAlgoDeterministic();
194+
result = FindAlgoDeterministic(args);
195195
} else {
196196
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
197197
// 2. Once turning on auto-tune, run heuristic search(default) before
198198
// auto-tune process, run exhaustive_search during mentioned process.
199199
// 3. After auto-tune process, run cached algorithm if cached, run
200200
// default mode for the rest.
201-
size_t key = args.GetCacheKey<T>();
201+
auto key = args.Convert2ConvCacheKey<T>();
202202
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
203203
if (cache.Find(key)) {
204-
result.algo = static_cast<AlgoT>(cache.Get(key));
204+
auto t = cache.Get(key);
205+
result.algo = static_cast<AlgoT>(t.algo);
206+
result.workspace_size = t.workspace_size;
205207
} else {
206208
bool use_autotune =
207209
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
208210
if (exhaustive_search || use_autotune) {
209211
result = FindAlgoExhaustiveSearch<T>(args, ctx);
210-
cache.Set(key, static_cast<int64_t>(result.algo));
211212
} else {
212213
result = FindAlgoHeuristic(args, ctx);
213214
}
215+
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
216+
result.workspace_size);
217+
cache.Set(key, node);
214218
}
215219
}
216220
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
217221
<< ", deterministic=" << deterministic
218-
<< ", choose algo=" << result.algo << ", workspace="
219-
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
222+
<< ", choose algo=" << result.algo
223+
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
220224
return result;
221225
}
222226

@@ -236,8 +240,9 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
236240
}
237241

238242
private:
239-
static SearchResult<AlgoT> FindAlgoDeterministic() {
240-
return SearchResult<AlgoT>(static_cast<AlgoT>(1));
243+
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
244+
auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
245+
return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
241246
}
242247

243248
// Heuristic search mode, calling the cudnnGetXxxAlgorithm.
@@ -298,6 +303,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
298303
workspace_size_limit,
299304
&(result.algo)));
300305
#endif
306+
result.workspace_size = GetWorkspaceSize(args, result.algo);
301307
return result;
302308
}
303309

@@ -343,6 +349,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
343349
ChooseAlgoByWorkspace<PerfT, AlgoT>(
344350
perf_results, workspace_size_limit, &result);
345351

352+
result.workspace_size = GetWorkspaceSize(args, result.algo);
346353
return result;
347354
}
348355

@@ -394,33 +401,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
394401
SetConvMathType(ctx, dtype, args.cdesc);
395402

396403
if (deterministic) {
397-
result = FindAlgoDeterministic();
404+
result = FindAlgoDeterministic(args);
398405
} else {
399406
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
400407
// 2. Once turning on auto-tune, run heuristic search(default) before
401408
// auto-tune process, run exhaustive_search during mentioned process.
402409
// 3. After auto-tune process, run cached algorithm if cached, run
403410
// default mode for the rest.
404-
size_t key = args.GetCacheKey<T>();
411+
auto key = args.Convert2ConvCacheKey<T>();
405412
auto& cache =
406413
phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
407414
if (cache.Find(key)) {
408-
result.algo = static_cast<AlgoT>(cache.Get(key));
415+
auto t = cache.Get(key);
416+
result.algo = static_cast<AlgoT>(t.algo);
417+
result.workspace_size = t.workspace_size;
409418
} else {
410419
bool use_autotune =
411420
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
412421
if (exhaustive_search || use_autotune) {
413422
result = FindAlgoExhaustiveSearch<T>(args, ctx);
414-
cache.Set(key, static_cast<int64_t>(result.algo));
415423
} else {
416424
result = FindAlgoHeuristic(args, ctx);
417425
}
426+
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
427+
result.workspace_size);
428+
cache.Set(key, node);
418429
}
419430
}
420431
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
421432
<< ", deterministic=" << deterministic
422-
<< ", choose algo=" << result.algo << ", workspace="
423-
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
433+
<< ", choose algo=" << result.algo
434+
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
424435
return result;
425436
}
426437

@@ -440,8 +451,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
440451
}
441452

442453
private:
443-
static SearchResult<AlgoT> FindAlgoDeterministic() {
444-
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
454+
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
455+
auto workspace_size =
456+
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
457+
return SearchResult<AlgoT>(
458+
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
445459
}
446460

447461
static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
@@ -513,7 +527,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
513527
workspace_size_limit,
514528
&(result.algo)));
515529
#endif
516-
530+
result.workspace_size = GetWorkspaceSize(args, result.algo);
517531
return result;
518532
}
519533

@@ -559,6 +573,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
559573
ChooseAlgoByWorkspace<PerfT, AlgoT>(
560574
perf_results, workspace_size_limit, &result);
561575

576+
result.workspace_size = GetWorkspaceSize(args, result.algo);
562577
return result;
563578
}
564579

@@ -609,33 +624,37 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
609624
SetConvMathType(ctx, dtype, args.cdesc);
610625

611626
if (deterministic) {
612-
result = FindAlgoDeterministic();
627+
result = FindAlgoDeterministic(args);
613628
} else {
614629
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
615630
// 2. Once turning on auto-tune, run heuristic search(default) before
616631
// auto-tune process, run exhaustive_search during mentioned process.
617632
// 3. After auto-tune process, run cached algorithm if cached, run
618633
// default mode for the rest.
619-
size_t key = args.GetCacheKey<T>();
634+
auto key = args.Convert2ConvCacheKey<T>();
620635
auto& cache =
621636
phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
622637
if (cache.Find(key)) {
623-
result.algo = static_cast<AlgoT>(cache.Get(key));
638+
auto t = cache.Get(key);
639+
result.algo = static_cast<AlgoT>(t.algo);
640+
result.workspace_size = t.workspace_size;
624641
} else {
625642
bool use_autotune =
626643
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
627644
if (exhaustive_search || use_autotune) {
628645
result = FindAlgoExhaustiveSearch<T>(args, ctx);
629-
cache.Set(key, static_cast<int64_t>(result.algo));
630646
} else {
631647
result = FindAlgoHeuristic(args, ctx);
632648
}
649+
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
650+
result.workspace_size);
651+
cache.Set(key, node);
633652
}
634653
}
635654
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
636655
<< ", deterministic=" << deterministic
637-
<< ", choose algo=" << result.algo << ", workspace="
638-
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
656+
<< ", choose algo=" << result.algo
657+
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
639658
return result;
640659
}
641660

@@ -656,8 +675,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
656675
}
657676

658677
private:
659-
static SearchResult<AlgoT> FindAlgoDeterministic() {
660-
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
678+
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
679+
auto workspace_size =
680+
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
681+
return SearchResult<AlgoT>(
682+
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
661683
}
662684

663685
static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
@@ -718,6 +740,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
718740
&(result.algo)));
719741
#endif
720742

743+
result.workspace_size = GetWorkspaceSize(args, result.algo);
721744
return result;
722745
}
723746

@@ -786,6 +809,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
786809
ChooseAlgo(perf_results, workspace_size_limit, &result);
787810
}
788811

812+
result.workspace_size = GetWorkspaceSize(args, result.algo);
789813
return result;
790814
}
791815

paddle/fluid/platform/flags.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,17 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
984984
*/
985985
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune.");
986986

987+
/**
988+
* Conv Search cache max number related FLAG
989+
* Name: FLAGS_search_cache_max_number
990+
* Since Version: 2.3.0
991+
* Value Range: int32, default=1000000
992+
* Example:
993+
*/
994+
PADDLE_DEFINE_EXPORTED_int32(search_cache_max_number,
995+
1000000,
996+
"search_cache_max_number.");
997+
987998
/**
988999
* Performance related FLAG
9891000
* Name: einsum_opt

paddle/phi/kernels/autotune/cache.cc

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,6 @@
2121
namespace phi {
2222
namespace autotune {
2323

24-
// Define the cache key of operator
25-
size_t ConvKey(const std::vector<int64_t>& x_dims,
26-
const std::vector<int64_t>& w_dims,
27-
const std::vector<int>& strides,
28-
const std::vector<int>& paddings,
29-
const std::vector<int>& dilations,
30-
phi::DataType dtype) {
31-
return GetKey(x_dims,
32-
w_dims,
33-
strides,
34-
paddings,
35-
dilations,
36-
static_cast<int64_t>(dtype));
37-
}
38-
3924
size_t TransposeKey(const std::vector<int64_t>& x_dims,
4025
const std::vector<int32_t>& perm,
4126
phi::DataType dtype) {
@@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() {
7358
cache_hits += v.second.CacheHits();
7459
cache_misses += v.second.CacheMisses();
7560
}
61+
62+
for (auto& v : cudnn_auto_tune_map_) {
63+
VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width)
64+
<< AlgorithmTypeString(v.first)
65+
<< " Cache Size: " << v.second.Size()
66+
<< " Hits: " << v.second.CacheHits()
67+
<< " Misses: " << v.second.CacheMisses()
68+
<< " Hit Rate: " << v.second.CacheHitRate();
69+
size += v.second.Size();
70+
cache_hits += v.second.CacheHits();
71+
cache_misses += v.second.CacheMisses();
72+
}
73+
7674
total_size_ = size;
7775
total_cache_hits_ = cache_hits;
7876
total_cache_misses_ = cache_misses;

0 commit comments

Comments
 (0)