Skip to content

Commit 253c7f4

Browse files
authored
[Cherry-pick][Release/1.5][Bug fix]Fix dygraph mem leak release/1.5 (PaddlePaddle#18133)
* fix dygraph mem leak, test=release/1.5 * polish msg, test=release/1.5
1 parent c50fb58 commit 253c7f4

File tree

11 files changed

+141
-7
lines changed

11 files changed

+141
-7
lines changed

paddle/fluid/imperative/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
cc_library(imperative_flag SRCS flags.cc DEPS gflags)
2+
13
if(WITH_PYTHON)
2-
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler)
4+
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag)
35
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler)
46
cc_library(engine SRCS engine.cc)
57
cc_library(imperative_profiler SRCS profiler.cc)

paddle/fluid/imperative/flags.cc

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/imperative/flags.h"
#include "gflags/gflags.h"

// Runtime-configurable debug level for dygraph (imperative) mode.
// Zero (the default) disables all debug bookkeeping.
DEFINE_uint64(dygraph_debug, 0,
              "Debug level of dygraph. This flag is not "
              "open to users");

namespace paddle {
namespace imperative {

// Returns the raw debug level as set through the dygraph_debug gflag.
uint64_t GetDebugLevel() { return FLAGS_dygraph_debug; }

// True when any non-zero debug level has been requested.
bool IsDebugEnabled() { return GetDebugLevel() != 0; }

}  // namespace imperative
}  // namespace paddle

paddle/fluid/imperative/flags.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <cstdint>

namespace paddle {
namespace imperative {

// Whether dygraph debug bookkeeping is active (FLAGS_dygraph_debug != 0).
extern bool IsDebugEnabled();

// The value of the dygraph_debug gflag; semantics of individual levels are
// internal and not exposed to users.
extern uint64_t GetDebugLevel();

}  // namespace imperative
}  // namespace paddle

paddle/fluid/imperative/layer.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,27 @@
3434
namespace paddle {
3535
namespace imperative {
3636

37+
void ThreadSafeNameSet::Insert(const std::string& name) {
38+
std::lock_guard<std::mutex> guard(mtx_);
39+
set_.insert(name);
40+
}
41+
42+
void ThreadSafeNameSet::Remove(const std::string& name) {
43+
std::lock_guard<std::mutex> guard(mtx_);
44+
auto iter = set_.find(name);
45+
PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name);
46+
set_.erase(iter);
47+
}
48+
49+
std::vector<std::string> ThreadSafeNameSet::Names() const {
50+
std::lock_guard<std::mutex> guard(mtx_);
51+
return std::vector<std::string>(set_.begin(), set_.end());
52+
}
53+
54+
ThreadSafeNameSet VarBase::name_set_;
55+
56+
std::vector<std::string> VarBase::AliveVarNames() { return name_set_.Names(); }
57+
3758
using framework::Variable;
3859

3960
namespace detail {

paddle/fluid/imperative/layer.h

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,11 @@
1414

1515
#pragma once
1616

17-
#include <map> // NOLINT
18-
#include <memory> // NOLINT
17+
#include <cstdint>
18+
#include <map> // NOLINT
19+
#include <memory> // NOLINT
20+
#include <mutex> // NOLINT
21+
#include <set>
1922
#include <string> // NOLINT
2023
#include <unordered_map> // NOLINT
2124
#include <utility>
@@ -34,6 +37,7 @@
3437
#include "paddle/fluid/operators/math/math_function.h"
3538
#include "paddle/fluid/imperative/backward_strategy.h"
3639
#include "paddle/fluid/imperative/type_defs.h"
40+
#include "paddle/fluid/imperative/flags.h"
3741

3842
namespace paddle {
3943
namespace imperative {
@@ -108,13 +112,28 @@ class PreparedOp {
108112

109113
class OpBase;
110114

115+
// A mutex-protected multiset of variable names. Used as a debug aid to
// track which VarBase instances are still alive when dygraph debug mode
// is enabled (FLAGS_dygraph_debug != 0).
class ThreadSafeNameSet {
 public:
  // Records one occurrence of `name`.
  void Insert(const std::string& name);

  // Removes one occurrence of `name`; it is an error if `name` is absent.
  void Remove(const std::string& name);

  // Returns a snapshot of all names currently in the set.
  std::vector<std::string> Names() const;

 private:
  std::multiset<std::string> set_;
  // mutable so the const Names() accessor can still take the lock.
  mutable std::mutex mtx_;
};
127+
111128
/* The wrapper for Variable which holds a Variable and a VarBase of its
112129
* gradient. This object should be managed totally by Python interpreter.
113130
*
114131
* Nearly all interface should be implemented in C++.
115132
*/
116133
class VarBase {
117134
public:
135+
static std::vector<std::string> AliveVarNames();
136+
118137
// Internal interface, create VarBase from exist variable
119138
VarBase(const std::string& name, std::unique_ptr<framework::Variable> var,
120139
VarBase* grad, bool stop_gradient)
@@ -180,13 +199,20 @@ class VarBase {
180199
}
181200
VLOG(8) << "create varbase: " << name_ << " type: " << dtype
182201
<< " place: " << place << "Stop gradient: " << stop_gradient_;
202+
203+
if (IsDebugEnabled()) {
204+
name_set_.Insert(name_);
205+
}
183206
}
184207

185208
public:
186209
virtual ~VarBase() {
187210
pre_op_ = nullptr;
188211
pre_op_out_idx_ = -1;
189212
VLOG(8) << "destruct varbase: " << name_;
213+
if (IsDebugEnabled()) {
214+
name_set_.Remove(name_);
215+
}
190216
}
191217

192218
inline void SetName(const std::string& name) { name_ = name; }
@@ -297,6 +323,9 @@ class VarBase {
297323
OpBase* pre_op_;
298324
std::string pre_op_out_name_;
299325
int pre_op_out_idx_;
326+
327+
// A private flag to check memory leak
328+
static ThreadSafeNameSet name_set_;
300329
};
301330

302331
/* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its

paddle/fluid/pybind/imperative.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,8 +194,13 @@ void BindImperative(pybind11::module *m_ptr) {
194194

195195
m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
196196

197+
m.def("_is_dygraph_debug_enabled",
198+
[]() { return imperative::IsDebugEnabled(); });
199+
m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); });
200+
197201
py::class_<imperative::VarBase, std::shared_ptr<imperative::VarBase>>(
198202
m, "VarBase", R"DOC()DOC")
203+
.def_static("_alive_vars", &imperative::VarBase::AliveVarNames)
199204
.def(
200205
py::init<const std::string &, paddle::framework::proto::VarType::Type,
201206
const std::vector<int64_t>, const paddle::platform::CPUPlace,

python/paddle/fluid/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def __bootstrap__():
142142
'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
143143
'enable_parallel_graph', 'fuse_parameter_groups_size',
144144
'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
145-
'tracer_profile_fname'
145+
'tracer_profile_fname', 'dygraph_debug'
146146
]
147147
if 'Darwin' not in sysstr:
148148
read_env_flags.append('use_pinned_memory')

python/paddle/fluid/core.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
from .core_avx import _set_eager_deletion_mode
5858
from .core_avx import _set_fuse_parameter_group_size
5959
from .core_avx import _set_fuse_parameter_memory_size
60+
from .core_avx import _is_dygraph_debug_enabled
61+
from .core_avx import _dygraph_debug_level
6062
except ImportError as error:
6163
sys.stderr.write(
6264
error.__class__.__name__ +
@@ -79,6 +81,8 @@
7981
from .core_noavx import _set_eager_deletion_mode
8082
from .core_noavx import _set_fuse_parameter_group_size
8183
from .core_noavx import _set_fuse_parameter_memory_size
84+
from .core_noavx import _is_dygraph_debug_enabled
85+
from .core_noavx import _dygraph_debug_level
8286
except ImportError as error:
8387
sys.exit("Error: Can not load core_noavx.* ." +
8488
error.__class__.__name__)

python/paddle/fluid/dygraph/base.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@
1414
from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
1515
import contextlib
1616
import numpy as np
17+
import os
1718

1819
from paddle.fluid import core
1920
from paddle.fluid import framework
2021
from .tracer import Tracer
22+
import logging
2123

2224
__all__ = [
2325
'enabled',
@@ -136,6 +138,21 @@ def guard(place=None):
136138
yield
137139

138140

141+
def _print_debug_msg():
    """Log dygraph memory-leak debug statistics.

    Emits the number of generated unique names, tracer-tracked variables,
    and alive C++ VarBase instances via the logging module. Requires the
    FLAGS_dygraph_debug flag to be set; otherwise only a warning that debug
    mode is disabled is logged.
    """
    if not core._is_dygraph_debug_enabled():
        # logging.warning is the non-deprecated spelling of logging.warn.
        logging.warning(
            'Debug mode is not enabled. Please set FLAGS_dygraph_debug=1 to enable debug'
        )
        return

    # Names handed out by the global unique-name generator so far.
    unique_name_size = len(framework.unique_name.generator.ids)
    # Variables currently held by the active dygraph tracer.
    tracer_var_size = len(framework._dygraph_tracer()._vars)
    # C++-side VarBase instances still alive (tracked by ThreadSafeNameSet).
    alive_cpp_var_size = len(core.VarBase._alive_vars())
    logging.warning(
        'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
        .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
154+
155+
139156
def to_variable(value, block=None, name=None):
140157
"""
141158
This function will create a variable from ndarray

python/paddle/fluid/dygraph/learning_rate_scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def create_lr_var(self, lr):
6060
shape=[1],
6161
value=float(lr),
6262
dtype=self.dtype,
63-
persistable=True)
63+
persistable=False)
6464
return lr
6565

6666
def step(self):

python/paddle/fluid/unique_name.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,15 @@ def generate(key):
7979

8080
# FIXME(zjl): The previous naming rule in static graph would
8181
# cause memory leak in dygraph mode. It is because the previous
82-
# nameing rule would use `conv_0.tmp` as the key, and in dygraph
82+
# naming rule would use `conv_0.tmp` as the key, and in dygraph
8383
# mode, `conv_i` increases as batch increases. Thus, keys would
8484
# increase in a way like `conv_0.tmp`, `conv_1.tmp`, ....
8585
# Not find a better way to fix this bug in dygraph mode. In TF,
8686
# variable name is meaningless in eager execution mode, and in
8787
# PyTorch, there is no variable name at all. Maybe we should
8888
# discard variable name in dygraph mode.
8989
#
90-
# Another concern is that save/load inference. Usually, user
90+
# Another concern is that save/load interfaces. Usually, user
9191
# would save model in static graph mode, and load it in dygraph
9292
# mode. Therefore, we keep the variable name of Parameter currently.
9393
#

0 commit comments

Comments
 (0)