Improve the initializer Interface for fc, sequence_conv and conv2d layers #5760

Merged · 3 commits · Nov 20, 2017

53 changes: 32 additions & 21 deletions python/paddle/v2/fluid/framework.py
@@ -15,6 +15,37 @@ def unique_name(prefix):
return "_".join([prefix, str(uid)])


def convert_np_dtype_to_dtype_(np_dtype):
Contributor:

Why make the type conversion a global function? I think a staticmethod is more appropriate here, because the type conversion function should not need to be called outside of Variable.

Contributor Author:

This was made a global function because it is needed outside of the Variable class. In layer_helper, we want to make sure that every parameter that has not been given an initializer gets a default one. This default depends on the dtype of the parameter: if the parameter has a floating-point dtype, XavierInitializer is used; otherwise (for int and bool types) the parameter is initialized with zeros.

We need this method outside of Variable because users can also pass numpy datatypes as dtypes. The default initializer is chosen in layer_helper, so we have to check whether the supplied datatype (which could be a numpy dtype or a core.DataType) is a floating-point type. Do you have any suggestion on how to accomplish this without making it global?
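For illustration, the rule described above can be sketched in isolation. This is a minimal standalone sketch that mirrors the convert_np_dtype_to_dtype_ / dtype_is_floating helpers and the default-initializer choice added in this PR, but it uses plain numpy so it runs without Paddle; the initializer names are returned as strings purely for the demonstration.

import numpy as np


def dtype_is_floating(np_dtype):
    # Accept anything np.dtype() understands: np.float32, 'float32', np.int64, ...
    dtype = np.dtype(np_dtype)
    return dtype in (np.float16, np.float32, np.float64)


def default_initializer_for(dtype):
    # Mirrors the rule in layer_helper: floating-point parameters default to
    # Xavier initialization, int/bool parameters default to all zeros.
    if dtype is None or dtype_is_floating(dtype):
        return "XavierInitializer"
    return "ConstantInitializer"  # zeros


print(default_initializer_for(np.float32))  # XavierInitializer
print(default_initializer_for("int64"))     # ConstantInitializer
print(default_initializer_for(np.bool_))    # ConstantInitializer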

Contributor:

I found a solution to this problem. convert_np_dtype_to_dtype_ goes the wrong way:
this function only lets the user configure a data type as a string such as float32 or float64. Instead, we should only let the user configure supported data types such as paddle.float32 and paddle.float64, and make the real type conversion (from/to numpy) happen in the feed/fetch implementation.
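A rough sketch of that proposal (hypothetical and not part of this PR: the DataType class, the paddle.float32-style constants, and the feed helper below are illustrative only) would expose framework-level dtype constants and confine the numpy conversion to the feed/fetch boundary:

import numpy as np


class DataType(object):
    # Hypothetical framework-level constants the user would configure,
    # e.g. paddle.float32 / paddle.float64, instead of raw strings.
    FP32 = "FP32"
    FP64 = "FP64"
    INT64 = "INT64"


# Only the feed/fetch implementation knows how these map to numpy.
_TO_NUMPY = {
    DataType.FP32: np.float32,
    DataType.FP64: np.float64,
    DataType.INT64: np.int64,
}


def feed(declared_dtype, data):
    # Convert user data to the dtype the variable declares; layer code
    # itself never has to inspect numpy dtypes.
    return np.asarray(data, dtype=_TO_NUMPY[declared_dtype])


print(feed(DataType.FP32, [1, 2, 3]).dtype)  # float32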

dtype = np.dtype(np_dtype)
if dtype == np.float32:
return core.DataType.FP32
elif dtype == np.float64:
return core.DataType.FP64
elif dtype == np.float16:
return core.DataType.FP16
elif dtype == np.int32:
return core.DataType.INT32
elif dtype == np.int16:
return core.DataType.INT16
elif dtype == np.int64:
return core.DataType.INT64
elif dtype == np.bool:
return core.DataType.BOOL
else:
raise ValueError("Not supported numpy dtype " + str(dtype))


def dtype_is_floating(dtype):
if not isinstance(dtype, core.DataType):
dtype = convert_np_dtype_to_dtype_(dtype)

if (dtype == core.DataType.FP16 or dtype == core.DataType.FP16 or
Contributor:

The second dtype == core.DataType.FP16 should be FP32 here. But I think we need more general type assertions on the C++ side that just throw an exception when the configuration is wrong.

Contributor Author:

Same as above.

dtype == core.DataType.FP64):
return True
else:
return False


def _debug_string_(proto, throw_on_error=True):
error_fields = list()
if not proto.IsInitialized(error_fields) and throw_on_error:
@@ -66,7 +97,7 @@ def __init__(self,
"matched.".format(self.name, old_shape, shape))
if dtype is not None:
if not isinstance(dtype, core.DataType):
dtype = Variable._convert_np_dtype_to_dtype_(dtype)
dtype = convert_np_dtype_to_dtype_(dtype)
if is_new_var:
self.desc.set_data_type(dtype)
else:
@@ -148,26 +179,6 @@ def _unique_var_name_():
uid = core.unique_integer(prefix) # unique during whole process.
return "_".join([prefix, str(uid)])

@staticmethod
def _convert_np_dtype_to_dtype_(np_dtype):
dtype = np.dtype(np_dtype)
if dtype == np.float32:
return core.DataType.FP32
elif dtype == np.float64:
return core.DataType.FP64
elif dtype == np.float16:
return core.DataType.FP16
elif dtype == np.int32:
return core.DataType.INT32
elif dtype == np.int16:
return core.DataType.INT16
elif dtype == np.int64:
return core.DataType.INT64
elif dtype == np.bool:
return core.DataType.BOOL
else:
raise ValueError("Not supported numpy dtype " + str(dtype))


def get_all_op_protos():
"""
33 changes: 26 additions & 7 deletions python/paddle/v2/fluid/layer_helper.py
@@ -2,7 +2,7 @@
import itertools

from paddle.v2.fluid.framework import Variable, g_main_program, \
g_startup_program, unique_name, Program
g_startup_program, unique_name, Program, dtype_is_floating
from paddle.v2.fluid.initializer import ConstantInitializer, \
UniformInitializer, XavierInitializer

@@ -61,7 +61,7 @@ def input(self, input_param_name='input'):

@property
def param_attr(self):
default = {'name': None, 'initializer': XavierInitializer()}
default = {'name': None}
actual = self.kwargs.get('param_attr', None)
if actual is None:
actual = default
@@ -72,7 +72,7 @@ def param_attr(self):

@property
def bias_attr(self):
default = {'name': None, 'initializer': ConstantInitializer()}
default = {'name': None}
bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is None:
bias_attr = default
@@ -119,6 +119,8 @@ def create_parameter(self, attr, shape, dtype, suffix='w',
attr_copy = copy.deepcopy(attr)
if initializer is not None:
attr_copy['initializer'] = initializer
else:
attr_copy['initializer'] = _get_default_initializer(dtype)
if attr_copy['name'] is None:
attr_copy['name'] = unique_name(".".join([self.name, suffix]))
self.startup_program.global_block().create_parameter(
@@ -149,13 +151,19 @@ def set_variable_initializer(self, var, initializer):
persistable=True,
initializer=initializer)

def append_bias_op(self, input_var, dim_start=1, dim_end=None):
def append_bias_op(self,
input_var,
bias_initializer,
dim_start=1,
dim_end=None):
"""
Append bias operator and return its output. If the user does not set
bias_attr, append_bias_op will return input_var

:param input_var: the input variable. The len(input_var.shape) is larger
or equal than 2.
:param input_var: the input variable. The len(input_var.shape) is
larger or equal than 2.
:bias_initializer: an instance of a subclass of Initializer used to
initialize the bias
:param dim_start:
:param dim_end: the shape of the bias will be
input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
@@ -167,7 +175,11 @@ def append_bias_op(self, input_var, dim_start=1, dim_end=None):
return input_var

b = self.create_parameter(
attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
attr=bias_attr,
shape=size,
dtype=input_var.data_type,
suffix='b',
initializer=bias_initializer)
tmp = self.create_tmp_variable(dtype=input_var.data_type)
self.append_op(
type='elementwise_add',
@@ -191,3 +203,10 @@ def append_activation(self, input_var):
outputs={"Y": [tmp]},
attrs=act)
return tmp

def _get_default_initializer(dtype):
if dtype is None or dtype_is_floating(dtype) == True:
return XavierInitializer()
else:
# For integer and boolean types, initialize with all zeros
return ConstantInitializer()
71 changes: 63 additions & 8 deletions python/paddle/v2/fluid/layers.py
@@ -3,7 +3,7 @@
from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
Operator
from paddle.v2.fluid.initializer import ConstantInitializer, \
NormalInitializer
NormalInitializer, XavierInitializer
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re
import cStringIO
@@ -18,7 +18,9 @@
def fc(input,
size,
param_attr=None,
param_initializer=None,
bias_attr=None,
bias_initializer=None,
name=None,
act=None,
num_flatten_dims=1,
@@ -31,7 +33,11 @@ def fc(input,
input: The input tensor to the function
size: The size of the layer
param_attr: The parameters/weights to the FC Layer
param_initializer: Initializer used for the weight/parameter.
If None, XavierInitializer() is used
bias_attr: The bias parameter for the FC layer
bias_initializer: Initializer used for the bias.
If None, then ConstantInitializer() is used
name: Name/alias of the function
act: Activation to be applied to the output of FC layer
num_flatten_dims: Number of columns in input
@@ -50,18 +56,34 @@
to the LayerHelper constructor.

"""

def _get_default_param_initializer():
return XavierInitializer()

def _get_default_bias_initializer():
return ConstantInitializer()

helper = LayerHelper('fc', **locals())

dtype = helper.input_dtype()

if param_initializer is None:
param_initializer = _get_default_param_initializer()

if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()

mul_results = []
for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape
param_shape = [
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size]
w = helper.create_parameter(
attr=param_attr, shape=param_shape, dtype=dtype)
attr=param_attr,
initializer=param_initializer,
shape=param_shape,
dtype=dtype)
tmp = helper.create_tmp_variable(dtype)
helper.append_op(
type="mul",
@@ -82,7 +104,7 @@ def fc(input,
helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
# add bias
pre_activation = helper.append_bias_op(pre_bias)
pre_activation = helper.append_bias_op(pre_bias, bias_initializer)
# add activation
return helper.append_activation(pre_activation)

@@ -599,24 +621,41 @@ def sequence_conv(input,
act=None,
padding=None,
bias_attr=None,
bias_initializer=None,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
"""
This function creates the op for sequence_conv, using the inputs and
other convolutional configurations for the filters and stride as given
in the input parameters to the function.
"""

def _get_default_bias_initializer():
return ConstantInitializer()

def _get_default_param_initializer():
return XavierInitializer()

# FIXME(dzh) : want to unify the argument of python layer
# function. So we ignore some unnecessary attributes.
# such as, padding_trainable, context_start.

helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()

if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()

filter_shape = [filter_size * input.shape[1], num_filters]
filter = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype)
attr=helper.param_attr,
shape=filter_shape,
dtype=dtype,
initializer=param_initializer)
pre_bias = helper.create_tmp_variable(dtype)

helper.append_op(
@@ -631,7 +670,7 @@ def sequence_conv(input,
'contextStart': -int(filter_size / 2),
'contextLength': filter_size
})
pre_act = helper.append_bias_op(pre_bias)
pre_act = helper.append_bias_op(pre_bias, bias_initializer)
return helper.append_activation(pre_act)


@@ -644,7 +683,9 @@ def conv2d(input,
stride=[1, 1],
padding=None,
bias_attr=None,
bias_initializer=None,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
"""
@@ -654,6 +695,14 @@ def conv2d(input,
This function can also append an activation on top of the
conv-2d output, if mentioned in the input parameters.
"""

def _get_default_bias_initializer():
return ConstantInitializer()

def _get_default_param_initializer(filter_size, num_channels):
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
return NormalInitializer(0.0, std, 0)

helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype()

@@ -675,12 +724,17 @@ def conv2d(input,
input_shape = input.shape
filter_shape = [num_filters, num_filter_channels] + filter_size

std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
if param_initializer is None:
param_initializer = _get_default_param_initializer(filter_size,
num_channels)
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()

filter = helper.create_parameter(
attr=helper.param_attr,
shape=filter_shape,
dtype=dtype,
initializer=NormalInitializer(0.0, std, 0))
initializer=param_initializer)
pre_bias = helper.create_tmp_variable(dtype)

helper.append_op(
@@ -694,7 +748,8 @@ def conv2d(input,
'paddings': padding,
'groups': groups})

pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
pre_act = helper.append_bias_op(
pre_bias, bias_initializer, dim_start=1, dim_end=2)

return helper.append_activation(pre_act)
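For context, here is a hypothetical usage sketch of the new keyword arguments this PR adds to fc and conv2d (sequence_conv works the same way). The initializer constructors follow the signatures visible in the diff; the layers.data call and its data_type argument are assumptions about this version of fluid and may need adjusting.

from paddle.v2.fluid import layers
from paddle.v2.fluid.initializer import ConstantInitializer, \
    NormalInitializer, XavierInitializer

# Assumed input variable; the exact data-layer signature may differ.
images = layers.data(name='pixel', shape=[1, 28, 28], data_type='float32')

# conv2d: override the default initializers explicitly.
conv = layers.conv2d(
    input=images,
    num_filters=20,
    filter_size=[5, 5],
    act='relu',
    param_initializer=NormalInitializer(0.0, 0.01, 0),  # (mean, std, seed)
    bias_initializer=ConstantInitializer())

# fc: the Xavier default for weights spelled out, zero-initialized bias.
prediction = layers.fc(
    input=conv,
    size=10,
    act='softmax',
    param_initializer=XavierInitializer(),
    bias_initializer=ConstantInitializer())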
