
Commit 8556834

Author: Abhinav Arora (committed)

Improve the initializer interface for fc, sequence_conv and conv2d layers

1 parent bce1c03, commit 8556834

File tree: 3 files changed (+121, -36 lines)

python/paddle/v2/fluid/framework.py (+32, -21)
@@ -15,6 +15,37 @@ def unique_name(prefix):
     return "_".join([prefix, str(uid)])
 
 
+def convert_np_dtype_to_dtype_(np_dtype):
+    dtype = np.dtype(np_dtype)
+    if dtype == np.float32:
+        return core.DataType.FP32
+    elif dtype == np.float64:
+        return core.DataType.FP64
+    elif dtype == np.float16:
+        return core.DataType.FP16
+    elif dtype == np.int32:
+        return core.DataType.INT32
+    elif dtype == np.int16:
+        return core.DataType.INT16
+    elif dtype == np.int64:
+        return core.DataType.INT64
+    elif dtype == np.bool:
+        return core.DataType.BOOL
+    else:
+        raise ValueError("Not supported numpy dtype " + str(dtype))
+
+
+def dtype_is_floating(dtype):
+    if not isinstance(dtype, core.DataType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or
+            dtype == core.DataType.FP64):
+        return True
+    else:
+        return False
+
+
 def _debug_string_(proto, throw_on_error=True):
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
@@ -66,7 +97,7 @@ def __init__(self,
                         "matched.".format(self.name, old_shape, shape))
         if dtype is not None:
             if not isinstance(dtype, core.DataType):
-                dtype = Variable._convert_np_dtype_to_dtype_(dtype)
+                dtype = convert_np_dtype_to_dtype_(dtype)
             if is_new_var:
                 self.desc.set_data_type(dtype)
             else:
@@ -148,26 +179,6 @@ def _unique_var_name_():
         uid = core.unique_integer(prefix)  # unique during whole process.
         return "_".join([prefix, str(uid)])
 
-    @staticmethod
-    def _convert_np_dtype_to_dtype_(np_dtype):
-        dtype = np.dtype(np_dtype)
-        if dtype == np.float32:
-            return core.DataType.FP32
-        elif dtype == np.float64:
-            return core.DataType.FP64
-        elif dtype == np.float16:
-            return core.DataType.FP16
-        elif dtype == np.int32:
-            return core.DataType.INT32
-        elif dtype == np.int16:
-            return core.DataType.INT16
-        elif dtype == np.int64:
-            return core.DataType.INT64
-        elif dtype == np.bool:
-            return core.DataType.BOOL
-        else:
-            raise ValueError("Not supported numpy dtype " + str(dtype))
-
 
 def get_all_op_protos():
     """

python/paddle/v2/fluid/layer_helper.py (+26, -7)
@@ -2,7 +2,7 @@
 import itertools
 
 from paddle.v2.fluid.framework import Variable, g_main_program, \
-    g_startup_program, unique_name, Program
+    g_startup_program, unique_name, Program, dtype_is_floating
 from paddle.v2.fluid.initializer import ConstantInitializer, \
     UniformInitializer, XavierInitializer
 
@@ -61,7 +61,7 @@ def input(self, input_param_name='input'):
 
     @property
     def param_attr(self):
-        default = {'name': None, 'initializer': XavierInitializer()}
+        default = {'name': None}
         actual = self.kwargs.get('param_attr', None)
         if actual is None:
             actual = default
@@ -72,7 +72,7 @@ def param_attr(self):
 
     @property
     def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
+        default = {'name': None}
         bias_attr = self.kwargs.get('bias_attr', None)
         if bias_attr is None:
             bias_attr = default
@@ -119,6 +119,8 @@ def create_parameter(self, attr, shape, dtype, suffix='w',
         attr_copy = copy.deepcopy(attr)
         if initializer is not None:
             attr_copy['initializer'] = initializer
+        else:
+            attr_copy['initializer'] = _get_default_initializer(dtype)
         if attr_copy['name'] is None:
             attr_copy['name'] = unique_name(".".join([self.name, suffix]))
         self.startup_program.global_block().create_parameter(
@@ -149,13 +151,19 @@ def set_variable_initializer(self, var, initializer):
             persistable=True,
             initializer=initializer)
 
-    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
+    def append_bias_op(self,
+                       input_var,
+                       bias_initializer,
+                       dim_start=1,
+                       dim_end=None):
         """
         Append bias operator and return its output. If the user does not set
         bias_attr, append_bias_op will return input_var
 
-        :param input_var: the input variable. The len(input_var.shape) is larger
-        or equal than 2.
+        :param input_var: the input variable. len(input_var.shape) must be
+        greater than or equal to 2.
+        :param bias_initializer: an instance of a subclass of Initializer used
+        to initialize the bias
         :param dim_start:
         :param dim_end: the shape of the bias will be
         input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
@@ -167,7 +175,11 @@ def append_bias_op(self, input_var, dim_start=1, dim_end=None):
             return input_var
 
         b = self.create_parameter(
-            attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
+            attr=bias_attr,
+            shape=size,
+            dtype=input_var.data_type,
+            suffix='b',
+            initializer=bias_initializer)
         tmp = self.create_tmp_variable(dtype=input_var.data_type)
         self.append_op(
             type='elementwise_add',
@@ -191,3 +203,10 @@ def append_activation(self, input_var):
             outputs={"Y": [tmp]},
             attrs=act)
         return tmp
+
+def _get_default_initializer(dtype):
+    if dtype is None or dtype_is_floating(dtype) == True:
+        return XavierInitializer()
+    else:
+        # For integer and boolean types, initialize with all zeros
+        return ConstantInitializer()
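
Taken together, a rough sketch of how a layer written on top of LayerHelper picks up these defaults; the layer name, shapes, and op wiring below are illustrative and not part of this commit:

from paddle.v2.fluid.initializer import ConstantInitializer
from paddle.v2.fluid.layer_helper import LayerHelper


def my_layer(input, size, param_attr=None, bias_attr=None, act=None,
             main_program=None, startup_program=None):
    helper = LayerHelper('my_layer', **locals())
    dtype = helper.input_dtype()

    # No explicit initializer is passed, so create_parameter falls back to
    # _get_default_initializer(dtype): Xavier for floating-point dtypes,
    # constant zeros for integer/boolean dtypes.
    w = helper.create_parameter(
        attr=helper.param_attr, shape=[input.shape[1], size], dtype=dtype)

    pre_bias = helper.create_tmp_variable(dtype)
    helper.append_op(
        type='mul', inputs={'X': [input], 'Y': [w]}, outputs={'Out': [pre_bias]})

    # append_bias_op now takes the bias initializer as an explicit argument.
    pre_act = helper.append_bias_op(pre_bias, ConstantInitializer())
    return helper.append_activation(pre_act)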

python/paddle/v2/fluid/layers.py (+63, -8)
@@ -3,7 +3,7 @@
 from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
     Operator
 from paddle.v2.fluid.initializer import ConstantInitializer, \
-    NormalInitializer
+    NormalInitializer, XavierInitializer
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO
@@ -18,7 +18,9 @@
 def fc(input,
        size,
        param_attr=None,
+       param_initializer=None,
       bias_attr=None,
+       bias_initializer=None,
        name=None,
        act=None,
        num_flatten_dims=1,
@@ -31,7 +33,11 @@ def fc(input,
     input: The input tensor to the function
     size: The size of the layer
     param_attr: The parameters/weights to the FC Layer
+    param_initializer: Initializer used for the weight/parameter.
+                       If None, XavierInitializer() is used
     bias_attr: The bias parameter for the FC layer
+    bias_initializer: Initializer used for the bias.
+                      If None, then ConstantInitializer() is used
     name: Name/alias of the function
     act: Activation to be applied to the output of FC layer
     num_flatten_dims: Number of columns in input
@@ -50,18 +56,34 @@ def fc(input,
     to the LayerHelper constructor.
 
     """
+
+    def _get_default_param_initializer():
+        return XavierInitializer()
+
+    def _get_default_bias_initializer():
+        return ConstantInitializer()
+
     helper = LayerHelper('fc', **locals())
 
     dtype = helper.input_dtype()
 
+    if param_initializer is None:
+        param_initializer = _get_default_param_initializer()
+
+    if bias_initializer is None:
+        bias_initializer = _get_default_bias_initializer()
+
     mul_results = []
     for input_var, param_attr in helper.iter_inputs_and_params():
         input_shape = input_var.shape
         param_shape = [
             reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
         ] + [size]
         w = helper.create_parameter(
-            attr=param_attr, shape=param_shape, dtype=dtype)
+            attr=param_attr,
+            initializer=param_initializer,
+            shape=param_shape,
+            dtype=dtype)
         tmp = helper.create_tmp_variable(dtype)
         helper.append_op(
             type="mul",
@@ -82,7 +104,7 @@ def fc(input,
     helper.append_op(
         type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
     # add bias
-    pre_activation = helper.append_bias_op(pre_bias)
+    pre_activation = helper.append_bias_op(pre_bias, bias_initializer)
     # add activation
     return helper.append_activation(pre_activation)
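
With this change a caller can override the fc weight and bias initializers per layer, and the layer falls back to Xavier (weights) and constant (bias) when the new arguments are omitted. A usage sketch; the data layer, sizes, and activations are illustrative:

import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.initializer import ConstantInitializer, NormalInitializer

image = layers.data(name='image', shape=[784], data_type='float32')

# Defaults: XavierInitializer for the weight, ConstantInitializer for the bias.
hidden = layers.fc(input=image, size=128, act='relu')

# Explicitly override both initializers.
prediction = layers.fc(input=hidden,
                       size=10,
                       act='softmax',
                       param_initializer=NormalInitializer(0.0, 0.02, 0),
                       bias_initializer=ConstantInitializer(0.0))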

@@ -599,24 +621,41 @@ def sequence_conv(input,
                   act=None,
                   padding=None,
                   bias_attr=None,
+                  bias_initializer=None,
                   param_attr=None,
+                  param_initializer=None,
                   main_program=None,
                   startup_program=None):
     """
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
     in the input parameters to the function.
     """
+
+    def _get_default_bias_initializer():
+        return ConstantInitializer()
+
+    def _get_default_param_initializer():
+        return XavierInitializer()
+
     # FIXME(dzh) : want to unify the argument of python layer
     # function. So we ignore some unecessary attributes.
     # such as, padding_trainable, context_start.
 
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
 
+    if param_initializer is None:
+        param_initializer = _get_default_param_initializer()
+    if bias_initializer is None:
+        bias_initializer = _get_default_bias_initializer()
+
     filter_shape = [filter_size * input.shape[1], num_filters]
     filter = helper.create_parameter(
-        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        initializer=param_initializer)
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -631,7 +670,7 @@ def sequence_conv(input,
             'contextStart': -int(filter_size / 2),
             'contextLength': filter_size
         })
-    pre_act = helper.append_bias_op(pre_bias)
+    pre_act = helper.append_bias_op(pre_bias, bias_initializer)
     return helper.append_activation(pre_act)
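
sequence_conv gains the same pair of arguments. A call-site sketch; the upstream sequence tensor and the filter settings are illustrative, and num_filters/filter_size are assumed to keep their existing meaning:

import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.initializer import XavierInitializer

# 'embedded' is assumed to be a sequence (LoD) tensor produced by earlier layers.
conv = layers.sequence_conv(input=embedded,
                            num_filters=64,
                            filter_size=3,
                            act='tanh',
                            param_initializer=XavierInitializer())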


@@ -644,7 +683,9 @@ def conv2d(input,
            stride=[1, 1],
            padding=None,
            bias_attr=None,
+           bias_initializer=None,
            param_attr=None,
+           param_initializer=None,
            main_program=None,
            startup_program=None):
     """
@@ -654,6 +695,14 @@
     This funciton can also append an activation on top of the
     conv-2d output, if mentioned in the input parameters.
     """
+
+    def _get_default_bias_initializer():
+        return ConstantInitializer()
+
+    def _get_default_param_initializer(filter_size, num_channels):
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return NormalInitializer(0.0, std, 0)
+
     helper = LayerHelper('conv2d', **locals())
     dtype = helper.input_dtype()
 
@@ -675,12 +724,17 @@ def conv2d(input,
     input_shape = input.shape
     filter_shape = [num_filters, num_filter_channels] + filter_size
 
-    std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+    if param_initializer is None:
+        param_initializer = _get_default_param_initializer(filter_size,
+                                                            num_channels)
+    if bias_initializer is None:
+        bias_initializer = _get_default_bias_initializer()
+
     filter = helper.create_parameter(
         attr=helper.param_attr,
         shape=filter_shape,
         dtype=dtype,
-        initializer=NormalInitializer(0.0, std, 0))
+        initializer=param_initializer)
     pre_bias = helper.create_tmp_variable(dtype)
 
     helper.append_op(
@@ -694,7 +748,8 @@ def conv2d(input,
             'paddings': padding,
             'groups': groups})
 
-    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    pre_act = helper.append_bias_op(
+        pre_bias, bias_initializer, dim_start=1, dim_end=2)
 
     return helper.append_activation(pre_act)
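
conv2d follows the same pattern; when param_initializer is omitted the filter is drawn from a zero-mean normal distribution with std = (2.0 / (filter_size[0]**2 * num_channels))**0.5, exactly as the default helper above computes it. A sketch; the input tensor and shapes are illustrative:

import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.initializer import NormalInitializer

# 'images' is assumed to be an NCHW float32 tensor from a data layer.
conv_default = layers.conv2d(input=images,
                             num_filters=32,
                             filter_size=[3, 3],
                             act='relu')

# Override the default filter initializer with an explicit one.
conv_custom = layers.conv2d(input=images,
                            num_filters=32,
                            filter_size=[3, 3],
                            act='relu',
                            param_initializer=NormalInitializer(0.0, 0.01, 0))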
