Fix the docs of paddle\nn\layer\norm and related APIs #57112

Closed
wants to merge 3 commits into from
3 changes: 3 additions & 0 deletions python/paddle/nn/layer/norm.py
@@ -1309,6 +1309,9 @@ class BatchNorm2D(_BatchNormBase):
moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\

- :math:`x` : mini-batch data
- :math:`m` : the size of the mini-batch data

The normalization function formula is as follows:

.. math::
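As context for the BatchNorm2D changes above, a minimal usage sketch (not part of this PR; the input shape and momentum value are illustrative assumptions):

import paddle

x = paddle.rand([4, 3, 8, 8])                # NCHW mini-batch
bn = paddle.nn.BatchNorm2D(num_features=3, momentum=0.9)
bn.train()                                   # moving_mean / moving_variance are updated only in train mode
y = bn(x)                                    # normalized output with the same shape as x
print(y.shape)                               # [4, 3, 8, 8]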
4 changes: 2 additions & 2 deletions python/paddle/nn/layer/pooling.py
@@ -457,8 +457,8 @@ class MaxPool2D(Layer):
5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0).
The default value is 0.
ceil_mode(bool, optional): When True, `ceil` will be used instead of `floor` to compute the output shape. Default is False.
return_mask(bool, optional): Whether to return the max indices along with the outputs.
data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`.
return_mask(bool, optional): Whether to return the max indices along with the outputs. Default is False.
data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`.
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of:
`[batch_size, input_channels, input_height, input_width]`.
name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
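To illustrate the return_mask and data_format arguments documented above, a minimal sketch (not part of this PR; shapes are illustrative assumptions):

import paddle

x = paddle.rand([1, 3, 32, 32])              # NCHW input
pool = paddle.nn.MaxPool2D(kernel_size=2, stride=2, return_mask=True)
out, indices = pool(x)                       # pooled values and indices of the maxima
print(out.shape, indices.shape)              # [1, 3, 16, 16] [1, 3, 16, 16]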
6 changes: 3 additions & 3 deletions python/paddle/optimizer/momentum.py
@@ -65,9 +65,9 @@ class Momentum(Optimizer):
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
:ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
some derived class of ``GradientClipBase`` . There are three clipping strategies:
:ref:`paddle.nn.ClipGradByGlobalNorm <cn_api_fluid_clip_ClipGradByGlobalNorm>` , :ref:`paddle.nn.ClipGradByNorm <cn_api_fluid_clip_ClipGradByNorm>` , :ref:`paddle.nn.ClipGradByValue <cn_api_fluid_clip_ClipGradByValue>`.
multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
Often choose to be ``1.0/batch_size``.
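To illustrate how a clipping strategy is passed through grad_clip, a minimal sketch (not part of this PR; the layer size, learning rate, and clip_norm are illustrative assumptions):

import paddle

linear = paddle.nn.Linear(10, 1)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=linear.parameters(),
    grad_clip=clip,
)

loss = linear(paddle.rand([4, 10])).mean()
loss.backward()
opt.step()                                   # gradients are clipped by global norm before the update
opt.clear_grad()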
2 changes: 1 addition & 1 deletion python/paddle/tensor/manipulation.py
@@ -1119,7 +1119,7 @@ def concat(x, axis=0, name=None):
# out1
# [[ 1 2 3 11 12 13 21 22]
# [ 4 5 6 14 15 16 23 24]]
# out2 out3
# out2 and out3
# [[ 1 2 3]
# [ 4 5 6]
# [11 12 13]
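A minimal sketch reproducing the out2/out3 case above (not part of this PR; tensor values are chosen to match the outputs shown in the docstring example):

import paddle

x1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]])
x2 = paddle.to_tensor([[11, 12, 13], [14, 15, 16]])
out = paddle.concat([x1, x2], axis=0)        # stack row-wise along axis 0
print(out.numpy())
# [[ 1  2  3]
#  [ 4  5  6]
#  [11 12 13]
#  [14 15 16]]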