
Commit a3ae080

remove softmax api from fluid (#48388)
* move softmax to paddle2.0
* fix some bugs
* resolve conflict
* remove some code
* modify code style
* fix bugs
* fix code
* fix move code
* fix some bugs
* fix code
* fix some code
* modify the header file
* fix bugs
* fix some examples
* fix mish example
* fix code
1 parent ea5ca55 commit a3ae080

33 files changed, +69 -205 lines changed
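Every file in this commit applies the same substitution: calls to the fluid 1.x softmax entry points (fluid.layers.softmax, layers.softmax, and nn.softmax inside fluid) become calls to the Paddle 2.0 functional API, paddle.nn.functional.softmax. A minimal before/after sketch of the migration, using a made-up input tensor for illustration:

    import paddle

    x = paddle.to_tensor([[2.0, 3.0, 4.0, 5.0],
                          [7.0, 8.0, 8.0, 9.0]], dtype='float32')

    # Removed 1.x API (deprecated since 2.0.0, deleted in this commit):
    #     out = paddle.fluid.layers.softmax(x, axis=-1, use_cudnn=True)
    # Paddle 2.0 replacement used throughout the diff below:
    out = paddle.nn.functional.softmax(x, axis=-1)
    print(out)  # each row sums to 1.0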

python/paddle/fluid/layers/detection.py

+1-1
@@ -626,7 +626,7 @@ class number, M is number of bounding boxes.
 target_box=loc,
 code_type='decode_center_size',
 )
-scores = nn.softmax(input=scores)
+scores = paddle.nn.functional.softmax(scores)
 scores = paddle.transpose(scores, perm=[0, 2, 1])
 scores.stop_gradient = True
 nmsed_outs = helper.create_variable_for_type_inference(

python/paddle/fluid/layers/nn.py

+12-152
@@ -68,7 +68,6 @@
 'linear_chain_crf',
 'crf_decoding',
 'conv2d',
-'softmax',
 'pool2d',
 'batch_norm',
 'dropout',
@@ -145,7 +144,7 @@ def _get_reduce_dim(dim, input):
 else:
 raise TypeError(
 "The type of dim must be int, list, tuple or range, but received {}".format(
-type(axis)
+type(dim)
 )
 )
 if dim is None:
@@ -679,7 +678,7 @@ def _pull_gpups_sparse(
 size(int|list of int): The embedding size parameter of each input, which indicates the size of
 each embedding vector respectively.
 dtype(str): The dtype refers to the data type of output tensor. Only supports
-float32 now.
+float32 now.
 
 Returns:
 Variable|list of Variable: The tensor variable storing the embeddings of the \
@@ -742,7 +741,7 @@ def _pull_box_sparse(
 size(int): The embedding size parameter, which indicates the size of
 each embedding vector respectively.
 dtype(str): The dtype refers to the data type of output tensor. Only supports
-float32 now.
+float32 now.
 
 Returns:
 Variable|list of Variable: The tensor variable storing the embeddings of the \
@@ -1123,147 +1122,6 @@ def get_attrs(prog, dropout_prob, is_test, seed):
 return out
 
 
-@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax")
-def softmax(input, use_cudnn=True, name=None, axis=-1):
-r"""
-This operator implements the softmax layer. The calculation process is as follows:
-
-1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
-
-2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
-second dimension(row length) is the same as the dimension :attr:`axis` of the input
-tensor, and the first dimension(column length) is the product of all other
-dimensions of the input tensor. For each row of the matrix, the softmax operator
-squashes the K-dimensional(K is the width of the matrix, which is also the size
-of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
-K-dimensional vector of real values in the range [0, 1] that add up to 1.
-
-3. After the softmax operation is completed, the inverse operations of steps 1 and 2
-are performed to restore the two-dimensional matrix to the same dimension as the ``input``.
-
-It computes the exponential of the given dimension and the sum of exponential
-values of all the other dimensions in the K-dimensional vector input.
-Then the ratio of the exponential of the given dimension and the sum of
-exponential values of all the other dimensions is the output of the softmax
-operator.
-
-For each row :math:`i` and each column :math:`j` in the matrix, we have:
-
-.. math::
-
-Out[i, j] = \\frac{\\exp(X[i, j])}{\\sum_j(exp(X[i, j])}
-
-Example:
-
-.. code-block:: text
-
-Case 1:
-Input:
-X.shape = [2, 3, 4]
-X.data = [[[2.0, 3.0, 4.0, 5.0],
-[3.0, 4.0, 5.0, 6.0],
-[7.0, 8.0, 8.0, 9.0]],
-[[1.0, 2.0, 3.0, 4.0],
-[5.0, 6.0, 7.0, 8.0],
-[6.0, 7.0, 8.0, 9.0]]]
-
-Attrs:
-axis = -1
-
-Output:
-Out.shape = [2, 3, 4]
-Out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-[0.07232949, 0.19661193, 0.19661193, 0.53444665]],
-[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-[0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
-
-Case 2:
-Input:
-X.shape = [2, 3, 4]
-X.data = [[[2.0, 3.0, 4.0, 5.0],
-[3.0, 4.0, 5.0, 6.0],
-[7.0, 8.0, 8.0, 9.0]],
-[[1.0, 2.0, 3.0, 4.0],
-[5.0, 6.0, 7.0, 8.0],
-[6.0, 7.0, 8.0, 9.0]]]
-Attrs:
-axis = 1
-
-Output:
-Out.shape = [2, 3, 4]
-Out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
-[0.01786798, 0.01786798, 0.04661262, 0.04661262],
-[0.97555875, 0.97555875, 0.93623955, 0.93623955]],
-[[0.00490169, 0.00490169, 0.00490169, 0.00490169],
-[0.26762315, 0.26762315, 0.26762315, 0.26762315],
-[0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
-
-Args:
-input (Tensor): The input tensor. A multi-dimension ``Tensor`` with type float32 or float64.
-use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn \
-library is installed. To improve performance, set use_cudnn to True by default.
-name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Default: None.
-will be named automatically. Default: None.
-axis (int, optional): The index of dimension to perform softmax calculations, it should
-be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of
-input tensor. Default: -1. -1 means the last dimension.
-
-Returns:
-Tensor: ``Tensor`` indicates the output of softmax. The data type and shape are the same as ``input`` .
-
-Examples:
-
-.. code-block:: python
-
-import paddle
-import paddle.nn.functional as F
-
-x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0],
-[3.0, 4.0, 5.0, 6.0],
-[7.0, 8.0, 8.0, 9.0]],
-[[1.0, 2.0, 3.0, 4.0],
-[5.0, 6.0, 7.0, 8.0],
-[6.0, 7.0, 8.0, 9.0]]], dtype='float32')
-y = F.softmax(x, axis=1)
-print(y)
-# [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
-# [0.01786798, 0.01786798, 0.04661262, 0.04661262],
-# [0.97555870, 0.97555870, 0.93623954, 0.93623954]],
-# [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
-# [0.26762316, 0.26762316, 0.26762316, 0.26762316],
-# [0.72747517, 0.72747517, 0.72747517, 0.72747517]]]
-
-"""
-
-if in_dygraph_mode():
-return _C_ops.softmax(input, axis)
-
-if _non_static_mode():
-return _legacy_C_ops.softmax(
-input, 'axis', axis, 'use_cudnn', use_cudnn
-)
-
-inputs = {"X": [input]}
-attrs = {"axis": axis, "use_cudnn": use_cudnn}
-
-helper = LayerHelper('softmax', **locals())
-check_variable_and_dtype(
-input, 'input/x', ['float16', 'float32', 'float64'], 'softmax'
-)
-
-dtype = helper.input_dtype()
-softmax_out = helper.create_variable_for_type_inference(dtype)
-helper.append_op(
-type="softmax",
-inputs={"X": input},
-outputs={"Out": softmax_out},
-attrs=attrs,
-)
-return softmax_out
-
-
 def conv2d(
 input,
 num_filters,
@@ -1788,7 +1646,7 @@ def is_list_or_tuple(ele):
 if pool_padding == "VALID":
 padding_algorithm = "VALID"
 pool_padding = [0, 0]
-if ceil_mode != False:
+if ceil_mode is not False:
 raise ValueError(
 "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
 "Received ceil_mode: True."
@@ -6643,7 +6501,7 @@ def deformable_roi_pooling(
 )
 
 input_channels = input.shape[1]
-if position_sensitive == False:
+if position_sensitive is False:
 output_channels = input_channels
 else:
 output_channels = input_channels / pooled_height / pooled_width
@@ -6841,11 +6699,11 @@ def mish(x, threshold=20, name=None):
 
 .. math::
 
-out = \\begin{cases}
-x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\
-x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\
-x \\ast \\tanh(\\ln(1 + e^{x})), \\text{otherwise}
-\\end{cases}
+out = \\begin{cases}
+x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\
+x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\
+x \\ast \\tanh(\\ln(1 + e^{x})), \\text{otherwise}
+\\end{cases}
 
 Args:
 x (Variable): Input feature, multi-dimensional Tensor. The data type
@@ -6867,9 +6725,11 @@ def mish(x, threshold=20, name=None):
 
 .. code-block:: python
 
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 
+paddle.enable_static()
 DATATYPE='float32'
 
 x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE)
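The removed docstring above describes softmax as a three-step procedure (permute the chosen axis to the last position, flatten to a 2-D matrix and normalize each row by its exponential sum, then undo the reshape and permutation), and the mish hunks in the same file carry a piecewise formula guarded by a threshold. A NumPy sketch of both, with hypothetical helper names softmax_ref and mish_ref; the max-subtraction in softmax_ref is a standard numerical-stability trick, not part of the original docstring:

    import numpy as np

    def softmax_ref(x, axis=-1):
        x = np.asarray(x, dtype=np.float64)
        # Step 1: permute the softmax axis to the last position.
        x = np.moveaxis(x, axis, -1)
        shape = x.shape
        # Step 2: flatten to a 2-D matrix and normalize each row.
        x2d = x.reshape(-1, shape[-1])
        e = np.exp(x2d - x2d.max(axis=1, keepdims=True))
        out = e / e.sum(axis=1, keepdims=True)
        # Step 3: restore the original shape and axis order.
        return np.moveaxis(out.reshape(shape), -1, axis)

    def mish_ref(x, threshold=20.0):
        # Piecewise softplus from the mish docstring: x for large inputs,
        # exp(x) for very negative inputs, ln(1 + exp(x)) otherwise.
        x = np.asarray(x, dtype=np.float64)
        sp = np.where(
            x > threshold,
            x,
            np.where(
                x < -threshold,
                np.exp(x),
                np.log1p(np.exp(np.clip(x, -threshold, threshold))),
            ),
        )
        return x * np.tanh(sp)

    x = np.array([[[2.0, 3.0, 4.0, 5.0],
                   [3.0, 4.0, 5.0, 6.0],
                   [7.0, 8.0, 8.0, 9.0]]])
    # Matches the first batch entry of the axis=1 case in the removed docstring.
    print(softmax_ref(x, axis=1))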

python/paddle/fluid/layers/rnn.py

+2-2
@@ -1304,7 +1304,7 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state):
 self.noend_mask_tensor, "float64"
 )
 
-step_log_probs = paddle.log(nn.softmax(logits))
+step_log_probs = paddle.log(paddle.nn.functional.softmax(logits))
 step_log_probs = self._mask_probs(step_log_probs, beam_state.finished)
 log_probs = nn.elementwise_add(
 x=step_log_probs, y=beam_state.log_probs, axis=0
@@ -2330,7 +2330,7 @@ def sample(self, time, outputs, states):
 if self.softmax_temperature is not None
 else outputs
 )
-probs = nn.softmax(logits)
+probs = paddle.nn.functional.softmax(logits)
 # TODO: remove this stop_gradient. The stop_gradient of sample_ids can
 # not pass to probs, since sampling_id op does not have corresponding
 # grad op and thus can not pass.
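As an aside on the rnn.py hunks above: wrapping the softmax output in paddle.log, as _beam_search_step does, turns the raw logits into per-token log probabilities for beam search. A tiny sketch with made-up logits (log_softmax is mentioned only as the usual fused alternative; this commit does not change that part of the logic):

    import paddle

    logits = paddle.to_tensor([[2.0, 1.0, 0.1]])  # hypothetical scores for 3 tokens
    step_log_probs = paddle.log(paddle.nn.functional.softmax(logits, axis=-1))
    # paddle.nn.functional.log_softmax(logits, axis=-1) yields the same values
    # in a single, numerically safer step.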

python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py

+1-1
@@ -354,7 +354,7 @@ def run_one_loop(self, model, opt, data):
 label.stop_gradient = True
 
 out = model(img)
-softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+softmax_out = paddle.nn.functional.softmax(out, use_cudnn=False)
 loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
 avg_loss = paddle.mean(x=loss)
 return avg_loss

python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py

+2-2
@@ -342,7 +342,7 @@ def forward(self, queries, keys, values, attn_bias):
 )
 if attn_bias is not None:
 product += attn_bias
-weights = fluid.layers.softmax(product)
+weights = paddle.nn.functional.softmax(product)
 if self._dropout_rate:
 weights_droped = fluid.layers.dropout(
 weights,
@@ -849,7 +849,7 @@ def forward(self, dec_inputs=None, enc_output=None):
 
 if dec_inputs is None:
 # Return probs for independent decoder program.
-predict_out = fluid.layers.softmax(predict)
+predict_out = paddle.nn.functional.softmax(predict)
 return predict_out
 return predict

python/paddle/fluid/tests/unittests/dist_transformer.py

+3-3
@@ -1177,7 +1177,7 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
 product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
 if attn_bias:
 product += attn_bias
-weights = layers.softmax(product)
+weights = paddle.nn.functional.softmax(product)
 if dropout_rate:
 weights = layers.dropout(
 weights,
@@ -1715,7 +1715,7 @@ def wrap_decoder(
 bias_attr=const_bias_attr,
 )
 if dec_inputs is None:
-predict = layers.softmax(predict)
+predict = paddle.nn.functional.softmax(predict)
 return predict
 
 
@@ -1834,7 +1834,7 @@ def beam_search():
 logits = paddle.reshape(logits, (-1, trg_vocab_size))
 
 topk_scores, topk_indices = layers.topk(
-input=layers.softmax(logits), k=beam_size
+input=paddle.nn.functional.softmax(logits), k=beam_size
 )
 accu_scores = layers.elementwise_add(
 x=paddle.log(topk_scores),

python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py

+4-2
@@ -435,7 +435,9 @@ def beam_search(self, inputs):
 cell_outputs = self._split_batch_beams(step_input)
 cell_outputs = self.fc(cell_outputs)
 
-step_log_probs = paddle.log(fluid.layers.softmax(cell_outputs))
+step_log_probs = paddle.log(
+paddle.nn.functional.softmax(cell_outputs)
+)
 noend_array = [-self.kinf] * self.tar_vocab_size
 noend_array[self.beam_end_token] = 0
 noend_mask_tensor = to_variable(
@@ -703,7 +705,7 @@ def attention(self, query, enc_output, mask=None):
 attn = paddle.transpose(attn, [1, 0, 2])
 attn = paddle.add(attn, mask * 1000000000)
 attn = paddle.transpose(attn, [1, 0, 2])
-weight = fluid.layers.softmax(attn)
+weight = paddle.nn.functional.softmax(attn)
 weight_memory = fluid.layers.matmul(weight, memory)
 
 return weight_memory

python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py

+2-2
@@ -67,7 +67,7 @@ def forward(self, input, cache=None):
 cache["k"], cache["v"] = k, v
 
 weight = fluid.layers.matmul(x=q, y=k, transpose_y=True)
-weight = fluid.layers.softmax(weight)
+weight = paddle.nn.functional.softmax(weight)
 out = fluid.layers.matmul(weight, v)
 
 return out
@@ -113,7 +113,7 @@ def forward(self, input, max_len=4):
 # Test to call function defined outside of class.
 def update_cache(cache):
 for k, val in cache.items():
-cache[k] = fluid.layers.softmax(val)
+cache[k] = paddle.nn.functional.softmax(val)
 
 return cache

python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py

+1-1
@@ -308,7 +308,7 @@ def forward(self, x, label=None):
 
 # Test to call function behind caller.
 def softmax(x):
-return fluid.layers.softmax(x)
+return paddle.nn.functional.softmax(x)
 
 
 class TestNetWithExternalFunc(TestDygraphIfElseNet):

python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py

+1-1
@@ -535,7 +535,7 @@ def train_mobilenet(args, to_static):
 out = net(img)
 
 t_end = time.time()
-softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+softmax_out = paddle.nn.functional.softmax(out)
 loss = fluid.layers.cross_entropy(
 input=softmax_out, label=label
 )

python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py

+1-1
@@ -48,7 +48,7 @@ def forward(self, x):
 x = fluid.layers.relu(x)
 action_scores = self.affine2(x)
 
-log_prob = fluid.layers.softmax(action_scores, axis=1)
+log_prob = paddle.nn.functional.softmax(action_scores, axis=1)
 
 return log_prob

python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py

+1-1
@@ -343,7 +343,7 @@ def forward(self, inputs, label):
 y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output])
 out = self.out(y)
 
-softmax_out = fluid.layers.softmax(out)
+softmax_out = paddle.nn.functional.softmax(out)
 loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
 avg_loss = paddle.mean(x=loss)
