GradientClipByValue

class paddle.fluid.clip.GradientClipByValue(max, min=None, need_clip=None)[source]

Limit the value of multi-dimensional Tensor \(X\) to the range [min, max].

  • Any values less than min are set to min.

  • Any values greater than max are set to max.

The multi-dimensional Tensor \(X\) is not an argument of this class; it stands for the gradients of all parameters in the Program . If need_clip is not None, only the selected gradients are clipped.
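
For intuition, the elementwise clipping rule can be sketched as follows (an illustration with numpy only, not the internal implementation):

import numpy as np

grad = np.array([-3.0, -0.5, 0.2, 2.5])  # a hypothetical gradient
clipped = np.clip(grad, -1.0, 1.0)       # values below min become min, values above max become max
print(clipped)                           # [-1.  -0.5  0.2  1. ]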

Gradient clipping takes effect after it is set in optimizer ; see the document optimizer (for example: SGDOptimizer).

Parameters
  • max (float) – The maximum value to clip by.

  • min (float, optional) – The minimum value to clip by. If not set by the user, it is set to -max automatically; in this case, max must be greater than 0.

  • need_clip (function, optional) – A function that accepts a Parameter and returns bool (True: the gradient of this Parameter needs to be clipped, False: it does not). Default: None, in which case the gradients of all parameters in the network are clipped.
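
A minimal sketch of a need_clip filter (the parameter name "fc_0.w_0" is only an example; the full programs below show it in context):

def filter_func(param):
    # Clip only the gradient of the parameter named "fc_0.w_0";
    # the gradients of all other parameters are left unchanged.
    return param.name == "fc_0.w_0"

clip = fluid.clip.GradientClipByValue(min=-1.0, max=1.0, need_clip=filter_func)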

Examples

# use for Static mode
import paddle
import paddle.fluid as fluid
import numpy as np

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(
        main_program=main_prog, startup_program=startup_prog):
    image = fluid.data(
        name='x', shape=[-1, 2], dtype='float32')
    predict = fluid.layers.fc(input=image, size=3, act='relu') # Trainable parameters: fc_0.w_0, fc_0.b_0
    loss = fluid.layers.mean(predict)

    # Clip all parameters in network:
    clip = fluid.clip.GradientClipByValue(min=-1, max=1)

    # Clip a part of parameters in network: (e.g. fc_0.w_0)
    # pass a function (filter_func) to need_clip; filter_func receives a Parameter and returns bool
    # def filter_func(param):
    #     # Parameters can easily be filtered by name (the name can be set in fluid.ParamAttr;
    #     # the default names here are fc_0.w_0 and fc_0.b_0)
    #     return param.name == "fc_0.w_0"
    # clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=filter_func)

    sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=0.1, grad_clip=clip)
    sgd_optimizer.minimize(loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
x = np.random.uniform(-100, 100, (10, 2)).astype('float32')
exe.run(startup_prog)
out = exe.run(main_prog, feed={'x': x}, fetch_list=[loss])


# use for Dygraph mode
import paddle
import paddle.fluid as fluid
import numpy as np

with fluid.dygraph.guard():
    linear = fluid.dygraph.Linear(10, 10)  # Trainable parameters: linear_0.w_0, linear_0.b_0
    inputs = np.random.uniform(0, 1, [32, 10]).astype('float32')
    out = linear(fluid.dygraph.to_variable(inputs))
    loss = fluid.layers.reduce_mean(out)
    loss.backward()

    # Clip all parameters in network:
    clip = fluid.clip.GradientClipByValue(min=-1, max=1)

    # Clip a part of parameters in network: (e.g. linear_0.w_0)
    # pass a function (filter_func) to need_clip; filter_func receives a ParamBase and returns bool
    # def filter_func(param):
    #     # Parameters can easily be filtered by name (the name can be set in fluid.ParamAttr;
    #     # the default names here are linear_0.w_0 and linear_0.b_0)
    #     return param.name == "linear_0.w_0"
    # # Note: linear.weight and linear.bias return the weight and bias of dygraph.Linear,
    # # respectively, so an equivalent filter is: return param.name == linear.weight.name
    # clip = fluid.clip.GradientClipByValue(min=-1, max=1, need_clip=filter_func)

    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=0.1, parameter_list=linear.parameters(), grad_clip=clip)
    sgd_optimizer.minimize(loss)