Numerical Stability and Model Initialization

Vanishing Gradients

The sigmoid saturates for inputs of large magnitude, so its gradient there is nearly zero; chaining many such layers can make the backpropagated gradient vanish. The following cell plots the sigmoid and its gradient.

In [1]:
%matplotlib inline
import torch
from d2l import torch as d2l

x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.sigmoid(x)
# Backpropagate a vector of ones to obtain d(sigmoid)/dx at every point
y.backward(torch.ones_like(x))

d2l.plot(x.detach().numpy(), [y.detach().numpy(), x.grad.numpy()],
         legend=['sigmoid', 'gradient'], figsize=(4.5, 2.5))
[Plot: the sigmoid function and its gradient]
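
As a rough follow-up sketch (an addition, not part of the original notebook): stacking several sigmoid activations multiplies their local derivatives, each at most 0.25, so the gradient with respect to the input shrinks quickly with depth.

import torch

# Sketch: the gradient through a chain of sigmoids shrinks with depth,
# since each layer contributes a local derivative of at most 0.25.
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = x
for _ in range(5):            # five stacked sigmoid "layers"
    y = torch.sigmoid(y)
y.backward(torch.ones_like(x))
print(x.grad.abs().max())     # far below the single-layer peak of ~0.25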

Exploding Gradients

The opposite problem arises when many matrices with entries drawn from a standard Gaussian are multiplied together: the entries of the product quickly blow up, as the following cell shows.

In [2]:
M = torch.normal(0, 1, size=(4, 4))
print('A matrix\n', M)
# Repeatedly multiply by fresh random Gaussian matrices
for i in range(100):
    M = torch.mm(M, torch.normal(0, 1, size=(4, 4)))

print('After multiplying 100 matrices\n', M)
A matrix
 tensor([[ 1.4482,  0.1321, -0.3654,  0.2553],
        [ 1.6057, -0.0775,  0.2758, -0.0886],
        [-0.9053, -0.9715,  1.3556,  0.0492],
        [-1.7316, -1.5284, -0.3882, -0.5320]])
After multiplying 100 matrices
 tensor([[-7.5557e+22,  6.7367e+22, -1.3285e+23, -2.3105e+22],
        [-1.5998e+23,  1.4264e+23, -2.8128e+23, -4.8921e+22],
        [-1.4575e+23,  1.2995e+23, -2.5626e+23, -4.4569e+22],
        [-7.0795e+22,  6.3121e+22, -1.2448e+23, -2.1649e+22]])
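
As a hedged sketch (an addition, not from the original cell): if the entries are instead drawn with standard deviation 1/sqrt(4) = 0.5, each multiplication preserves the expected squared norm for 4x4 matrices, and the repeated product no longer explodes to the 1e23 scale seen above (individual runs still fluctuate).

import torch

# Sketch: choose sigma so that 4 * sigma^2 = 1 (sigma = 0.5 for 4x4 matrices);
# in expectation each multiplication then preserves the squared norm.
M = torch.normal(0, 0.5, size=(4, 4))
for i in range(100):
    M = torch.mm(M, torch.normal(0, 0.5, size=(4, 4)))
print('After multiplying 100 rescaled matrices\n', M)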