A beginner here, with a few questions about GradientTape and the optimizer. I put together a small script that fits a quadratic function.
1. The official docs mention that as soon as the GradientTape.gradient() method is called, the resources held by the GradientTape are released. Do these "resources" refer to the computed gradients? (A small sketch of the behaviour I mean follows the questions.)
2. I apply the computed gradients with optimizer.apply_gradients() and never call minimize(). How is it ensured that the update moves in the direction that minimizes the loss? (See the second sketch below.)
3. About the backward pass: when I compute gradients with GradientTape.gradient(), I take the gradient of the loss directly with respect to the first layer's parameters, and the parameters of every layer still seem to get updated. When deriving backprop by hand the formulas are worked out layer by layer, so can I understand it as: connecting the loss directly to the first layer's parameters and taking the gradient is enough to update all the internal parameters? (See the sketch after the code.)
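To make question 1 concrete, here is a minimal sketch of the behaviour I am asking about (assuming TF 2.x eager mode; x, y, z are just toy values I made up): by default gradient() can only be called once per tape, and a second call fails unless persistent=True is passed.

import tensorflow as tf

x = tf.Variable(3.0)

# Default tape: after the first gradient() call the tape is released,
# so a second call on the same tape raises an error.
with tf.GradientTape() as tape:
    y = x * x
dy_dx = tape.gradient(y, x)        # 6.0
# tape.gradient(y, x)              # would fail here, tape already released

# Persistent tape: gradient() can be called several times,
# at the cost of keeping the recorded forward pass alive until deleted.
with tf.GradientTape(persistent=True) as ptape:
    y = x * x
    z = y * y
dy_dx = ptape.gradient(y, x)       # 6.0
dz_dx = ptape.gradient(z, x)       # 108.0
del ptape                          # free the tape explicitly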
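Similarly for question 2, a small sketch of the two variants I mean (w, the learning rate and the toy loss are made up): optimizer.minimize() versus computing the gradient with a tape and calling apply_gradients() myself, which is what my training loop below does.

import tensorflow as tf

w = tf.Variable(2.0)
opt = tf.optimizers.SGD(learning_rate=0.1)

# Variant A: let the optimizer compute and apply the gradients itself.
opt.minimize(lambda: tf.square(w), var_list=[w])

# Variant B: compute the gradient with a tape and apply it manually.
with tf.GradientTape() as tape:
    loss = tf.square(w)            # minimum at w = 0
grad = tape.gradient(loss, w)
opt.apply_gradients([(grad, w)])   # plain SGD step: w <- w - learning_rate * grad

Here is my full script: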
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
def add_layer(inputs, Weights, biases, activation_function=None):
    # One fully connected layer: inputs @ Weights + biases, with optional activation.
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs, Weights, biases
def loss(predicted_y, target_y):
    return tf.math.reduce_mean(tf.math.square(predicted_y - target_y))
# Training data: y = x^2 - 0.5 with Gaussian noise.
x_data = np.linspace(-1, 1, 300)[:, np.newaxis]
noise = np.random.normal(0, 0.05, x_data.shape)
y_data = np.square(x_data) - 0.5 + noise
x_data = x_data.astype(np.float32)
noise = noise.astype(np.float32)
y_data = y_data.astype(np.float32)
optimizer = tf.optimizers.SGD(learning_rate=0.1)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(x_data, y_data)
plt.ion()
plt.show()
# A 1 -> 10 hidden layer followed by a 10 -> 1 output layer.
Wl = tf.Variable(tf.random.normal((1, 10)))
bl = tf.Variable(tf.zeros((1, 10)) + 0.1)
Wp = tf.Variable(tf.random.normal((10, 1)))
bp = tf.Variable(tf.zeros((1, 1)) + 0.1)
for i in range(1000):
    with tf.GradientTape() as tape:
        l1, W1, b1 = add_layer(x_data, Wl, bl, activation_function=tf.nn.relu)
        prediction, W2, b2 = add_layer(l1, Wp, bp, activation_function=None)
        loss_val = loss(prediction, y_data)
    print(loss_val)
    grads = tape.gradient(loss_val, [W1, b1, W2, b2])  # passing only W1, b1 here also returns a result
    optimizer.apply_gradients(zip(grads, [W1, b1, W2, b2]))
    # print('W1: {},\n b1{},\n Wp: {},\n bp: {}\n\n'.format(W1.numpy(), b1.numpy(), Wp.numpy(), bp.numpy()))
    if i % 50 == 0:
        try:
            lines[0].remove()      # drop the previously plotted curve, if any
        except Exception:
            pass
        lines = ax.plot(x_data, prediction.numpy(), 'r-', lw=5)
        plt.pause(0.1)
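To make question 3 concrete, a minimal sketch of how I currently read the sources argument of tape.gradient() (w1, w2 and the toy two-step computation are made up): gradient() returns exactly one gradient per variable in the list I pass, so, if I understand it right, the list handed to gradient()/apply_gradients() is what decides which of W1, b1, W2, b2 actually get updated.

import tensorflow as tf

w1 = tf.Variable(1.0)   # stand-in for a first-layer weight
w2 = tf.Variable(2.0)   # stand-in for a second-layer weight

with tf.GradientTape() as tape:
    h = w1 * 3.0                      # "layer 1"
    loss = tf.square(h * w2 - 1.0)    # "layer 2" + loss

# gradient() returns one gradient per source in the list:
# asking only for w1 gives d(loss)/d(w1); no gradient for w2 is returned,
# even though the backward pass still goes through the op that uses w2.
(g_w1,) = tape.gradient(loss, [w1])
print(g_w1.numpy())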