Computing the gradient of a variable with respect to another variable used to assign it, with tf.GradientTape

How can one calculate the gradient of a variable with respect to another variable used in a linear combination? The following code is executed in TensorFlow eager mode.

After some more digging through older questions, a similar question showed up; however, it is not clear how to solve this issue.
Another related question is this one, but there the same variable is reused and it concerns TensorFlow v1.

I also read in this question that tf.assign (v1?) does not support gradients, and a potential solution is provided there.
However, I would apply it to the internal model weights of a neural network, and I don't know how to apply that tensor-based approach in practice.

import tensorflow as tf

a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape() as tape:
  c.assign(a + b)
  loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b)) # prints None

# or another attempt
with tf.GradientTape(watch_accessed_variables=False) as tape:
   tape.watch([b,c])
   c.assign(a + b)
   loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b)) # also outputs None

# Working, but c is a variable in my use case
with tf.GradientTape() as tape:
   c = a + b
   loss = tf.reduce_mean(c**2)

print(tape.gradient(loss, b)) # Works

Extension:

import tensorflow as tf
a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.01)

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(c) # scalar

# This works as expected
print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,b)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=0.0024197185>, <tf.Tensor: shape=(), dtype=float32, numpy=0.009702832>]
# Here I would expect a single scalar gradient w.r.t. x, to use for gradient descent?
print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,x)))
# [<tf.Tensor: shape=(), dtype=float32, numpy=1.4518311>, <tf.Tensor: shape=(), dtype=float32, numpy=5.8216996>]

# Example what I'd like to achieve;
with tf.GradientTape() as tape:
  c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
  loss = tf.norm(c_) # scalar

print(tape.gradient(loss,x)) 
# tf.Tensor(5.0933886, shape=(), dtype=float32)

A more sophisticated issue:

import tensorflow as tf

a = [tf.Variable([1.0, 2.0], name='a'), tf.Variable([5.0], name='aa'), tf.Variable(7.0, name='aaa')]
b = [tf.Variable([3.0, 4.0], name='b'), tf.Variable([6.0], name='bb'), tf.Variable(8.0, name='aaa')]
c = [tf.Variable([1.0, 1.0], name='c'), tf.Variable([1.0], name='cc'), tf.Variable(1.0, name='ccc')]
x = tf.Variable(0.5, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)

    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)

    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)
# tf.Tensor(9.974969, shape=(), dtype=float32) tf.Tensor(9.974969, shape=(), dtype=float32)

# Gives same result
#partial_grads = tf.nest.map_structure(lambda d, e: tf.nest.map_structure(lambda f, g: tape.gradient(loss, f, output_gradients=tape.gradient(g, x)), d, e), c, c_)
partial_grads = tf.nest.map_structure(lambda d, e: tape.gradient(loss, d, output_gradients=tape.gradient(e, x)), c, c_)

# Perhaps this should not use mean?
print(tf.reduce_sum(tf.nest.map_structure(lambda z: tf.reduce_mean(z), partial_grads)))
print(tape.gradient(loss_without_assign, x))
# Rather close
# tf.Tensor(2.3057716, shape=(), dtype=float32)
# tf.Tensor(2.3057709, shape=(), dtype=float32)

Answer by 能怎样 (2025-02-15 18:59:24):

Maybe you can try the following:

import tensorflow as tf
a = tf.Variable(1.0, name='a')
b = tf.Variable(2.0, name='b')
c = tf.Variable(3.0, name='c')

with tf.GradientTape(persistent=True) as tape:
  c_ = a + 2*b
  c.assign(c_)
  loss = tf.reduce_mean(c**2)

print(tape.gradient(loss,c,output_gradients=tape.gradient(c_,b))) 
# tf.Tensor(20.0, shape=(), dtype=float32)

P.S. output_gradients is a parameter of tf.GradientTape.gradient that is hidden in a corner and rarely noticed; it can be used to manually build cascaded differentiation.
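
As an illustration (a minimal sketch of my own, not part of the original answer; the names x, v and y are purely illustrative): output_gradients supplies the vector that multiplies the Jacobian, so tape.gradient(y, x, output_gradients=v) computes the same vector-Jacobian product as differentiating tf.reduce_sum(v * y) with v held constant.

import tensorflow as tf

x = tf.Variable([1.0, 2.0, 3.0])
v = tf.constant([0.5, 1.0, 2.0])  # plays the role of the "upstream" gradient

with tf.GradientTape(persistent=True) as tape:
    y = x ** 2
    weighted = tf.reduce_sum(v * y)  # v is a constant, so it only weights y

print(tape.gradient(y, x, output_gradients=v))  # v * dy/dx = v * 2x -> [1. 4. 12.]
print(tape.gradient(weighted, x))               # same vector-Jacobian product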

  • For Extension:
import tensorflow as tf
a = [tf.Variable(1.0, name='a'), tf.Variable(4.0, name='aa')]
b = [tf.Variable(2.0, name='b'), tf.Variable(9.0, name='bb')]
c = [tf.Variable(3.0, name='c'), tf.Variable(0.0, name='cc')]
x = tf.Variable(0.0, name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)
    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)
    loss = tf.norm(c) # scalar
print(tape.gradient(loss,c[0],output_gradients=tape.gradient(c_[0],x))+\
      tape.gradient(loss,c[1],output_gradients=tape.gradient(c_[1],x)))
# tf.Tensor(5.0932484, shape=(), dtype=float32)

Explanation:

tf.GradientTape is based on matrix differential theory, but .gradient() collects all derivatives of the same variable and adds them into a whole. For example, when deriving a vector with respect to a scalar, matrix theory gives a vector-valued derivative, but tf.GradientTape applies something like a reduce_sum and returns the summed scalar.
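
For instance (a small check of my own, not from the original answer), with a vector target and a scalar source, .gradient() returns the summed derivative while .jacobian() keeps the per-element derivatives:

import tensorflow as tf

s = tf.Variable(2.0)
with tf.GradientTape(persistent=True) as tape:
    vec = tf.stack([s, 3.0 * s])   # d vec / d s = [1, 3]

print(tape.gradient(vec, s))    # tf.Tensor(4.0, ...)    -> derivatives summed
print(tape.jacobian(vec, s))    # tf.Tensor([1. 3.], ...) -> kept per element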

Here, tape.gradient(loss, c, output_gradients=tape.gradient(c_, x))
actually did:

tape.gradient(loss,c[0],output_gradients=tape.gradient(c_,x)[0]),
tape.gradient(loss,c[1],output_gradients=tape.gradient(c_,x)[1])

but 
tape.gradient(c_,x)[0] != tape.gradient(c_[0],x)
tape.gradient(c_,x)[1] != tape.gradient(c_[1],x)

So tape.gradient(loss, c, output_gradients=tape.gradient(c_, x)) is contrary to our original intention.
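
The same summation happens across the elements of the list c_ in the Extension snippet above. As a sketch of my own that reuses the persistent tape and variables from that snippet: differentiating the whole list with respect to x adds both branches together, whereas differentiating each element separately keeps them apart.

print(tape.gradient(c_, x))      # (b - a) + (bb - aa) = 1 + 5 = 6, elements summed
print(tape.gradient(c_[0], x))   # b - a  = 1
print(tape.gradient(c_[1], x))   # bb - aa = 5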

  • For the more sophisticated issue:
    a Jacobian is needed
import tensorflow as tf

tf.keras.utils.set_random_seed(0)
a = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
b = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
c = [tf.Variable(tf.random.normal(shape=[2])),tf.Variable(tf.random.normal(shape=[1])),tf.Variable(tf.random.normal(shape=[]))]
x = tf.Variable(tf.random.normal(shape=[]), name='x')

with tf.GradientTape(persistent=True) as tape:
    c_ = tf.nest.map_structure(lambda _a, _b: (1-x)*_a+ x*_b, a, b)

    tf.nest.map_structure(lambda x, y: x.assign(y), c, c_)

    loss = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c))
    loss_without_assign = tf.norm(tf.nest.map_structure(lambda e: tf.norm(e), c_))

print(loss, loss_without_assign)
print(tf.reduce_sum([
    tf.reduce_sum(tape.jacobian(c_[0],x)*tape.gradient(loss,c[0])),
    tf.reduce_sum(tape.jacobian(c_[1],x)*tape.gradient(loss,c[1])),
    tf.reduce_sum(tape.jacobian(c_[2],x)*tape.gradient(loss,c[2]))
    ]))
# tf.Tensor(0.7263656, shape=(), dtype=float32)
print(tape.gradient(loss_without_assign, x))
# tf.Tensor(0.7263656, shape=(), dtype=float32)
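
The explicit indexing over the three elements can also be written generically with tf.nest.map_structure; this is a sketch of my own under the assumption that the persistent tape, c, c_, x and loss from the block above are still available:

per_element = tf.nest.map_structure(
    lambda ci, ci_: tf.reduce_sum(tape.jacobian(ci_, x) * tape.gradient(loss, ci)),
    c, c_)
print(tf.reduce_sum(per_element))
# should match tape.gradient(loss_without_assign, x) above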