Trying to learn a deep kernel with TensorFlow, getting "ValueError: No gradients provided for any variable" error


Hi Stack Overflow community,

I am fairly new to TensorFlow and programming, so there may be obvious errors I have simply overlooked.

My dev environment consists of VS Code running in a TensorFlow Docker container + venv (TF 2.5.3, Python 3.8.10) on a Windows machine.

As I have read in countless posts, the error I encounter most likely stems from a disconnect in the TensorFlow graph. However, I can't pinpoint where it happens, or whether I am missing something really basic in my code.

UPDATE

There was a simple error in my code, as pointed out by @xdurch0: I didn't compute the loss function inside the tape scope. However, there was another problem:
There is a bug in the tf.concat method (bug report on GitHub) that is triggered when scalars are concatenated, which led to a cryptic division-by-zero error. This can be circumvented by expanding each scalar to a rank-1 tensor with a single entry, e.g. using tf.expand_dims.
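
For reference, a minimal toy sketch of both fixes (just the two patterns, not my actual code below):

import tensorflow as tf

# Fix 1: compute the loss inside the tape scope; otherwise the tape records
# nothing and tape.gradient returns None for every variable.
h = tf.Variable(0.5)
with tf.GradientTape() as tape:
    loss = h ** 2                      # recorded by the tape
grad = tape.gradient(loss, h)          # -> 1.0

# Fix 2: tf.concat cannot join 0-d (scalar) tensors; expand them to shape (1,) first.
a = tf.constant(1.0)                   # shape ()
b = tf.constant(2.0)                   # shape ()
c = tf.concat([tf.expand_dims(a, axis=0),
               tf.expand_dims(b, axis=0)], axis=0)   # shape (2,)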

Sorry for the wall of code; a detailed description follows:

Description of what I am trying to do:

I first train a model on MNIST, whose hidden layers I would like to examine with a method that relies on the deep kernel I try to learn afterwards.

# this is the code from https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
from __future__ import print_function
import tensorflow.keras as keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D


import tensorflow.compat.v1.keras.backend as K
import tensorflow as tf
import numpy as np  # needed below in gaussian_kernel


batch_size = 128  # needed by model.fit below
num_classes = 10
epochs = 12

# input image dimensions
img_rows, img_cols = 28, 28

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
model.summary()

After the model is successfully trained (this works without a problem), I would like to train the deep kernel on top of it, to give me a measure of similarity between probability distributions on the basis of single examples:
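
Concretely, the kernel I am trying to learn has the form

k_w(x, y) = ((1 - eps0) * kappa_h1(phi(x), phi(y)) + eps0) * q_h2(x, y)

where phi is the output of model.layers[sem_layer], and kappa_h1 and q_h2 are Gaussian kernels with bandwidths h1 and h2; this is what k_w below computes.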

# semantic-aware kernel using the max-pooling layer (model.layers[3])
# parameters w = (eps0, h1, h2); h1: bandwidth for the semantic layer, h2: bandwidth for the raw inputs
sem_layer = 3  # the functions below use this as a default argument, so it must be defined here
# @tf.function
def gaussian_kernel(x, y, h):
    # element-wise Gaussian with bandwidth h; returns the summed value and the full tensor
    k = 1/(tf.sqrt(2*tf.constant(np.pi))) * tf.exp(-(tf.abs(x-y)**2)/(2*h+1e-10))
    return tf.math.reduce_sum(k), k

# @tf.function
def s_f(x, y, h, sem_layer=sem_layer, model=model):
    # Gaussian kernel on the activations of the semantic layer
    # (note: this rebuilds the sub-model on every call, which is wasteful)
    mod = keras.Model(model.inputs, model.layers[sem_layer].output)
    s = gaussian_kernel(mod(x), mod(y), h)
    return s


# @tf.function
def k_w(x, y, h1, h2, eps0, sem_layer=sem_layer, model=model):
    # deep kernel: ((1 - eps0) * semantic kernel + eps0) * input-space Gaussian kernel
    k = ((1-eps0)*s_f(x, y, h1, sem_layer=sem_layer, model=model)[0] + eps0)*gaussian_kernel(x, y, h2)[0]
    return k



# @tf.function
def SAMMD2(x_data, y_data, h1, h2, eps0, sem_layer=sem_layer, model=model):
    '''
    calculates SAMMD^2 value. 
    Returns: SAMMD^2 value as M and array H
    '''
    assert x_data.shape == y_data.shape
    n = x_data.shape[0]
    h = tf.constant(0, tf.float32)
    for i in range(n):
        for j in range(n):
            if i ==0 and j ==0:
                continue
            else:
                h0 = h
                if j==1:
                    if j!=i:
                        h = k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                                + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                                + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                                + k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model)    
                        h = tf.concat([h0,h], axis=0)
                    if j==i:
                        h=tf.constant(0, tf.float32)
                        h = tf.concat([h0,h], axis=0)
                elif j==i:
                    h=tf.constant(0, tf.float32)
                    h = tf.concat([h0,tf.expand_dims(h, axis=0)], axis=0)

                elif j>1:
                    h = k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model)    
                    h = tf.concat([h0,tf.expand_dims(h, axis=0)], axis=0)
                else:
                    h = k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(y_data[i], axis=0),tf.expand_dims(x_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model) \
                            + k_w(tf.expand_dims(x_data[i], axis=0),tf.expand_dims(y_data[j], axis=0), h1, h2, eps0, sem_layer=sem_layer, model=model)    


        if i==0:
            H0 = h
        elif i==1:
            H = tf.concat([H0,h], axis=0)
            H0 = H

        elif i>1:
            H = tf.concat([H0,h], axis=0)
            H0 = H       


    H = tf.reshape(H, shape=(n, n))
    M = (1 / (n * (n - 1))) * tf.math.reduce_sum(H)
    return M, H, n

# @tf.function
def sigma2(H, n, reg_lambda=1e-8):
    # regularized variance estimate of the statistic, used to normalize M below
    H2 = (tf.math.reduce_sum(H, axis=1))**2
    out = (4 / n**3) * tf.math.reduce_sum(H2) - (4 / n**4) * (tf.math.reduce_sum(H))**2 + tf.constant(reg_lambda)
    return out

epochs = 2
sem_layer = 3
x_data = x_test[0:10,...]
y_data = x_test[0:10,...]

# Optimization
optimizer = keras.optimizers.Adam(learning_rate=0.0002)
h1      = tf.Variable(0.5)  
h2      = tf.Variable(0.5)
eps0    = tf.Variable(0.01)
n = 2 # x_data.shape[0]
mod = keras.Model(model.inputs, model.layers[sem_layer].output)
reg_lambda=1e-8


# Training loop
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    
    # for i in range(n):
    with tf.GradientTape() as tape:
        tape.watch([h1, h2, eps0])

        M, H, m = SAMMD2(x_data, y_data, h1, h2, eps0, sem_layer=sem_layer, model=model)
        assert M is not None
        assert H is not None
        V = sigma2(H, m)
        J =  M / (tf.sqrt(V) + 1e-10)
        # cost_value = cost(x_data, y_data) #, h1, h2, eps0, sem_layer=sem_layer)

        

    grads = tape.gradient(-J, [h1, h2, eps0])
    
    optimizer.apply_gradients(zip(grads, [h1, h2, eps0]))

    print(f'epoch {epoch} J value = {J}')
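
(Note, related to the UPDATE above: the scalar tf.concat calls can be avoided altogether by building H with batched ops. The following is only a sketch of that idea, not a drop-in replacement: it uses a plain Gaussian kernel of the squared Euclidean distance on the flattened inputs and drops the 1/sqrt(2*pi) prefactor, so the semantic term from s_f would still have to be folded in analogously.)

def pairwise_sq_dists(a, b):
    # a: (n, d), b: (m, d) -> (n, m) matrix of squared Euclidean distances
    aa = tf.reduce_sum(a * a, axis=1, keepdims=True)   # (n, 1)
    bb = tf.reduce_sum(b * b, axis=1, keepdims=True)   # (m, 1)
    return aa - 2.0 * tf.matmul(a, b, transpose_b=True) + tf.transpose(bb)

def gaussian_gram(a, b, h):
    return tf.exp(-pairwise_sq_dists(a, b) / (2.0 * h + 1e-10))

x_flat = tf.reshape(x_data, (x_data.shape[0], -1))
y_flat = tf.reshape(y_data, (y_data.shape[0], -1))
# sum of the four kernel terms, as in SAMMD2 above, with the diagonal zeroed out
H_vec = (gaussian_gram(x_flat, x_flat, h2) + gaussian_gram(y_flat, y_flat, h2)
         + gaussian_gram(y_flat, x_flat, h2) + gaussian_gram(x_flat, y_flat, h2))
H_vec = H_vec * (1.0 - tf.eye(x_flat.shape[0]))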

When I run the code, an error is raised in the first epoch, when apply_gradients is called:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tf/SHAP/MNIST_LRP.py in <cell line: 2>()
     293     # cost_value = cost(x_data, y_data) #, h1, h2, eps0, sem_layer=sem_layer)
     297 grads = tape.gradient(-J, [h1, h2, eps0])
---> 299 optimizer.apply_gradients(zip(grads, [h1, h2, eps0]))
     301 print(f'epoch {epoch} J value = {J}')

File /tf/.env/shapvenv/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:630, in OptimizerV2.apply_gradients(self, grads_and_vars, name, experimental_aggregate_gradients)
    589 def apply_gradients(self,
    590                     grads_and_vars,
    591                     name=None,
    592                     experimental_aggregate_gradients=True):
    593   """Apply gradients to variables.
    594 
    595   This is the second part of `minimize()`. It returns an `Operation` that
   (...)
    628     RuntimeError: If called in a cross-replica context.
    629   """
--> 630   grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
    631   var_list = [v for (_, v) in grads_and_vars]
    633   with ops.name_scope_v2(self._name):
    634     # Create iteration if necessary.

File /tf/.env/shapvenv/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/utils.py:75, in filter_empty_gradients(grads_and_vars)
     72 filtered = tuple(filtered)
     74 if not filtered:
---> 75   raise ValueError("No gradients provided for any variable: %s." %
     76                    ([v.name for _, v in grads_and_vars],))
     77 if vars_with_empty_grads:
     78   logging.warning(
     79       ("Gradients do not exist for variables %s when minimizing the loss."),
     80       ([v.name for v in vars_with_empty_grads]))

ValueError: No gradients provided for any variable: ['Variable:0', 'Variable:0', 'Variable:0'].

I don't understand how to fix this empty gradient and would be very appreciative of your help.
