如何用 CNN 的预训练词嵌入替换 keras 嵌入

发布于 2025-01-21 03:40:08 字数 1779 浏览 1 评论 0原文

我目前正在研究如何在文本分类中使用 CNN，并在堆栈溢出上找到了一些与 keras 嵌入层一起使用的代码。

我使用 keras 嵌入运行了代码，但现在想测试预训练的嵌入会发生什么，我已经从 gensim 下载了 word2vec api，但不知道如何从那里调整代码？

我的问题是如何用 word2vec 模型或 Glove 等预先训练的嵌入替换 keras 嵌入层？

这是代码

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Using embedding from Keras
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

原文

I am currently studying how CNNs can be used in text classification and found some code on stack overflow that had worked with the use of a keras embedding layer.

I ran the code with the keras embedding but now want to test out what would happen with a pre-trained embedding, I have downloaded the word2vec api from gensim but dont know how to adapt the code from there?

My question is how can I replace the keras embedding layer with a pre-trained embedding like the word2vec model or Glove?

heres is the code

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Using embedding from Keras
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=3, callbacks=[tensorBoardCallback], batch_size=64)

# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

倒数 2025-01-28 03:40:08

这读取包含权重的文本文件，将单词及其权重存储在字典中，然后使用您的拟合令牌的词汇将它们映射到新的矩阵中。

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from tensorflow import keras
import itertools
import numpy as np


# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

word_index = keras.datasets.imdb.get_word_index()

embedding_vecor_length = 300  # same as the embeds to be loaded below
embeddings_dictionary = dict()
glove_file = open('./embeds/glove.6B.300d.txt', 'rb')

for line in glove_file:
    records = line.split()  # seperates each line by a white space
    word = records[0]  # the first element is the word
    vector_dimensions = np.asarray(
        records[1:], dtype='float32')  # the rest are the weights
    # storing in dictionary
    embeddings_dictionary[word] = vector_dimensions
    
glove_file.close()

# len_of_vocab = len(word_index)
embeddings_matrix = np.zeros((top_words, embedding_vecor_length))
# mapping to a new matrix, using only the words in your tokenizer's vocabulary
for word, index in word_index.items():
    if index>=top_words:
        continue
    # the weights of the individual words in your vocabulary
    embedding_vector = embeddings_dictionary.get(bytes(word, 'utf-8'))
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


# Using embedding from Keras
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length,
          input_length=max_review_length, name="embeddinglayer", weights=[embeddings_matrix], trainable=True))


# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=3, callbacks=[
          tensorBoardCallback], batch_size=64)


# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

This reads the text file containing the weights, stores the words and their weights in a dictionary, then maps them into a new matrix using the vocabulary of your fit tokenizer.

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from tensorflow import keras
import itertools
import numpy as np


# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

word_index = keras.datasets.imdb.get_word_index()

embedding_vecor_length = 300  # same as the embeds to be loaded below
embeddings_dictionary = dict()
glove_file = open('./embeds/glove.6B.300d.txt', 'rb')

for line in glove_file:
    records = line.split()  # seperates each line by a white space
    word = records[0]  # the first element is the word
    vector_dimensions = np.asarray(
        records[1:], dtype='float32')  # the rest are the weights
    # storing in dictionary
    embeddings_dictionary[word] = vector_dimensions
    
glove_file.close()

# len_of_vocab = len(word_index)
embeddings_matrix = np.zeros((top_words, embedding_vecor_length))
# mapping to a new matrix, using only the words in your tokenizer's vocabulary
for word, index in word_index.items():
    if index>=top_words:
        continue
    # the weights of the individual words in your vocabulary
    embedding_vector = embeddings_dictionary.get(bytes(word, 'utf-8'))
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

# Pad the sequence to the same length
max_review_length = 1600
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)


# Using embedding from Keras
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length,
          input_length=max_review_length, name="embeddinglayer", weights=[embeddings_matrix], trainable=True))


# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same'))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=3, callbacks=[
          tensorBoardCallback], batch_size=64)


# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

回复收藏 0 原文

~没有更多了~