Next sentence prediction code gives random output on every run



Based on the code provided below, I am trying to run NSP (Next Sentence Prediction) on a custom dataset. The loss after training the model is different every time, and the model gives different accuracies every time. What am I missing or doing wrong?

# install the dependency first (shell / notebook cell): pip install transformers[torch]
from transformers import BertTokenizer, BertForNextSentencePrediction 
import torch  
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
with open('clean.txt', 'r') as fp:
    text = fp.read().split('\n')
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)
import random
 
sentence_a = []
sentence_b = []
label = []
 
for paragraph in text:
    sentences = [
        sentence for sentence in paragraph.split('.') if sentence != ''
    ]
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)
 
# tokenize the sentence pairs; labels: 0 = IsNextSentence, 1 = NotNextSentence
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs['labels'] = torch.LongTensor([label]).T

class MeditationsDataset(torch.utils.data.Dataset):
    """Wraps the tokenizer output so it can be served by a DataLoader."""
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        # the encodings are already tensors, so index into them directly
        return {key: val[idx] for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
 
dataset = MeditationsDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
from transformers import AdamW
 
# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)
 
from tqdm import tqdm  # for our progress bar
 
epochs = 2
 
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In the code below I am testing the model on unseen data:

from torch.nn.functional import softmax

prompt = "sentence 1 text"
prompt2 = "sentence 2 text"
# keep the inputs on the same device as the model
output = tokenizer.encode_plus(prompt, prompt2, return_tensors="pt").to(device)
result = model(**output)[0]
prob = softmax(result, dim=1)
print(prob)

So the values of prob and the loss are different every single time for the same unseen data, which, to the best of my knowledge, should be the same.


Comments (1)

孤独难免 2025-02-11 03:03:40


You need to put the model in evaluation mode. If the model contains layers such as dropout, they should be switched off while testing.
You can do this with

model.eval()

If you don't do this, you will get a different output and loss value on every run, because the dropout layers in the model will drop different neurons each time.
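
For illustration, here is a minimal sketch of the evaluation step with dropout disabled, assuming the tokenizer, model, and device objects defined in the question; the two prompt strings are just the placeholder sentences from the question.

import torch
from torch.nn.functional import softmax

# switch off dropout and other train-only behaviour
model.eval()

prompt = "sentence 1 text"
prompt2 = "sentence 2 text"
# keep the inputs on the same device as the model
encoded = tokenizer(prompt, prompt2, return_tensors="pt").to(device)

# no gradients are needed for evaluation
with torch.no_grad():
    logits = model(**encoded).logits  # shape (1, 2): index 0 = IsNextSentence, index 1 = NotNextSentence

prob = softmax(logits, dim=1)
print(prob)  # identical on every run for the same sentence pair

With the model in eval mode and gradients switched off, repeated runs on the same sentence pair produce identical probabilities. Call model.train() again before resuming fine-tuning.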
