berttokenizer错误值:输入NAN无效。应该是字符串,字符串的列表/元组或整数的列表/元组

发布于 2025-01-22 07:44:57 字数 4181 浏览 0 评论 0原文

import pandas as pd
from sklearn.model_selection import train_test_split

# read text data
df = pd.read_csv('E:/bert4keras-master/resume_data/111.txt', header=None,encoding='utf-8', sep='\t',names=['label', 'sentence'])

print(df)


# split text data
train, valid_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=123, stratify=df['label'])
print(valid_test.head)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['label'])
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)


class CreateDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_len):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):  # len(Dataset)
    return len(self.y)

  def __getitem__(self, index):  # Dataset[index]
    text = self.X[index]
    inputs = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      'labels': torch.Tensor(self.y[index])
    }




    # label one-hot
y_train = pd.get_dummies(train, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_valid = pd.get_dummies(valid, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_test = pd.get_dummies(test, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values

# make dataset 


max_len = 256

tokenizer = BertTokenizer.from_pretrained('E:/bert4keras-master/pytorch_bert_large/')
dataset_train = CreateDataset(train['sentence'], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid['sentence'], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test['sentence'], y_test, tokenizer, max_len)

      # dataloader
      dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
      dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)

当我尝试训练模型log = trial_model(dataset_train,dataset_valid,batch_size,batch_size,model,criterion,ipementizer,num_epochs,device = device =设备>

错误

>>> log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 29, in train_model
  File "<stdin>", line 8, in calculate_loss_and_accuracy
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
    data = self._next_data()
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<stdin>", line 11, in __getitem__
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils_base.py", line 2556, in encode_plus
    return self._encode_plus(
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 647, in _encode_plus
    first_ids = get_input_ids(text)
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 634, in get_input_ids
    raise ValueError(
ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

,我非常感谢您,请非常感谢,我会遇到此错误。

import pandas as pd
from sklearn.model_selection import train_test_split

# read text data
df = pd.read_csv('E:/bert4keras-master/resume_data/111.txt', header=None,encoding='utf-8', sep='\t',names=['label', 'sentence'])

print(df)


# split text data
train, valid_test = train_test_split(df, test_size=0.3, shuffle=True, random_state=123, stratify=df['label'])
print(valid_test.head)
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['label'])
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)


class CreateDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_len):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):  # len(Dataset)
    return len(self.y)

  def __getitem__(self, index):  # Dataset[index]
    text = self.X[index]
    inputs = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      'labels': torch.Tensor(self.y[index])
    }




    # label one-hot
y_train = pd.get_dummies(train, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_valid = pd.get_dummies(valid, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values
y_test = pd.get_dummies(test, columns=['label'])[['label_Exp','label_PI','label_Sum','label_Edu', 'label_QC', 'label_Skill', 'label_Obj']].values

# make dataset 


max_len = 256

tokenizer = BertTokenizer.from_pretrained('E:/bert4keras-master/pytorch_bert_large/')
dataset_train = CreateDataset(train['sentence'], y_train, tokenizer, max_len)
dataset_valid = CreateDataset(valid['sentence'], y_valid, tokenizer, max_len)
dataset_test = CreateDataset(test['sentence'], y_test, tokenizer, max_len)

      # dataloader
      dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
      dataloader_valid = DataLoader(dataset_valid, batch_size=len(dataset_valid), shuffle=False)

and I get this error when I try to train model log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)

ERROR

>>> log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, device=device)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 29, in train_model
  File "<stdin>", line 8, in calculate_loss_and_accuracy
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
    data = self._next_data()
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<stdin>", line 11, in __getitem__
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils_base.py", line 2556, in encode_plus
    return self._encode_plus(
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 647, in _encode_plus
    first_ids = get_input_ids(text)
  File "E:\anaconda3\envs\py38pytorch\lib\site-packages\transformers\tokenization_utils.py", line 634, in get_input_ids
    raise ValueError(
ValueError: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.

any suggestions please Thank you very much

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

假装不在乎 2025-01-29 07:44:57

是的应该调试
输入= self.tokenizer.encode_plus(
文本,
add_special_tokens = true,
max_length = self.max_len,
pad_to_to_max_length = true

“文本”应该字符串或列表字符串...

yoy should debug in
inputs = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
pad_to_max_length=True
)
'text' should string or list string...

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文