Hugginface Dataloader BERT ValueError:解压值太多(预期为 2),使用 Pytorch 调整 AX 超参数

发布于 2025-01-12 18:45:36 字数 5889 浏览 0 评论 0原文

从一周开始我就遇到了这个错误,我尝试了一切,所以事实是我没有深入理解正在发生的事情(我是 pytorch 实现的新手)。无论如何,我正在尝试实现一个 Bert 分类器来区分 2 个序列类,并使用 AX 超参数调整。 这是我的数据集样本预期实现的所有代码(我有 3 个 csv,train-test-val)。非常感谢 !

                                           0        1
M A T T D R P T P D G T D A I D L T T R V R R...    1
M K K L F Q T E P L L E L F N C N E L R I I G...    0
M L V A A A V C P H P P L L I P E L A A G A A...    1
M I V A W G N S G S G L L I L I L S L A V S A...    0
M V E E G R R L A A L H P N I V V K L P T T E...    1
M G S K V S K N A L V F N V L Q A L R E G L T...    1
M P S K E T S P A E R M A R D E Y Y M R L A M...    1
M V K E Y A L E W I D G Y R E R L V K V S D A...    1
M G T A A S Q D R A A M A E A A Q R V G D S F...    0

df_train=pd.read_csv('CLASSIFIER_train',sep=',',header=None)
df_train
class SequenceDataset(Dataset):

  def __init__(self, sequences, targets, tokenizer, max_len):
    self.sequences = sequences
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.sequences)
  
  def __getitem__(self, item):
    sequences = str(self.sequences[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      sequences,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )


    return {
      'sequences_text': sequences,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = SequenceDataset(
    sequences=df[0].to_numpy(),
    targets=df[1].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2,
    shuffle=True
  )

BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

def net_train(net, train_data_loader, parameters, dtype, device):
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr", 0.001), # 0.001 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 30)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number
  # Train Network
  for _ in range(num_epochs):
      for inputs, labels in train_data_loader:
          # move data to proper dtype and device
          inputs = inputs.to(dtype=dtype, device=device)
          labels = labels.to(device=device)

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          scheduler.step()
  return net
  
def init_net(parameterization):

    model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) 

    # The depth of unfreezing is also a hyperparameter
    for param in model.parameters():
        param.requires_grad = False # Freeze feature extractor
        
    Hs = 512 # Hidden layer size; you can optimize this as well
                                  
    model.fc = nn.Sequential(nn.Linear(2048, Hs), # attach trainable classifier
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(Hs, 10),
                                 nn.LogSoftmax(dim=1))
    return model # return untrained model

def train_evaluate(parameterization):

    # constructing a new training data loader allows us to tune the batch size


    train_data_loader=create_data_loader(df_train, tokenizer, MAX_LEN, batch_size=parameterization.get("batchsize", 32))
    
    
    # Get neural net
    untrained_net = init_net(parameterization) 
    
    # train
    trained_net = net_train(net=untrained_net, train_data_loader=train_data_loader, 
                            parameters=parameterization, dtype=dtype, device=device)
    
    # return the accuracy of the model as it was trained in this run
    return evaluate(
        net=trained_net,
        data_loader=test_data_loader,
        dtype=dtype,
        device=device,
    )

classes=('0','1')

dtype = torch.float
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_parameters, values, experiment, model = optimize(
    parameters=[
        {"name": "lr", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True},
        {"name": "batchsize", "type": "range", "bounds": [16, 128]},
        {"name": "momentum", "type": "range", "bounds": [0.0, 1.0]},
        #{"name": "max_epoch", "type": "range", "bounds": [1, 30]},
        #{"name": "stepsize", "type": "range", "bounds": [20, 40]},        
    ],
  
    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

print(best_parameters)
means, covariances = values
print(means)
print(covariances)
  File "<ipython-input-71-e52ebc0d7b5b>", line 14, in train_evaluate
    parameters=parameterization, dtype=dtype, device=device)
  File "<ipython-input-61-66c57e7138fa>", line 20, in net_train
    for inputs, labels in train_data_loader:
ValueError: too many values to unpack (expected 2)

I'm stacked with this error from one week now, I tried everything so the fact is that I'm not understanding deeply what is happening (I'm new at pytorch implementation). Anyway I'm trying to implement a Bert Classifier to discriminate between 2 sequences classes, with AX hyperparameters tuning.
This is all my code implemented anticipated by a sample of my datasets ( I have 3 csv, train-test-val). Thank you very much !

                                           0        1
M A T T D R P T P D G T D A I D L T T R V R R...    1
M K K L F Q T E P L L E L F N C N E L R I I G...    0
M L V A A A V C P H P P L L I P E L A A G A A...    1
M I V A W G N S G S G L L I L I L S L A V S A...    0
M V E E G R R L A A L H P N I V V K L P T T E...    1
M G S K V S K N A L V F N V L Q A L R E G L T...    1
M P S K E T S P A E R M A R D E Y Y M R L A M...    1
M V K E Y A L E W I D G Y R E R L V K V S D A...    1
M G T A A S Q D R A A M A E A A Q R V G D S F...    0

df_train=pd.read_csv('CLASSIFIER_train',sep=',',header=None)
df_train
class SequenceDataset(Dataset):

  def __init__(self, sequences, targets, tokenizer, max_len):
    self.sequences = sequences
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.sequences)
  
  def __getitem__(self, item):
    sequences = str(self.sequences[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      sequences,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )


    return {
      'sequences_text': sequences,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = SequenceDataset(
    sequences=df[0].to_numpy(),
    targets=df[1].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2,
    shuffle=True
  )

BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

def net_train(net, train_data_loader, parameters, dtype, device):
  net.to(dtype=dtype, device=device)

  # Define loss and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), # or any optimizer you prefer 
                        lr=parameters.get("lr", 0.001), # 0.001 is used if no lr is specified
                        momentum=parameters.get("momentum", 0.9)
  )

  scheduler = optim.lr_scheduler.StepLR(
      optimizer,
      step_size=int(parameters.get("step_size", 30)),
      gamma=parameters.get("gamma", 1.0),  # default is no learning rate decay
  )

  num_epochs = parameters.get("num_epochs", 3) # Play around with epoch number
  # Train Network
  for _ in range(num_epochs):
      for inputs, labels in train_data_loader:
          # move data to proper dtype and device
          inputs = inputs.to(dtype=dtype, device=device)
          labels = labels.to(device=device)

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          scheduler.step()
  return net
  
def init_net(parameterization):

    model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) 

    # The depth of unfreezing is also a hyperparameter
    for param in model.parameters():
        param.requires_grad = False # Freeze feature extractor
        
    Hs = 512 # Hidden layer size; you can optimize this as well
                                  
    model.fc = nn.Sequential(nn.Linear(2048, Hs), # attach trainable classifier
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(Hs, 10),
                                 nn.LogSoftmax(dim=1))
    return model # return untrained model

def train_evaluate(parameterization):

    # constructing a new training data loader allows us to tune the batch size


    train_data_loader=create_data_loader(df_train, tokenizer, MAX_LEN, batch_size=parameterization.get("batchsize", 32))
    
    
    # Get neural net
    untrained_net = init_net(parameterization) 
    
    # train
    trained_net = net_train(net=untrained_net, train_data_loader=train_data_loader, 
                            parameters=parameterization, dtype=dtype, device=device)
    
    # return the accuracy of the model as it was trained in this run
    return evaluate(
        net=trained_net,
        data_loader=test_data_loader,
        dtype=dtype,
        device=device,
    )

classes=('0','1')

dtype = torch.float
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

best_parameters, values, experiment, model = optimize(
    parameters=[
        {"name": "lr", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True},
        {"name": "batchsize", "type": "range", "bounds": [16, 128]},
        {"name": "momentum", "type": "range", "bounds": [0.0, 1.0]},
        #{"name": "max_epoch", "type": "range", "bounds": [1, 30]},
        #{"name": "stepsize", "type": "range", "bounds": [20, 40]},        
    ],
  
    evaluation_function=train_evaluate,
    objective_name='accuracy',
)

print(best_parameters)
means, covariances = values
print(means)
print(covariances)
  File "<ipython-input-71-e52ebc0d7b5b>", line 14, in train_evaluate
    parameters=parameterization, dtype=dtype, device=device)
  File "<ipython-input-61-66c57e7138fa>", line 20, in net_train
    for inputs, labels in train_data_loader:
ValueError: too many values to unpack (expected 2)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

怪我闹别瞎闹 2025-01-19 18:45:36

您的数据加载器返回一个字典,因此您循环和访问它的方式是错误的,应该这样做:

# Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
          # move data to proper dtype and device
          labels = batch['targets'].to(device=device)
          atten_mask = batch['attention_mask'].to(device=device)
          input_ids = batch['input_ids'].to(device=device)

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(input_ids, attention_mask=atten_mask)

your dataloader returns a dictionary therefore the way you loop and access it is wrong should be done as such:

# Train Network
  for _ in range(num_epochs):
      # Your dataloader returns a dictionary
      # so access it as such
      for batch in train_data_loader:
          # move data to proper dtype and device
          labels = batch['targets'].to(device=device)
          atten_mask = batch['attention_mask'].to(device=device)
          input_ids = batch['input_ids'].to(device=device)

          # zero the parameter gradients
          optimizer.zero_grad()

          # forward + backward + optimize
          outputs = net(input_ids, attention_mask=atten_mask)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文