Runtimeerror:批次列表中的每个元素在Bert中的大小应相等

发布于 2025-02-08 11:17:26 字数 6823 浏览 2 评论 0原文

现在我想训练伯特模型。
但是发生RuntimeError。
我不知道如何解决此错误。
请给我一些帮助。

这是我的代码。

我在 Python = 3.8.10 Pytorch = 1.8.0

我使用了IMDB数据集。

我尝试了更改版本并检查数据。 数据集是可变数据。

如何处理可变数据? 请给我一些提示。

def pretraining(
    model: MLMandNSPmodel,
    model_name: str,
    train_dataset: PretrainDataset,
    val_dataset: PretrainDataset,
):
    # Below options are just our recommendation. You can choose different options if you want.
    batch_size = 8
    learning_rate = 1e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epochs = 200 # 200 if you want to feel the effect of pretraining
    steps_per_a_epoch: int=2000
    steps_for_val: int=200

    ### YOUR CODE HERE
    # pretraining(model, model_name, train_dataset, val_dataset)
    
    MLM_train_losses: List[float] = None
    MLM_val_losses: List[float] = None
    NSP_train_losses: List[float] = None
    NSP_val_losses: List[float] = None
    MLM_train_losses = []
    MLM_val_losses = []
    NSP_train_losses = []
    NSP_val_losses = []
    
    print('')
    print(train_dataset)
    print(val_dataset)
    print('')
    
    train_data_iterator = iter(
        torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
    # train_data_iterator = torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
    eval_data_iterator = iter(
        torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
    # eval_data_iterator = torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
    
    loss_log = tqdm(total=0, bar_format='{desc}')
    i = 0
    for epoch in trange(epochs, desc="Epoch", position=0):
        i += 1
        # Run batches for 'steps_per_a_epoch' times
        MLM_loss = 0
        NSP_loss = 0
        model.train()
        for step in trange(steps_per_a_epoch, desc="Training steps"):
            optimizer.zero_grad()
            src, mlm, mask, nsp = next(train_data_iterator)
            mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
            MLM_loss += mlm_loss
            NSP_loss += nsp_loss
            loss = mlm_loss + nsp_loss
            loss.backward()
            optimizer.step()
            des = 'Loss: {:06.4f}'.format(loss.cpu())
            loss_log.set_description_str(des)

        # Calculate training loss
        MLM_loss = MLM_loss / steps_per_a_epoch
        NSP_loss = NSP_loss / steps_per_a_epoch
        MLM_train_losses.append(float(MLM_loss.data))
        NSP_train_losses.append(float(NSP_loss.data))

        # Calculate valid loss
        model.eval()
        valid_mlm_loss = 0.
        valid_nsp_loss = 0.

        for step in trange(steps_for_val, desc="Evaluation steps"):
            src, mlm, mask, nsp = next(eval_data_iterator)
            mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
            valid_mlm_loss += mlm_loss
            valid_nsp_loss += nsp_loss

        valid_mlm_loss = valid_mlm_loss / steps_for_val
        valid_nsp_loss = valid_nsp_loss / steps_for_val

        MLM_val_losses.append(float(valid_mlm_loss.data))
        NSP_val_losses.append(float(valid_nsp_loss.data))
        torch.save(model.state_dict(), os.path.join('/home/ml/Desktop/song/HW3/hw3/',model_name + str(i)+'.pth'))


    ### END YOUR CODE

    assert len(MLM_train_losses) == len(MLM_val_losses) == epochs and \
           len(NSP_train_losses) == len(NSP_val_losses) == epochs

    assert all(isinstance(loss, float) for loss in MLM_train_losses) and \
           all(isinstance(loss, float) for loss in MLM_val_losses) and \
           all(isinstance(loss, float) for loss in NSP_train_losses) and \
           all(isinstance(loss, float) for loss in NSP_val_losses)

    return MLM_train_losses, MLM_val_losses, NSP_train_losses, NSP_val_losses

这是错误消息

(hw3) ml@automl03:~/Desktop/song/HW3/hw3$ python pretrain.py
======MLM & NSP Pretraining======

<__main__.PretrainDataset object at 0x7fa72117eb80>
<__main__.PretrainDataset object at 0x7fa70d0a2a30>


<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3640>
<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3670>

Training steps:   0%|                                                                                                                                                                  | 0/2000 [00:00<?, ?it/s]
Epoch:   0%|                                                                                                                                                                            | 0/200 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "pretrain.py", line 527, in <module>
    pretrain_model()
  File "pretrain.py", line 505, in pretrain_model
    = pretraining(model, model_name, train_dataset, val_dataset)
  File "pretrain.py", line 316, in pretraining
    src, mlm, mask, nsp = next(train_data_iterator)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
    data = self._next_data()
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
    return self._process_data(data)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
    data.reraise()
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/_utils.py", line 429, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 35, in fetch
    return self.collate_fn(data)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 81, in default_collate
    raise RuntimeError('each element in list of batch should be of equal size')
RuntimeError: each element in list of batch should be of equal siz

Now i want to train bert model.
But runtimeError occured.
I don't know how to solve this error.
Please give me some help.

Here is my code.

I excute this code in
python = 3.8.10
pytorch = 1.8.0

I used IMDB datasets.

I tried change version and check data.
And the dataset is variable data.

How to handle variable data?
Please give me some tips.

def pretraining(
    model: MLMandNSPmodel,
    model_name: str,
    train_dataset: PretrainDataset,
    val_dataset: PretrainDataset,
):
    # Below options are just our recommendation. You can choose different options if you want.
    batch_size = 8
    learning_rate = 1e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epochs = 200 # 200 if you want to feel the effect of pretraining
    steps_per_a_epoch: int=2000
    steps_for_val: int=200

    ### YOUR CODE HERE
    # pretraining(model, model_name, train_dataset, val_dataset)
    
    MLM_train_losses: List[float] = None
    MLM_val_losses: List[float] = None
    NSP_train_losses: List[float] = None
    NSP_val_losses: List[float] = None
    MLM_train_losses = []
    MLM_val_losses = []
    NSP_train_losses = []
    NSP_val_losses = []
    
    print('')
    print(train_dataset)
    print(val_dataset)
    print('')
    
    train_data_iterator = iter(
        torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
    # train_data_iterator = torch.utils.data.dataloader.DataLoader(train_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
    eval_data_iterator = iter(
        torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False))
    # eval_data_iterator = torch.utils.data.dataloader.DataLoader(val_dataset, batch_size=batch_size, num_workers=2, shuffle=False)
    
    loss_log = tqdm(total=0, bar_format='{desc}')
    i = 0
    for epoch in trange(epochs, desc="Epoch", position=0):
        i += 1
        # Run batches for 'steps_per_a_epoch' times
        MLM_loss = 0
        NSP_loss = 0
        model.train()
        for step in trange(steps_per_a_epoch, desc="Training steps"):
            optimizer.zero_grad()
            src, mlm, mask, nsp = next(train_data_iterator)
            mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
            MLM_loss += mlm_loss
            NSP_loss += nsp_loss
            loss = mlm_loss + nsp_loss
            loss.backward()
            optimizer.step()
            des = 'Loss: {:06.4f}'.format(loss.cpu())
            loss_log.set_description_str(des)

        # Calculate training loss
        MLM_loss = MLM_loss / steps_per_a_epoch
        NSP_loss = NSP_loss / steps_per_a_epoch
        MLM_train_losses.append(float(MLM_loss.data))
        NSP_train_losses.append(float(NSP_loss.data))

        # Calculate valid loss
        model.eval()
        valid_mlm_loss = 0.
        valid_nsp_loss = 0.

        for step in trange(steps_for_val, desc="Evaluation steps"):
            src, mlm, mask, nsp = next(eval_data_iterator)
            mlm_loss, nsp_loss = calculate_losses(model, src, mlm, mask, nsp)
            valid_mlm_loss += mlm_loss
            valid_nsp_loss += nsp_loss

        valid_mlm_loss = valid_mlm_loss / steps_for_val
        valid_nsp_loss = valid_nsp_loss / steps_for_val

        MLM_val_losses.append(float(valid_mlm_loss.data))
        NSP_val_losses.append(float(valid_nsp_loss.data))
        torch.save(model.state_dict(), os.path.join('/home/ml/Desktop/song/HW3/hw3/',model_name + str(i)+'.pth'))


    ### END YOUR CODE

    assert len(MLM_train_losses) == len(MLM_val_losses) == epochs and \
           len(NSP_train_losses) == len(NSP_val_losses) == epochs

    assert all(isinstance(loss, float) for loss in MLM_train_losses) and \
           all(isinstance(loss, float) for loss in MLM_val_losses) and \
           all(isinstance(loss, float) for loss in NSP_train_losses) and \
           all(isinstance(loss, float) for loss in NSP_val_losses)

    return MLM_train_losses, MLM_val_losses, NSP_train_losses, NSP_val_losses

And this is error message

(hw3) ml@automl03:~/Desktop/song/HW3/hw3$ python pretrain.py
======MLM & NSP Pretraining======

<__main__.PretrainDataset object at 0x7fa72117eb80>
<__main__.PretrainDataset object at 0x7fa70d0a2a30>


<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3640>
<torch.utils.data.dataloader._MultiProcessingDataLoaderIter object at 0x7fa70c8b3670>

Training steps:   0%|                                                                                                                                                                  | 0/2000 [00:00<?, ?it/s]
Epoch:   0%|                                                                                                                                                                            | 0/200 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "pretrain.py", line 527, in <module>
    pretrain_model()
  File "pretrain.py", line 505, in pretrain_model
    = pretraining(model, model_name, train_dataset, val_dataset)
  File "pretrain.py", line 316, in pretraining
    src, mlm, mask, nsp = next(train_data_iterator)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
    data = self._next_data()
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1199, in _next_data
    return self._process_data(data)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1225, in _process_data
    data.reraise()
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/_utils.py", line 429, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 202, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 35, in fetch
    return self.collate_fn(data)
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 83, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/ml/anaconda3/envs/hw3/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 81, in default_collate
    raise RuntimeError('each element in list of batch should be of equal size')
RuntimeError: each element in list of batch should be of equal siz

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文