如何在 PyTorch 中的复杂（嵌套）模块中有效地初始化（并检查健全性）层的权重？

发布于 2025-01-12 16:37:43 字数 6504 浏览 1 评论 0原文

寻找一种有效的方法来访问嵌套模块和层来设置权重

我正在复制DCGAN Paper 和我的代码按预期工作。我发现论文的作者是这么说的：

所有权重均从零中心正态分布初始化标准差0.02

这个很棒的答案解释了可以使用 torch.nn.init.normal_(nn.Conv2d(1,1,1, 1,1 ).weight.data, 0.0, 0.02) 但我有复杂使用 ModuleList 等结构。做到这一点最有效的方法是什么？

通过复杂，请查看下面的代码来了解我的实现：

'''
Implement the Deep Convolution Gan AKA DCGAN in Pytorch: Paper at https://arxiv.org/pdf/1511.06434v2.pdf
'''
import torch
import torch.nn as nn


class GeneratorBlock(nn.Module):
    '''
    Generator Block uses TransposedConv2D -> Batch Norm (except LAST block) -> Relu
    Note: kernel_size = 4, stride = 2, padding = 1 is used in the paper. When BatchNorm is used, Bias is not used for Conv2D
    '''
    def __init__(self, in_channels, out_channels, kernel_size = 4, stride = 2, padding = 1, use_batchnorm:bool = True):
        super().__init__()
        self.use_batchnorm = use_batchnorm
        self.transpose_conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size = kernel_size, stride=stride, padding=padding, bias = not self.use_batchnorm)
        self.batch_norm = nn.BatchNorm2d(out_channels) if self.use_batchnorm else None
        self.activation = nn.ReLU() # Paper uses Relu in Generator Network
    
    def forward(self, x):
        x = self.transpose_conv(x)
        return self.activation(self.batch_norm(x)) if self.use_batchnorm else self.activation(x)


class Generator(nn.Module):
    '''
    Generate Images using Transposed Convolution. Input is a random noise of [Batch, 100, 1,1] Dimension and then upsampled
    '''
    def __init__(self, input_features = 100, base_feature = 128, final_channels:int = 1):
        '''
        We use nn.Sequantial here just to show the workings. If you want to make the layers dynamically using a loop, find nn.ModuleList() in the Descriminator block. Both works same
        So we'll use 'base_feature = 64' as a base for input and output channels
        args:
            input_features: The shape of Random Noise from which an image will be generated
            base_feature: The shape of feature map or number or channels which will act as out base. Other inputs and outputs will be calculated based on this
            final_channels: The channels / features which will be sent to the Discriminator as an input
        '''
        super(Generator, self).__init__()

        # in Descriminator, we do the same work using ModuleList(). Uses 4 blocks
        self.blocks = nn.Sequential(
            GeneratorBlock(in_channels = input_features, out_channels = base_feature * 8, stride = 1, padding = 0), # from Random Noise, Generate 1024 features
            GeneratorBlock(in_channels = base_feature * 8, out_channels = base_feature * 4), # 1024 -> 512 features
            GeneratorBlock(in_channels = base_feature * 4, out_channels = base_feature * 2), # 512 -> 256 features
            GeneratorBlock(in_channels = base_feature * 2, out_channels = base_feature), # 256 -> 128 features
            nn.ConvTranspose2d(base_feature, final_channels, kernel_size = 4, stride = 2, padding = 1)# 128 -> final feature. It is just GeneratorBlock without ReLu and BatchNorm ;)
        )
        self.activation = nn.Tanh() # To make the outputs between [-1,1]
    
    def forward(self, x):
        '''
        Takes Random Noise as input and Generte features from that
        '''
        return self.activation(self.blocks(x))
    

class DiscriminatorBlock(nn.Module):
    '''
    Discriminator Block uses Conv2D -> Batch Norm (except FIRST block) -> LeakyRelu
    Note: kernel_size = 4, stride = 2, padding = 1 is used in the paper. When BatchNorm is used, Bias is not used for Conv2D
    '''
    def __init__(self, in_channels, out_channels, kernel_size = 4, stride = 2, padding = 1, use_batchnorm:bool = True):
        super().__init__()
        self.use_batchnorm = use_batchnorm
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias = not self.use_batchnorm)
        self.batch_norm = nn.BatchNorm2d(out_channels) if self.use_batchnorm else None
        self.activation = nn.LeakyReLU(0.2)
    
    def forward(self, x):
        x = self.conv(x)
        return self.activation(self.batch_norm(x)) if self.use_batchnorm else self.activation(x)
    

class Discriminator(nn.Module):
    '''
    CNNs to classify whether the image generated by the Generator are as good as the real ones
    Feature Changes as :: 1 -> 64 -> 128 -> 256 -> 512 -> 1
    '''
    def __init__(self, input_features = 1, output_features = 1,  middle_features = [64,128,256]):
        '''
        In the paper, they take in a feature of [Batch, 1, 64, 64] from the Generator and then output a single number per sample in the batch
        '''
        super().__init__()
        self.layers = nn.ModuleList() # Just a fancy method of stacking layers using loop

        # in the paper, the first layer does not use BatchNorm
        self.layers.append(DiscriminatorBlock(input_features, middle_features[0], use_batchnorm = False)) #  1 -> 64 Because the input has 1 channel

        for i, channel in enumerate(middle_features): # total 4 blocks are used in paper. 1 has already been used in the line above. 3 blocks are these
            self.layers.append(DiscriminatorBlock(channel, channel*2)) # 64 -> 128 --- 128 -> 256 --- 256 -> 512

        self.final_conv = nn.Conv2d(in_channels = middle_features[-1]*2,  out_channels = output_features, kernel_size = 4, stride = 2,  padding = 0) # Input from previous layer 512 -> 1
        self.sigmoid_layer = nn.Sigmoid() # gives whether an image is real or fake or more precisely, how CLOSE is it to the real image

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        
        return self.sigmoid_layer(self.final_conv(x))


def test_DCGAN_code():
    noise = torch.rand(10,100,1,1)
    image = Generator()(noise)
    result = Discriminator()(image)
    print('Model Built Successfully!!! Generating 10 random samples and their end results')
    print(f"'Z' random Noise shape: {noise.shape} || Generator output shape: {image.shape} || Discriminator shape: {result.shape}")

原文

Looking for an efficient way to access nested Modules and Layers to set the weights

I am replicating the DCGAN Paper and my code works as expected. I found out that in the paper, the authors said that:

All weights were initialized from a zero-centered Normal distribution
with standard deviation 0.02

This awesome answer explains that it can be done using torch.nn.init.normal_(nn.Conv2d(1,1,1, 1,1 ).weight.data, 0.0, 0.02) but I have complex structure using ModuleList and others. What is the most efficient way of doing this?

By Complex, please look at the code below for my implementation:

'''
Implement the Deep Convolution Gan AKA DCGAN in Pytorch: Paper at https://arxiv.org/pdf/1511.06434v2.pdf
'''
import torch
import torch.nn as nn


class GeneratorBlock(nn.Module):
    '''
    Generator Block uses TransposedConv2D -> Batch Norm (except LAST block) -> Relu
    Note: kernel_size = 4, stride = 2, padding = 1 is used in the paper. When BatchNorm is used, Bias is not used for Conv2D
    '''
    def __init__(self, in_channels, out_channels, kernel_size = 4, stride = 2, padding = 1, use_batchnorm:bool = True):
        super().__init__()
        self.use_batchnorm = use_batchnorm
        self.transpose_conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size = kernel_size, stride=stride, padding=padding, bias = not self.use_batchnorm)
        self.batch_norm = nn.BatchNorm2d(out_channels) if self.use_batchnorm else None
        self.activation = nn.ReLU() # Paper uses Relu in Generator Network
    
    def forward(self, x):
        x = self.transpose_conv(x)
        return self.activation(self.batch_norm(x)) if self.use_batchnorm else self.activation(x)


class Generator(nn.Module):
    '''
    Generate Images using Transposed Convolution. Input is a random noise of [Batch, 100, 1,1] Dimension and then upsampled
    '''
    def __init__(self, input_features = 100, base_feature = 128, final_channels:int = 1):
        '''
        We use nn.Sequantial here just to show the workings. If you want to make the layers dynamically using a loop, find nn.ModuleList() in the Descriminator block. Both works same
        So we'll use 'base_feature = 64' as a base for input and output channels
        args:
            input_features: The shape of Random Noise from which an image will be generated
            base_feature: The shape of feature map or number or channels which will act as out base. Other inputs and outputs will be calculated based on this
            final_channels: The channels / features which will be sent to the Discriminator as an input
        '''
        super(Generator, self).__init__()

        # in Descriminator, we do the same work using ModuleList(). Uses 4 blocks
        self.blocks = nn.Sequential(
            GeneratorBlock(in_channels = input_features, out_channels = base_feature * 8, stride = 1, padding = 0), # from Random Noise, Generate 1024 features
            GeneratorBlock(in_channels = base_feature * 8, out_channels = base_feature * 4), # 1024 -> 512 features
            GeneratorBlock(in_channels = base_feature * 4, out_channels = base_feature * 2), # 512 -> 256 features
            GeneratorBlock(in_channels = base_feature * 2, out_channels = base_feature), # 256 -> 128 features
            nn.ConvTranspose2d(base_feature, final_channels, kernel_size = 4, stride = 2, padding = 1)# 128 -> final feature. It is just GeneratorBlock without ReLu and BatchNorm ;)
        )
        self.activation = nn.Tanh() # To make the outputs between [-1,1]
    
    def forward(self, x):
        '''
        Takes Random Noise as input and Generte features from that
        '''
        return self.activation(self.blocks(x))
    

class DiscriminatorBlock(nn.Module):
    '''
    Discriminator Block uses Conv2D -> Batch Norm (except FIRST block) -> LeakyRelu
    Note: kernel_size = 4, stride = 2, padding = 1 is used in the paper. When BatchNorm is used, Bias is not used for Conv2D
    '''
    def __init__(self, in_channels, out_channels, kernel_size = 4, stride = 2, padding = 1, use_batchnorm:bool = True):
        super().__init__()
        self.use_batchnorm = use_batchnorm
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias = not self.use_batchnorm)
        self.batch_norm = nn.BatchNorm2d(out_channels) if self.use_batchnorm else None
        self.activation = nn.LeakyReLU(0.2)
    
    def forward(self, x):
        x = self.conv(x)
        return self.activation(self.batch_norm(x)) if self.use_batchnorm else self.activation(x)
    

class Discriminator(nn.Module):
    '''
    CNNs to classify whether the image generated by the Generator are as good as the real ones
    Feature Changes as :: 1 -> 64 -> 128 -> 256 -> 512 -> 1
    '''
    def __init__(self, input_features = 1, output_features = 1,  middle_features = [64,128,256]):
        '''
        In the paper, they take in a feature of [Batch, 1, 64, 64] from the Generator and then output a single number per sample in the batch
        '''
        super().__init__()
        self.layers = nn.ModuleList() # Just a fancy method of stacking layers using loop

        # in the paper, the first layer does not use BatchNorm
        self.layers.append(DiscriminatorBlock(input_features, middle_features[0], use_batchnorm = False)) #  1 -> 64 Because the input has 1 channel

        for i, channel in enumerate(middle_features): # total 4 blocks are used in paper. 1 has already been used in the line above. 3 blocks are these
            self.layers.append(DiscriminatorBlock(channel, channel*2)) # 64 -> 128 --- 128 -> 256 --- 256 -> 512

        self.final_conv = nn.Conv2d(in_channels = middle_features[-1]*2,  out_channels = output_features, kernel_size = 4, stride = 2,  padding = 0) # Input from previous layer 512 -> 1
        self.sigmoid_layer = nn.Sigmoid() # gives whether an image is real or fake or more precisely, how CLOSE is it to the real image

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        
        return self.sigmoid_layer(self.final_conv(x))


def test_DCGAN_code():
    noise = torch.rand(10,100,1,1)
    image = Generator()(noise)
    result = Discriminator()(image)
    print('Model Built Successfully!!! Generating 10 random samples and their end results')
    print(f"'Z' random Noise shape: {noise.shape} || Generator output shape: {image.shape} || Discriminator shape: {result.shape}")

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

再见回来 2025-01-19 16:37:43

您可以简单地在 __init__ 方法末尾迭代所有子模块：

class Generator(nn.Module):
  def __init__(self, ....):
    # all code here
    # ...
    # init weights, at the very bottom of __init__
   for sm in self.modules():
     if isinstance(sm, nn.Conv2d):
       # only conv2d will be initialized in this way
       torch.nn.init.normal_(sm.weight.data, 0.0, 0.02)

完成。

You can simply iterate over all submodules, at the end of your __init__ method:

class Generator(nn.Module):
  def __init__(self, ....):
    # all code here
    # ...
    # init weights, at the very bottom of __init__
   for sm in self.modules():
     if isinstance(sm, nn.Conv2d):
       # only conv2d will be initialized in this way
       torch.nn.init.normal_(sm.weight.data, 0.0, 0.02)

done.

回复收藏 0 原文

此岸叶落 2025-01-19 16:37:43

找到了一些答案。只是想知道这是正确的方法：

def initialise_weights(m):
    if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
        nn.init.normal_(m.weight.data, 0.0, 0.02)

def check_sanity(m):
    if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
        print(m.weight.data.mean(), m.weight.data.std())


gen = Generator()
gen = gen.apply(initialise_weights)
gen = gen.apply(check_sanity)

Found some answer to this. Just want to know it this is the right approach:

def initialise_weights(m):
    if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
        nn.init.normal_(m.weight.data, 0.0, 0.02)

def check_sanity(m):
    if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
        print(m.weight.data.mean(), m.weight.data.std())


gen = Generator()
gen = gen.apply(initialise_weights)
gen = gen.apply(check_sanity)

回复收藏 0 原文

尹雨沫 2025-01-19 16:37:43

接受的答案是最佳答案（另一种选择是使用 class _ConvNd 并修改源代码，换句话说，替换 init.kaiming_uniform_(self.weight, a=math.sqrt(5 ）））。总而言之，最佳实践是定义另一个名为 Reset_parameters() 的方法，将其放在 __init__(self, *args) 的末尾并更改那里的参数：

class Generator(nn.Module):

    def __init__(self, *args) -> None:
        ...
        self.reset_parameters()

    def reset_parameters(self) -> None:
        # comments
        for sm in self.modules():
            if isinstance(sm, nn.Conv2d):
                torch.nn.init.normal_(
                    sm.weight.data, 
                    mean=0.0, 
                    std=0.02
                    )

The accepted answer is the best answer (an alternative would be going to class _ConvNd and modify the source in other words replace the init.kaiming_uniform_(self.weight, a=math.sqrt(5))). All said and done though the best practice is to define another method called reset_parameters() put it at the end of your __init__(self, *args) and change the parameters there:

class Generator(nn.Module):

    def __init__(self, *args) -> None:
        ...
        self.reset_parameters()

    def reset_parameters(self) -> None:
        # comments
        for sm in self.modules():
            if isinstance(sm, nn.Conv2d):
                torch.nn.init.normal_(
                    sm.weight.data, 
                    mean=0.0, 
                    std=0.02
                    )

回复收藏 0 原文

~没有更多了~