9 - nn.Module、nn.Sequential以及其他容器Module的源码讲解

原创已于 2022-03-13 20:38:41 修改 · 4.7k 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#深度学习 #pytorch

于 2022-03-13 11:01:08 首次发布

pytorch 同时被 2 个专栏收录

148 篇文章

订阅专栏

python

75 篇文章

订阅专栏

本文详细介绍了PyTorch中模型的训练模式（train和eval）、参数梯度控制（requires_grad_）、梯度清零（zero_grad）以及模型结构可视化（str和nn.Sequential）。通过实例展示了如何在训练过程中切换模型状态，控制参数梯度，以及如何构建和理解模型的内部结构。

1. train 训练模式

将模型设置成训练模式，本质上是将 self.training 设置为 True（并递归地作用于所有子模块）
此时 Dropout、BatchNorm 等模块按训练时的行为工作

    def train(self: T, mode: bool = True) -> T:
        r"""Put this module, and every module nested inside it, into training
        or evaluation mode.

        Only some modules behave differently between the two modes (for
        example :class:`Dropout` and :class:`BatchNorm`); consult the
        documentation of the individual module for details.

        Args:
            mode (bool): ``True`` selects training mode, ``False`` selects
                         evaluation mode. Default: ``True``.

        Returns:
            Module: self

        Raises:
            ValueError: if ``mode`` is not a ``bool``.
        """
        if isinstance(mode, bool):
            # Record the flag on this module first ...
            self.training = mode
            # ... then let each direct child do the same; every child recurses
            # into its own children through its ``train`` method.
            for child in self.children():
                child.train(mode)
            return self
        # Anything that is not a real bool is rejected.
        raise ValueError("training mode is expected to be boolean")

2. eval 推理模式

将模型设置成推理（评估）模式，本质上是将 self.training 设置为 False（等价于 self.train(False)）
此时 Dropout 不再随机丢弃神经元，BatchNorm 使用已统计的 running_mean / running_var，而不是当前 batch 的统计量

    def eval(self: T) -> T:
        r"""Switch the module to evaluation mode.

        This is shorthand for
        :meth:`self.train(False) <torch.nn.Module.train>`, so it only changes
        the behaviour of modules that distinguish training from evaluation,
        e.g. :class:`Dropout` and :class:`BatchNorm`.

        See :ref:`locally-disable-grad-doc` for a comparison between
        `.eval()` and several similar mechanisms that may be confused with it.

        Returns:
            Module: self
        """
        evaluation_mode = False
        return self.train(evaluation_mode)

3. requires_grad_

作用：对模型中的所有的参数设置是否需要计算梯度

    def requires_grad_(self: T, requires_grad: bool = True) -> T:
        r"""Turn gradient recording on or off for every parameter of this
        module.

        The :attr:`requires_grad` attribute of each parameter is updated
        in-place. Typical uses are freezing part of a model for finetuning
        and training parts of a model separately (e.g., GAN training).

        See :ref:`locally-disable-grad-doc` for a comparison between
        `.requires_grad_()` and several similar mechanisms that may be confused with it.

        Args:
            requires_grad (bool): whether autograd should record operations on
                                  parameters in this module. Default: ``True``.

        Returns:
            Module: self
        """
        # Walk over every parameter yielded by ``self.parameters()`` and apply
        # the same flag to each one.
        for param in self.parameters():
            param.requires_grad_(requires_grad)
        return self

案例代码

# 1.导入相关库
import torch
from torch import nn


# 2.定义模型
class MyTest(nn.Module):
    """Demo module holding two linear layers and one batch-norm layer.

    It defines no ``forward``; it only exists so that its registered
    sub-modules and parameters can be inspected.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in assignment order.
        self.linear1 = nn.Linear(in_features=2, out_features=3)
        self.linear2 = nn.Linear(in_features=3, out_features=4)
        self.batchnorm = nn.BatchNorm2d(num_features=4)


# 3. Instantiate the network
mymodel = MyTest()

# 4. Print every parameter as created (before requires_grad_ is called)
for param in mymodel.parameters():
	print(f"original_param={param}")
print("*"*100)

# 5. Disable gradient tracking for all parameters (requires_grad=False)
mymodel.requires_grad_(False)

# 6. Print the (name, parameter) pairs after disabling gradients
for param in mymodel.named_parameters():
	print(f"False:param={param}")

print("*"*100)
# 7. Re-enable gradient tracking for all parameters (requires_grad=True)
mymodel.requires_grad_(True)

# 8. Print the (name, parameter) pairs after re-enabling gradients
for param in mymodel.named_parameters():
	print(f"True:param={param}")

结果

original_param=Parameter containing:
tensor([[-0.5034, -0.6599],
        [ 0.4451,  0.1687],
        [-0.6548, -0.6644]], requires_grad=True)
original_param=Parameter containing:
tensor([-0.4746,  0.4611,  0.0340], requires_grad=True)
original_param=Parameter containing:
tensor([[ 0.2367, -0.4441,  0.4117],
        [-0.4328, -0.5242, -0.4260],
        [ 0.0096, -0.3758,  0.2389],
        [ 0.4779, -0.5718,  0.4700]], requires_grad=True)
original_param=Parameter containing:
tensor([ 0.4026, -0.1612, -0.2042, -0.5499], requires_grad=True)
original_param=Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
original_param=Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
****************************************************************************************************
False:param=('linear1.weight', Parameter containing:
tensor([[-0.5034, -0.6599],
        [ 0.4451,  0.1687],
        [-0.6548, -0.6644]]))
False:param=('linear1.bias', Parameter containing:
tensor([-0.4746,  0.4611,  0.0340]))
False:param=('linear2.weight', Parameter containing:
tensor([[ 0.2367, -0.4441,  0.4117],
        [-0.4328, -0.5242, -0.4260],
        [ 0.0096, -0.3758,  0.2389],
        [ 0.4779, -0.5718,  0.4700]]))
False:param=('linear2.bias', Parameter containing:
tensor([ 0.4026, -0.1612, -0.2042, -0.5499]))
False:param=('batchnorm.weight', Parameter containing:
tensor([1., 1., 1., 1.]))
False:param=('batchnorm.bias', Parameter containing:
tensor([0., 0., 0., 0.]))
****************************************************************************************************
True:param=('linear1.weight', Parameter containing:
tensor([[-0.5034, -0.6599],
        [ 0.4451,  0.1687],
        [-0.6548, -0.6644]], requires_grad=True))
True:param=('linear1.bias', Parameter containing:
tensor([-0.4746,  0.4611,  0.0340], requires_grad=True))
True:param=('linear2.weight', Parameter containing:
tensor([[ 0.2367, -0.4441,  0.4117],
        [-0.4328, -0.5242, -0.4260],
        [ 0.0096, -0.3758,  0.2389],
        [ 0.4779, -0.5718,  0.4700]], requires_grad=True))
True:param=('linear2.bias', Parameter containing:
tensor([ 0.4026, -0.1612, -0.2042, -0.5499], requires_grad=True))
True:param=('batchnorm.weight', Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True))
True:param=('batchnorm.bias', Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True))

4. zero_grad

作用：将模型所有参数的梯度清零（set_to_none=True 时直接置为 None），通常在每次反向传播之前调用，功能与优化器的 zero_grad 类似

    def zero_grad(self, set_to_none: bool = False) -> None:
        r"""Reset the gradient of every parameter of the model. See similar
        function under :class:`torch.optim.Optimizer` for more context.

        Parameters whose ``grad`` is already ``None`` are left untouched.

        Args:
            set_to_none (bool): drop the gradients (assign ``None``) instead
                of filling them with zeros.
                See :meth:`torch.optim.Optimizer.zero_grad` for details.
        """
        if getattr(self, '_is_replica', False):
            warnings.warn(
                "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. "
                "The parameters are copied (in a differentiable manner) from the original module. "
                "This means they are not leaf nodes in autograd and so don't accumulate gradients. "
                "If you need gradients in your forward method, consider using autograd.grad instead.")

        for param in self.parameters():
            grad = param.grad
            if grad is None:
                # Nothing has been accumulated for this parameter.
                continue
            if set_to_none:
                param.grad = None
                continue
            # Cut the gradient tensor loose from any autograd graph before
            # zeroing it in place.
            if grad.grad_fn is None:
                grad.requires_grad_(False)
            else:
                grad.detach_()
            grad.zero_()

5. str(mymodel)

通过nn.Module中的魔法方法__repr__我们可以简单的通过str(mymodel)来查看模型

    def __repr__(self):
        """Build the printable, possibly multi-line description of the module.

        The text starts with the module name, followed in parentheses by the
        lines of ``extra_repr()`` and one indented ``(key): repr`` entry per
        registered sub-module.
        """
        # extra_repr() output is treated like sub-modules: one item per line.
        # An empty string must yield no lines at all (''.split gives ['']).
        extra_repr = self.extra_repr()
        extra_lines = extra_repr.split('\n') if extra_repr else []

        child_lines = [
            '(' + key + '): ' + _addindent(repr(module), 2)
            for key, module in self._modules.items()
        ]
        lines = extra_lines + child_lines

        main_str = self._get_name() + '('
        if lines:
            if len(extra_lines) == 1 and not child_lines:
                # Compact one-liner form, used by most builtin modules.
                main_str += extra_lines[0]
            else:
                main_str += '\n  ' + '\n  '.join(lines) + '\n'

        return main_str + ')'

案例

# 1.导入相关库
import torch
from torch import nn


# 2.定义模型
class MyTest(nn.Module):
    """Demo module holding two linear layers and one batch-norm layer.

    It defines no ``forward``; it only exists so that printing the model
    shows its registered sub-modules.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in assignment order.
        self.linear1 = nn.Linear(in_features=2, out_features=3)
        self.linear2 = nn.Linear(in_features=3, out_features=4)
        self.batchnorm = nn.BatchNorm2d(num_features=4)


# 3. Instantiate the network
mymodel = MyTest()

# 4. Print the model structure via str()
print(str(mymodel))

结果

MyTest(
  (linear1): Linear(in_features=2, out_features=3, bias=True)
  (linear2): Linear(in_features=3, out_features=4, bias=True)
  (batchnorm): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

6. nn.Sequential

作用：提供一个有序的模型序列，方便将各个模块依次进行组合。是一个有序的容器

class Sequential(Module):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the
    constructor. Alternatively, an ``OrderedDict`` of modules can be
    passed in. The ``forward()`` method of ``Sequential`` accepts any
    input and forwards it to the first module it contains. It then
    "chains" outputs to inputs sequentially for each subsequent module,
    finally returning the output of the last module.

    The value a ``Sequential`` provides over manually calling a sequence
    of modules is that it allows treating the whole container as a
    single module, such that performing a transformation on the
    ``Sequential`` applies to each of the modules it stores (which are
    each a registered submodule of the ``Sequential``).

    What's the difference between a ``Sequential`` and a
    :class:`torch.nn.ModuleList`? A ``ModuleList`` is exactly what it
    sounds like--a list for storing ``Module`` s! On the other hand,
    the layers in a ``Sequential`` are connected in a cascading way.

    Example::

        # Using Sequential to create a small model. When `model` is run,
        # input will first be passed to `Conv2d(1,20,5)`. The output of
        # `Conv2d(1,20,5)` will be used as the input to the first
        # `ReLU`; the output of the first `ReLU` will become the input
        # for `Conv2d(20,64,5)`. Finally, the output of
        # `Conv2d(20,64,5)` will be used as input to the second `ReLU`
        model = nn.Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Using Sequential with OrderedDict. This is functionally the
        # same as the above code
        model = nn.Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
    """

    # Typing-only overloads: either any number of positional modules ...
    @overload
    def __init__(self, *args: Module) -> None:
        ...

    # ... or a single OrderedDict mapping names to modules.
    @overload
    def __init__(self, arg: 'OrderedDict[str, Module]') -> None:
        ...

    def __init__(self, *args):
        super(Sequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            # A single OrderedDict: register each module under its own key.
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            # Positional modules: the key is the position as a string
            # ("0", "1", ...).
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx) -> T:
        """Get the idx-th item of the iterator"""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError('index {} is out of range'.format(idx))
        # Map a negative index onto its non-negative equivalent.
        idx %= size
        # Skip the first ``idx`` items and return the next one.
        return next(islice(iterator, idx, None))

    # A slice yields a new container of the same class built from the
    # selected (key, module) pairs; any other index yields a single module.
    @_copy_to_script_wrapper
    def __getitem__(self, idx) -> Union['Sequential', T]:
        if isinstance(idx, slice):
            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
        else:
            return self._get_item_by_idx(self._modules.values(), idx)

    # Replace the module at position ``idx``: look up the key stored at that
    # position and assign the new module under the same key.
    def __setitem__(self, idx: int, module: Module) -> None:
        key: str = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    # Delete one module (int index) or several (slice). The keys are copied
    # into a list before deleting in the slice case.
    def __delitem__(self, idx: Union[slice, int]) -> None:
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)

    # Number of registered sub-modules.
    @_copy_to_script_wrapper
    def __len__(self) -> int:
        return len(self._modules)

    # Same as the parent's ``__dir__`` but without names that consist only of
    # digits (the keys generated for positional modules).
    @_copy_to_script_wrapper
    def __dir__(self):
        keys = super(Sequential, self).__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    # Iterate over the stored modules in registration order.
    @_copy_to_script_wrapper
    def __iter__(self) -> Iterator[Module]:
        return iter(self._modules.values())

    # NB: We can't really type check this function as the type of input
    # may change dynamically (as is tested in
    # TestScript.test_sequential_intermediary_types).  Cannot annotate
    # with Any as TorchScript expects a more precise type

    # Forward pass of Sequential: a plain loop that feeds the input through
    # the stored modules one after another, each output becoming the next
    # module's input.
    def forward(self, input):
        for module in self:
            input = module(input)
        return input

7. nn.ModuleList

（1）为什么有ModuleList，为啥不用python自带的List
nn.ModuleList 是一个能储存每个module的容器，并且能够将ModuleList里面的每个模块的parameters自动注入到网络模型中；而python自带的List不会

# 定义一个神经网络，模块用ModuleList进行组合
class MyModel(nn.Module):
    """Demo network whose layers are kept in an ``nn.ModuleList``."""

    def __init__(self):
        super().__init__()
        # The layers are wrapped in nn.ModuleList (not a plain list) before
        # being stored on the module.
        layers = [nn.Linear(3, 5), nn.ReLU()]
        self.module_list = nn.ModuleList(layers)

    def forward(self, x):
        """Pass ``x`` through the stored layers in order and return the result."""
        for layer in self.module_list:
            x = layer(x)
        return x

# Banner, then build the ModuleList-based model and print its structure.
print("*"*100)
print("ModelList"*10)
my_ModelList = MyModel()
print(f"my_ModelList={my_ModelList}")
# Print every parameter the model reports
print("ModelList for param")
for param in my_ModelList.parameters():
	print(f"my_modellist_param={param}")

# Banner for the plain-Python-list variant that follows.
print("*"*100)
print("PythonList"*10)
# 定义一个神经网络，模块用Python自带的list进行组合
class PythonList(nn.Module):
    """Counter-example network whose layers live in a plain Python ``list``.

    ``self.list`` is deliberately a built-in ``list`` rather than an
    ``nn.ModuleList``: the layers stored this way are not picked up by
    ``parameters()`` and do not show up when the model is printed, which is
    exactly what this demo is meant to illustrate.
    """

    def __init__(self):
        super(PythonList, self).__init__()
        # Plain list on purpose -- see the class docstring.
        self.list = [nn.Linear(3, 5), nn.ReLU()]

    def forward(self, x):
        """Pass ``x`` through the stored layers in order and return the result."""
        for m in self.list:
            x = m(x)
        # The original version forgot this return, so calling the model
        # always produced None.
        return x


# Build the plain-list model, print its structure, then print every parameter
# it reports.
my_python_list = PythonList()
print(f"my_python_list={my_python_list}")
print("pythonlist for param")
for param in my_python_list.parameters():
	print(f"param_python={param}")

结果：

****************************************************************************************************
ModelListModelListModelListModelListModelListModelListModelListModelListModelListModelList
my_ModelList=MyModel(
  (module_list): ModuleList(
    (0): Linear(in_features=3, out_features=5, bias=True)
    (1): ReLU()
  )
)
ModelList for param
my_modellist_param=Parameter containing:
tensor([[ 0.5168,  0.5762,  0.1003],
        [-0.4560, -0.0956,  0.4347],
        [-0.1871, -0.3226,  0.0094],
        [ 0.3742,  0.1279, -0.3885],
        [ 0.5290, -0.3003, -0.1993]], requires_grad=True)
my_modellist_param=Parameter containing:
tensor([ 0.4282,  0.1626,  0.0872, -0.5053,  0.4860], requires_grad=True)
****************************************************************************************************
PythonListPythonListPythonListPythonListPythonListPythonListPythonListPythonListPythonListPythonList
my_python_list=PythonList()
pythonlist for param

8. nn.ModuleDict

nn.ModuleDict 是 nn.Module 的字典式容器，用于包装一组网络层，并通过键（key）来取出和调用对应的网络层
我们可以定义包装一组网络层，在传播的时候可以自主选择新的网络，根据键值进行选择

主要方法：
（1）clear()：清空ModuleDict
（2）items():返回可迭代的键值对(key-value pairs)
（3）keys():返回字典的键(key)
（4）values():返回字典的值(value)
（5）pop(key)：返回 key 对应的模块（value），并将该键值对从 ModuleDict 中删除

class ModuleDict(nn.Module):
    """Demo network that picks its layer and activation by name at call time.

    Two ``nn.ModuleDict`` containers are held: ``choices`` (keys ``'conv'``
    and ``'pool'``) and ``activation`` (keys ``'relu'`` and ``'prelu'``).
    """

    def __init__(self):
        super().__init__()
        layer_options = {
            'conv': nn.Conv2d(10, 10, 3),
            'pool': nn.MaxPool2d(3)
        }
        self.choices = nn.ModuleDict(layer_options)

        activation_options = {
            'relu': nn.ReLU(),
            'prelu': nn.PReLU()
        }
        self.activation = nn.ModuleDict(activation_options)

    def forward(self, x, choice, act):
        """Apply the layer named ``choice`` and then the activation named ``act``.

        For example, with choice='conv' and act='relu' the input goes through
        nn.Conv2d(10, 10, 3) followed by nn.ReLU().
        """
        selected_layer = self.choices[choice]
        selected_activation = self.activation[act]
        return selected_activation(selected_layer(x))


# Instantiate the network (runs __init__, which builds the sub-networks)
net = ModuleDict()
# Input data: a random tensor of shape (4, 10, 32, 32)
fake_img = torch.randn((4, 10, 32, 32))
# Call the model, selecting the 'conv' layer and the 'relu' activation
output = net(fake_img, 'conv', 'relu')

9. nn.ParameterList

创建一个类似 list 的参数容器 ParameterList：可以像普通 Python 列表一样索引，其中的参数会被正确注册到模型中

class ParameterList(Module):
    r"""A list-like container for parameters.

    A :class:`~torch.nn.ParameterList` supports indexing like an ordinary
    Python list, while the parameters stored in it are properly registered
    and therefore visible to all :class:`~torch.nn.Module` methods.

    Args:
        parameters (iterable, optional): an iterable of :class:`~torch.nn.Parameter` to add

    Example::

        class MyModule(nn.Module):
            def __init__(self):
                super(MyModule, self).__init__()
                self.params = nn.ParameterList([nn.Parameter(torch.randn(10, 10)) for i in range(10)])

            def forward(self, x):
                # ParameterList can act as an iterable, or be indexed using ints
                for i, p in enumerate(self.params):
                    x = self.params[i // 2].mm(x) + p.mm(x)
                return x
    """

    def __init__(self, parameters: Optional[Iterable['Parameter']] = None) -> None:
        super().__init__()
        # From here on __setattr__ warns about unsupported attribute writes.
        self._initialized = True
        if parameters is None:
            return
        self += parameters

    def __setstate__(self, state):
        # Restore with the flag switched off, then switch it back on once the
        # parent class has finished restoring the state.
        state['_initialized'] = False
        super().__setstate__(state)
        self._initialized = True

    def _get_abs_string_index(self, idx):
        """Get the absolute index for the list of modules"""
        position = operator.index(idx)
        count = len(self)
        if position < -count or position >= count:
            raise IndexError('index {} is out of range'.format(position))
        # Negative positions count from the end of the list.
        if position < 0:
            position += count
        return str(position)

    @overload
    def __getitem__(self, idx: int) -> 'Parameter':
        ...

    @overload
    def __getitem__(self: T, idx: slice) -> T:
        ...

    def __getitem__(self, idx):
        """Return one parameter (int index) or a new container (slice)."""
        if not isinstance(idx, slice):
            key = self._get_abs_string_index(idx)
            return self._parameters[str(key)]
        selected = list(self._parameters.values())[idx]
        return self.__class__(selected)

    def __setitem__(self, idx: int, param: 'Parameter') -> None:
        """Register ``param`` under the key of the existing position ``idx``."""
        key = self._get_abs_string_index(idx)
        return self.register_parameter(str(key), param)

    def __setattr__(self, key: Any, value: Any) -> None:
        """Set an attribute, warning when a new non-Parameter attribute is
        created after initialisation."""
        initialized = getattr(self, "_initialized", False)
        if initialized and not hasattr(self, key) and not isinstance(value, torch.nn.Parameter):
            warnings.warn("Setting attributes on ParameterList is not supported.")
        super().__setattr__(key, value)

    def __len__(self) -> int:
        """Number of stored parameters."""
        return len(self._parameters)

    def __iter__(self) -> Iterator['Parameter']:
        """Iterate over the stored parameters in order."""
        return iter(self._parameters.values())

    def __iadd__(self, parameters: Iterable['Parameter']) -> 'ParameterList':
        """``self += parameters`` is the same as ``self.extend(parameters)``."""
        return self.extend(parameters)

    def __dir__(self):
        """Parent ``__dir__`` without the digit-only names used as keys."""
        names = super().__dir__()
        return [name for name in names if not name.isdigit()]

    def append(self, parameter: 'Parameter') -> 'ParameterList':
        """Appends a given parameter at the end of the list.

        Args:
            parameter (nn.Parameter): parameter to append
        """
        next_key = str(len(self))
        self.register_parameter(next_key, parameter)
        return self

    def extend(self, parameters: Iterable['Parameter']) -> 'ParameterList':
        """Appends parameters from a Python iterable to the end of the list.

        Args:
            parameters (iterable): iterable of parameters to append
        """
        if not isinstance(parameters, container_abcs.Iterable):
            raise TypeError("ParameterList.extend should be called with an "
                            "iterable, but got " + type(parameters).__name__)
        # New keys continue numbering from the current length.
        for position, param in enumerate(parameters, len(self)):
            self.register_parameter(str(position), param)
        return self

    def extra_repr(self) -> str:
        """One summary line per parameter: key, tensor type, size and, for
        CUDA tensors, the device index."""
        rendered = []
        for name, param in self._parameters.items():
            dims = 'x'.join(map(str, param.size()))
            location = ' (GPU {})'.format(param.get_device()) if param.is_cuda else ''
            summary = 'Parameter containing: [{} of size {}{}]'.format(
                torch.typename(param), dims, location)
            rendered.append('  (' + str(name) + '): ' + summary)
        return '\n'.join(rendered)

    def __call__(self, input):
        """A ParameterList is a container only; calling it is an error."""
        raise RuntimeError('ParameterList should not be called.')

    def _replicate_for_data_parallel(self):
        """Warn that DataParallel is not supported, then defer to the parent."""
        warnings.warn("nn.ParameterList is being used with DataParallel but this is not "
                      "supported. This list will appear empty for the models replicated "
                      "on each GPU except the original one.")

        return super()._replicate_for_data_parallel()

案例

import torch
from torch import nn

class MyModule(nn.Module):
    """Demo module holding five 3x3 parameters in an ``nn.ParameterList``."""

    def __init__(self):
        super().__init__()
        matrices = [nn.Parameter(torch.randn(3, 3)) for _ in range(5)]
        self.params = nn.ParameterList(matrices)

    def forward(self, x):
        """Repeatedly update ``x`` using the stored parameters and return it."""
        # The ParameterList is used both as an iterable and with int indexing.
        for position, weight in enumerate(self.params):
            paired = self.params[position // 2]
            x = paired.mm(x) + weight.mm(x)
        return x

# Instantiate the module and print each (name, parameter) pair it reports.
mymodel = MyModule()
for param in mymodel.named_parameters():
	print(param)

结果

('params.0', Parameter containing:
tensor([[ 0.4520, -0.7078,  1.5575],
        [ 1.2892, -1.3094, -1.1212],
        [-1.2048,  1.3236,  1.6908]], requires_grad=True))
('params.1', Parameter containing:
tensor([[-0.7633, -1.3577,  0.4661],
        [-0.1935, -0.1821,  0.0935],
        [-0.3414, -0.2055,  2.2441]], requires_grad=True))
('params.2', Parameter containing:
tensor([[-1.3977, -1.2781,  0.3150],
        [-0.0079, -0.3423, -0.0806],
        [ 0.4114,  0.2381, -1.7208]], requires_grad=True))
('params.3', Parameter containing:
tensor([[-1.7887, -0.4023, -0.0706],
        [ 0.1060, -1.3700, -0.0148],
        [ 0.0578, -0.0219,  1.1389]], requires_grad=True))
('params.4', Parameter containing:
tensor([[ 0.3666, -0.3358, -0.1044],
        [ 0.3157, -1.0280, -0.1464],
        [ 0.7625, -1.9047,  0.2317]], requires_grad=True))

10. nn.ParameterDict

以字典的形式保存参数；ParameterDict 可以像普通的 Python 字典一样通过键来索引，
但它所包含的参数会被正确注册，并且对所有 Module 方法可见。

import torch 
from torch import nn


class MyModule(nn.Module):
    """Demo module holding two named 5x10 parameters in an ``nn.ParameterDict``."""

    def __init__(self):
        super().__init__()
        # Parameters stored under the keys 'left' and 'right'.
        named_weights = {
            'left': nn.Parameter(torch.randn(5, 10)),
            'right': nn.Parameter(torch.randn(5, 10))
        }
        self.params = nn.ParameterDict(named_weights)

    def forward(self, x, choice):
        """Matrix-multiply the parameter stored under ``choice`` with ``x``."""
        weight = self.params[choice]
        return weight.mm(x)