Preface

Below is ChatGPT's introduction to nn.Sequential:

nn.Sequential is a module in the PyTorch deep learning library that lets you build a neural network by stacking layers in order.
In a sequential network, the output of one layer becomes the input of the next, so each layer can be seen as the connection between the previous layer's output and the following layer's input.
To create a network with nn.Sequential, you simply pass the desired layers to nn.Sequential in order.

Imports

import torch  
from torch import nn
from torch.nn import functional as F

Defining the network

The line X = torch.rand(2, 20) creates a tensor of size 2×20 containing 40 random numbers drawn uniformly from the interval [0, 1).

net = nn.Sequential(nn.Linear(20,256), nn.ReLU(), nn.Linear(256,10))
X = torch.rand(2,20)  # input
net(X)                # output

Output:

tensor([[ 0.0574,  0.3315,  0.1166,  0.0199,  0.3483, -0.1876, -0.1673,  0.0476,
-0.1516, -0.1820],
[-0.1105, 0.1474, 0.0903, -0.0106, 0.4121, -0.1440, -0.1650, 0.0741,
-0.1555, -0.1497]], grad_fn=<AddmmBackward>)

Custom Module

The __call__ method defined in nn.Module invokes forward, which is why calling net(X) runs the forward pass.
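
A minimal sketch of what this dispatch roughly looks like (not PyTorch's actual implementation, which also handles hooks and other bookkeeping):

# Simplified illustration only: __call__ forwards its arguments to forward()
class SimplifiedModule:
    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)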

class MyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20,256)
        self.out = nn.Linear(256,10)

    def forward(self, X):
        return self.out(F.relu(self.hidden(X)))  # chain the layers: hidden -> ReLU -> out

net = MyMLP()
X = torch.rand(2,20)
net(X)

Output:

tensor([[-0.1130, -0.2605, -0.0028, -0.1315,  0.2831, -0.0219, -0.1388, -0.3198,
0.0520, 0.1621],
[-0.1664, -0.2344, -0.0618, -0.1139, 0.2405, 0.1695, -0.1188, -0.3584,
0.0692, 0.0633]], grad_fn=<AddmmBackward>)

Custom Sequential

class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for block in args:
            print(block)
            # _modules is an OrderedDict maintained by nn.Module; registering each
            # block here lets PyTorch track it (the block itself is used as the key)
            self._modules[block] = block

    def forward(self, X):
        for block in self._modules.values():
            X = block(X)
        return X

net = MySequential(nn.Linear(20,256), nn.ReLU(), nn.Linear(256,10))
net(X)

Output:

Linear(in_features=20, out_features=256, bias=True)
ReLU()
Linear(in_features=256, out_features=10, bias=True)

tensor([[ 0.1493, -0.0089, 0.2427, -0.0798, 0.0935, 0.0015, 0.1714, 0.1016,
-0.2196, -0.0520],
[ 0.0919, -0.0402, 0.1483, -0.0247, -0.0209, 0.0968, 0.0856, 0.1403,
-0.2901, -0.1206]], grad_fn=<AddmmBackward>)

Custom nested network

class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20,64), nn.ReLU(),
                                 nn.Linear(64,32), nn.ReLU())
        self.linear = nn.Linear(32,16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16,20), MyMLP())
X = torch.rand(2,20)
chimera(X)

Output:

tensor([[ 0.0210,  0.1105,  0.0292,  0.0055, -0.1473, -0.1418,  0.0247, -0.0770,
0.0096, -0.0914],
[ 0.0166, 0.1126, 0.0327, -0.0024, -0.1452, -0.1435, 0.0254, -0.0761,
0.0077, -0.0984]], grad_fn=<AddmmBackward>)

Parameter management

A multilayer perceptron with a single hidden layer

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))  
X = torch.rand(size=(2,4))
net(X)

Output:

tensor([[0.0310],
[0.0198]], grad_fn=<AddmmBackward>)

Inspecting the state of a single layer

print(net[2].state_dict())
print(type(net[2].bias))
print(net[2].bias)                 # .bias is the bias parameter
print(net[2].bias.data)
print(net[2].weight.grad == None)  # .grad accesses the gradient (still None before backward)

Output:

OrderedDict([('weight', tensor([[-0.0261, -0.2381,  0.2371,  0.3306,  0.0700,  0.2761, -0.1394,  0.1811]])), ('bias', tensor([0.0377]))])

<class 'torch.nn.parameter.Parameter'>

Parameter containing:
tensor([0.0377], requires_grad=True)

tensor([0.0377])

True

Parameter indicates that this is a learnable (optimizable) parameter.
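
As a small illustrative sketch (the TinyScale class below is hypothetical, not from the original notes): wrapping a tensor in nn.Parameter and assigning it to a module attribute is what registers it, so it shows up in parameters() and can be updated by an optimizer.

class TinyScale(nn.Module):
    def __init__(self):
        super().__init__()
        # registered as a learnable parameter; requires_grad is True by default
        self.scale = nn.Parameter(torch.ones(1))
    def forward(self, X):
        return self.scale * X

print(list(TinyScale().named_parameters()))  # shows ('scale', Parameter containing: tensor([1.], requires_grad=True))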

Accessing all parameters at once

# * unpacks the list so each (name, shape) pair is printed as a separate argument
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])
print(net.state_dict()['2.bias'].data)

Output:

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))

tensor([0.0377])

What is returned are the parameters (learnable parameters). Layer 1 is a ReLU, which has no parameters, so only the two fully connected Linear layers at indices 0 and 2 appear in the output.
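
A quick way to verify this (a small check, not from the original notebook):

print(len(list(nn.ReLU().parameters())))        # 0  -> ReLU has no learnable parameters
print(len(list(nn.Linear(4, 8).parameters())))  # 2  -> weight and bias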

Collecting parameters from nested blocks

def block1():
    return nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4,1))
rgnet(X)

Output:

tensor([[0.1841],
[0.1841]], grad_fn=<AddmmBackward>)

Printing the whole network

print(rgnet)

Output:

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

Initializing modules

The argument m passed to the init function is a module.
A trailing underscore (as in nn.init.normal_) marks an in-place function: it modifies the tensor directly rather than returning a new one.
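
A tiny illustration of the in-place convention (a sketch, independent of the networks below):

t = torch.empty(3)   # uninitialized values
nn.init.zeros_(t)    # modifies t in place rather than producing a new tensor
print(t)             # tensor([0., 0., 0.])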

Example 1

def init_normal(m):
    if type(m) == nn.Linear:  # only touch Linear layers
        nn.init.normal_(m.weight, mean=0, std=0.01)  # weight ~ normal with mean 0, std 0.01
        nn.init.zeros_(m.bias)                       # set the bias to 0

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

net.apply(init_normal)  # walk over every submodule and apply init_normal to it
net[0].weight.data[0], net[0].bias.data[0]

Output:

(tensor([-0.0028, -0.0195,  0.0069, -0.0052]), tensor(0.))

Example 2

def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

Output:

(tensor([1., 1., 1., 1.]), tensor(0.))

Applying different initializations to different layers

def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)  # Xavier uniform initialization of the weight

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)    # fill every weight with 42

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

net[0].apply(xavier)   # apply only to the first Linear layer
net[2].apply(init_42)  # apply only to the last Linear layer

print(net[0].weight.data[0])
print(net[2].weight.data)

Output:

tensor([ 0.0871, -0.1821, -0.5227, -0.4939])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])

Custom initialization

m.weight.data *= m.weight.data.abs() >= 5 multiplies by a boolean mask: where |weight| >= 5 the condition is True (multiply by 1, keep the value); where it is False the entry is multiplied by 0 and zeroed out.
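
A standalone illustration of this masking trick (a small sketch, separate from the network code below):

t = torch.tensor([-8.0, 3.0, 6.0, -2.0])
mask = t.abs() >= 5   # tensor([ True, False,  True, False])
print(t * mask)       # keeps -8 and 6, zeroes out 3 and -2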

def my_init(m):
    if type(m) == nn.Linear:
        print(
            "Init",
            *[(name, param.shape) for name, param in m.named_parameters()][0]
        )
        nn.init.uniform_(m.weight, -10, 10)        # fill with uniform values in (-10, 10)
        m.weight.data *= m.weight.data.abs() >= 5  # zero out entries with |w| < 5

net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

net.apply(my_init)
net[0].weight[:2]

Output:

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])

tensor([[0.0000, 0.0000, 7.5092, -0.0000],
[0.0000, 5.3086, 7.6778, 0.0000]], grad_fn=<SliceBackward>)

Setting parameters directly

net[0].weight.data[:] += 1     # add 1 to every entry
net[0].weight.data[0, 0] = 42  # set the first element of the first row to 42
net[0].weight.data[0]          # the first row

Output:

tensor([42.0000,  1.0000,  8.5092,  1.0000])

Tied parameters

The example below shows that the layers at positions 2 and 4 both refer to shared, i.e. the same object in memory, so the two layers always change together and stay identical (see the extra check after the output).

shared = nn.Linear(8,8)
net = nn.Sequential(
    nn.Linear(4,8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8,1)
)
net(X)
print(net[2].weight.data[0] == net[4].weight.data[0])

net[2].weight.data[0,0] = 100
print(net[2].weight.data[0] == net[4].weight.data[0])

Output:

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
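
Another way to see the tying (a small check that builds on the net defined above): positions 2 and 4 hold the very same module object, so there is only one weight tensor, and during backpropagation the gradients from both positions accumulate on that single parameter.

print(net[2] is net[4])  # True: the same Linear module object appears twice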

Custom layers

class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

Output:

tensor([-2., -1.,  0.,  1.,  2.])

Building a Module with the custom layer

net = nn.Sequential(nn.Linear(8,128), CenteredLayer())  

Y = net(torch.rand(4,8))
Y.mean()

Output:

tensor(7.4506e-09, grad_fn=<MeanBackward0>)

A custom layer with parameters

torch.randn(in_units, units) draws from a normal distribution with mean 0 and variance 1, producing a tensor of shape in_units × units.
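
A quick sanity check of this (a small sketch, independent of the layer below):

t = torch.randn(1000, 4)
print(t.shape)                          # torch.Size([1000, 4])
print(t.mean().item(), t.std().item())  # roughly 0 and 1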

class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.zeros(units))

    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

dense = MyLinear(5,3)
dense.weight

Output:

Parameter containing:
tensor([[-1.4344, 0.5694, -0.6562],
[ 0.8137, 0.1474, -1.5661],
[-0.4021, 0.6279, 0.7612],
[-0.3380, -1.6948, 0.6764],
[-0.6898, 0.9873, 1.0886]], requires_grad=True)

Running the forward computation directly

dense(torch.rand(2,5))

Output:

tensor([[0.2325, 0.0000, 0.0000],
[0.0000, 0.5239, 0.9386]])

Building a Module directly from custom layers

net = nn.Sequential(MyLinear(64,8), MyLinear(8,1))  
net(torch.rand(2,64))

Output:

tensor([[0.],
[0.]])

File I/O

How do we save what has been trained to disk?

Saving simple objects

torch.save() writes an object to a file; torch.load() reads it back.

x = torch.arange(4)
torch.save(x, 'x-file')    # save the tensor to a file

x2 = torch.load("x-file")  # read it back from the file
x2

Output:

tensor([0, 1, 2, 3])

Save a list of tensors and read it back into memory

y = torch.zeros(4)  
torch.save([x,y], 'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

Output:

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

Write and read a dictionary that maps strings to tensors

mydict = {'x': x, 'y': y}  
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

Output:

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

Loading and saving model parameters

Saving

Only the weights are saved, not the computation (the model definition itself), so the architecture has to be reconstructed in code before the weights can be loaded back.

class MyMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20,256)
        self.output = nn.Linear(256,10)

    def forward(self, X):
        return self.output(F.relu(self.hidden(X)))  # chain hidden -> ReLU -> output

net = MyMLP()
X = torch.randn(size=(2,20))
Y = net(X)

print(net.state_dict())

torch.save(net.state_dict(), 'mlp.params')

Output:

OrderedDict([
('hidden.weight', tensor([
[ 0.0971, -0.1579, 0.0044, ..., -0.0998, 0.0737, -0.1686],
[ 0.0530, 0.0414, 0.1216, ..., 0.1181, -0.0419, 0.2201],
[-0.1028, -0.1758, -0.1662, ..., -0.0928, 0.1294, 0.0587],
...,
[-0.1926, -0.1774, 0.2134, ..., -0.2065, -0.1961, -0.0740],
[ 0.1167, 0.1235, 0.0281, ..., -0.2233, -0.1973, 0.2134],
[ 0.1455, 0.0599, 0.1002, ..., 0.0954, -0.1207, -0.1105]])),
('hidden.bias', tensor([
-0.0302, -0.0348, 0.0281, 0.0997, -0.0086, 0.1300, 0.0476, -0.2219,
0.1233, 0.0026, -0.0673, 0.0856, -0.1732, 0.1215, -0.1404, -0.1227,
0.2165, 0.1321, -0.0280, 0.0206, -0.0785, -0.2105, -0.0730, 0.1728,
0.0761, 0.1807, 0.1850, 0.1304, 0.1791, -0.1322, -0.1459, -0.2088,
0.1379, 0.0745, -0.1551, 0.1028, -0.0318, -0.0190, 0.0452, -0.1210,
-0.1974, 0.0848, -0.1211, -0.0029, -0.0758, -0.1973, -0.1604, 0.1820,
0.1789, 0.0604, 0.2105, 0.1426, -0.2211, 0.0429, -0.0488, -0.0870,
0.0529, -0.1741, -0.1400, 0.0548, -0.1349, 0.0981, 0.2169, -0.0210,
-0.1844, 0.0077, 0.0834, 0.2054, -0.2228, -0.0648, 0.0173, 0.0247,
-0.1176, 0.1128, 0.1654, -0.0504, 0.1098, 0.1063, -0.0364, -0.1849,
-0.1402, 0.1194, -0.0215, -0.0185, -0.2194, 0.0617, -0.1944, 0.1957,
-0.0532, 0.1535, 0.2023, -0.1782, 0.2102, -0.1350, 0.0877, -0.0950,
-0.1563, -0.0329, 0.0251, 0.0778, 0.1942, -0.1021, 0.0153, 0.1390,
0.1445, 0.0628, 0.0339, 0.2018, -0.0993, 0.0693, -0.2129, 0.0332,
-0.0043, 0.0372, -0.0691, 0.2017, -0.0454, -0.0628, -0.1467, 0.0851,
0.1321, -0.2065, -0.0110, 0.0214, -0.0256, 0.1904, 0.0079, -0.0249,
-0.1835, -0.1258, -0.0364, 0.0382, 0.0364, 0.0556, 0.0968, -0.0379,
-0.0573, -0.0327, 0.2173, -0.1410, 0.0369, -0.0393, 0.0957, -0.0846,
0.1420, -0.0529, 0.0196, -0.1414, 0.0247, 0.0764, -0.0029, 0.1371,
0.0078, 0.0094, 0.1526, -0.0658, -0.1047, -0.0852, 0.1926, 0.1918,
0.1632, -0.0534, 0.0203, 0.1192, 0.0354, -0.1002, 0.2012, -0.1022,
0.1445, 0.1265, 0.1041, 0.0924, -0.0209, 0.0056, -0.1787, 0.0652,
-0.1389, -0.0571, -0.1906, -0.2193, -0.0129, 0.0469, -0.0718, 0.2137,
-0.0676, 0.2137, -0.0784, -0.0154, 0.0074, -0.0139, 0.2043, 0.1941,
-0.0824, 0.0544, -0.2138, -0.0478, -0.1863, -0.2089, 0.1727, 0.0725,
0.0170, 0.2099, -0.2147, -0.2158, -0.1833, -0.1895, -0.1178, -0.0157,
-0.1715, -0.0400, 0.0310, -0.2036, -0.1314, 0.1275, 0.0224, -0.1556,
-0.0607, -0.0977, 0.0341, -0.0254, -0.1512, 0.1797, -0.0274, -0.0814,
-0.1371, 0.0022, -0.1917, -0.1770, 0.0264, 0.0580, 0.0983, 0.0687,
0.0243, 0.1223, 0.1731, -0.0675, 0.0962, 0.1003, -0.1203, 0.2133,
-0.2032, 0.0220, -0.0204, 0.0468, -0.1539, 0.0375, 0.0204, -0.0473,
-0.0527, 0.0048, -0.1396, -0.0565, 0.0003, 0.0638, -0.0141, -0.1569])),
('output.weight', tensor([
[-0.0125, 0.0369, -0.0440, ..., 0.0466, -0.0250, 0.0114],
[ 0.0407, -0.0135, 0.0389, ..., 0.0151, 0.0203, 0.0166],
[ 0.0319, -0.0301, 0.0393, ..., 0.0548, -0.0589, -0.0623],
...,
[-0.0496, -0.0027, -0.0179, ..., 0.0543, 0.0416, -0.0003],
[ 0.0338, -0.0056, -0.0475, ..., 0.0057, -0.0026, 0.0187],
[ 0.0377, 0.0057, -0.0228, ..., -0.0383, -0.0076, 0.0280]])),
('output.bias', tensor([
0.0501, 0.0566, 0.0253, -0.0438, -0.0322,
-0.0575, -0.0386, -0.0467,-0.0455, -0.0255]))])

Loading

clone = MyMLP()               # create a new MyMLP; its parameters are freshly initialized
clone.load_state_dict(torch.load("mlp.params"))  # overwrite them with the saved weights
print(clone.eval())

Y_clone = clone(X)
print(Y_clone == Y)

Output:

MyMLP(
(hidden): Linear(in_features=20, out_features=256, bias=True)
(output): Linear(in_features=256, out_features=10, bias=True)
)

tensor([[True, True, True, True, True, True, True, True, True, True],
[True, True, True, True, True, True, True, True, True, True]])

Related links

bilibili
Pytorch神经网络工具箱 (PyTorch neural network toolbox)