Preface

SGD Optimizer

The update rule is:

new parameters = current parameters - learning rate * gradient
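As a minimal sketch in NumPy (the names weight, grad, and lr are illustrative, not from any library):

import numpy as np

lr = 0.1                        # learning rate
weight = np.array([1.0, -2.0])  # current parameters
grad = np.array([0.5, 0.5])     # gradient of the loss w.r.t. weight

weight = weight - lr * grad     # new parameters = current parameters - lr * gradient
print(weight)                   # [ 0.95 -2.05]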

PyTorch Usage

Import the package:

import torch.optim as opt

The torch.optim module contains several optimizers. A minimal usage sketch:
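torch.optim.SGD covers both plain SGD and momentum SGD; the model and data below are placeholders for illustration only:

import torch
import torch.optim as opt

model = torch.nn.Linear(784, 10)                  # placeholder model
optimizer = opt.SGD(model.parameters(), lr=0.05,
                    momentum=0.9)                 # momentum=0 gives plain SGD

x = torch.randn(32, 784)                          # placeholder batch
loss = model(x).sum()                             # placeholder loss
optimizer.zero_grad()                             # clear old gradients
loss.backward()                                   # compute new gradients
optimizer.step()                                  # apply the update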

MSGD (SGD with momentum)

Formula

\[v_{t+1}=u\cdot v_t-lr\cdot\nabla weight\]
\[weight=weight+v_{t+1}\]
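For intuition, take u = 0.9, lr = 0.1, and a constant gradient of 1 (numbers chosen purely for illustration):

\[v_1=-0.1,\quad v_2=0.9\cdot(-0.1)-0.1=-0.19,\quad v_3=-0.271,\ \dots\]

The step grows toward \(-lr/(1-u)=-1\): momentum amplifies a consistently-pointing gradient by up to a factor of \(1/(1-u)\), which is what lets MSGD move faster through flat regions and damp oscillations.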

Corresponding code:

class Linear:
    def __init__(self, in_num, out_num):
        self.weight = np.random.normal(0, 1, size=(in_num, out_num))
        self.u = 0.3   # momentum coefficient
        self.vt = 0    # velocity, accumulated across steps

    def forward(self, x):
        self.x = x
        return self.x @ self.weight

    def backward(self, G):
        delta_weight = self.x.T @ G
        delta_x = G @ self.weight.T

        # ----------------------------SGD----------------------------
        # self.weight -= lr * delta_weight  # plain gradient-descent (SGD) update
        # ----------------------------MSGD----------------------------
        self.vt = self.u * self.vt - lr * delta_weight  # lr is a module-level global
        self.weight = self.weight + self.vt

        return delta_x

    def __call__(self, x):  # lets the layer be called like a function
        return self.forward(x)

Results

Left: MSGD; right: SGD.
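To reproduce the qualitative difference without the MNIST setup, here is a hedged toy comparison on the 1-D quadratic loss L(w) = w^2 (all names and constants illustrative):

import numpy as np

def run(momentum, steps=20, lr=0.05, w0=5.0):
    w, v = w0, 0.0
    for _ in range(steps):
        grad = 2 * w                  # dL/dw for L(w) = w**2
        v = momentum * v - lr * grad  # momentum=0 reduces to plain SGD
        w = w + v
    return w

print(run(momentum=0.0))  # plain SGD: steady geometric decay toward 0
print(run(momentum=0.9))  # MSGD: larger early steps, possibly overshooting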

Sigmoid Overflow

def sigmoid(x):
    # For very negative x, -x is large and np.exp(-x) overflows,
    # so clip x first: out-of-range values are set to the boundary.
    x = np.clip(x, -100, 10000000)
    return 1 / (1 + np.exp(-x))
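A quick check of why the clip matters, using the sigmoid defined above (values illustrative):

import numpy as np

x = np.array([-1000.0])
print(1 / (1 + np.exp(-x)))  # [0.] plus a RuntimeWarning: np.exp(1000) overflows to inf
print(sigmoid(x))            # [3.72007598e-44], no warning: x is clipped to -100 first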

Softmax Overflow

We saw an example of this in the BP neural network post.

The softmax formula is: \[S_i=\frac{e^{V_i}}{\sum_{j} e^{V_j}}\]

When some V_i is very large, e^{V_i} overflows. The fix: subtract the maximum entry from every value. This preserves the relative order, and in fact leaves the output itself unchanged, because the common factor e^{-max} cancels between numerator and denominator.
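A small check that the shift changes nothing (values illustrative):

import numpy as np

v = np.array([[1.0, 2.0, 3.0]])
naive = np.exp(v) / np.sum(np.exp(v), axis=1, keepdims=True)
shifted = np.exp(v - v.max(axis=1, keepdims=True))
stable = shifted / shifted.sum(axis=1, keepdims=True)
print(np.allclose(naive, stable))  # True

The stabilized softmax then reads: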

def softmax(x):
    # x has shape (batch_size, num_classes)
    max_x = np.max(x, axis=1, keepdims=True)    # (batch_size, 1): the max of each row
    ex = np.exp(x - max_x)                      # element-wise exponential of the shifted matrix
    sum_ex = np.sum(ex, axis=1, keepdims=True)  # row-wise sum; keepdims=True keeps the 2-D shape
    return ex / sum_ex

Note: in max_x = np.max(x, axis=1, keepdims=True), keepdims=True must be included.
Without it: the shape is (batch_size,), a 1-D array with batch_size elements, and x - max_x fails to broadcast.
With it: the shape is (batch_size, 1), a 2-D array with batch_size rows and 1 column.
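For example (shapes only, values illustrative):

import numpy as np

x = np.random.rand(4, 10)                      # batch_size=4, 10 classes
print(np.max(x, axis=1).shape)                 # (4,): x - max_x would raise a broadcasting error
print(np.max(x, axis=1, keepdims=True).shape)  # (4, 1): broadcasts against (4, 10) row-wise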

Loss Overflow

From Post 5 (Logistic Regression) we know the loss formula: \[Loss=-\left[Label\cdot\log(pre)+(1-Label)\cdot\log(1-pre)\right]\]
Plotting this loss shows that as x approaches 0, \(\log(x)\) diverges toward negative infinity, i.e. the loss can underflow, so x must not be allowed to get too close to 0.

Looking at the code, x ultimately comes from the softmax output, so the softmax code needs one more change:

def softmax(x):
    # x has shape (batch_size, num_classes)
    max_x = np.max(x, axis=1, keepdims=True)    # (batch_size, 1): the max of each row
    ex = np.exp(x - max_x)                      # element-wise exponential of the shifted matrix
    sum_ex = np.sum(ex, axis=1, keepdims=True)  # row-wise sum; keepdims=True keeps the 2-D shape
    result = ex / sum_ex
    result = np.clip(result, 1e-100, 1)         # keep outputs away from 0 so log() stays finite
    return result
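A quick check of what the clip buys us (values illustrative):

import numpy as np

print(np.log(0.0))     # -inf plus a RuntimeWarning: the underflow case
print(np.log(1e-100))  # -230.2585..., finite, so the loss stays well-defined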

Complete Code

import matplotlib.pyplot as plt
import numpy as np
import struct

def load_labels(file):  # load the MNIST label file
    with open(file, "rb") as f:
        data = f.read()
    return np.asanyarray(bytearray(data[8:]), dtype=np.int32)

def load_images(file):  # load the MNIST image file
    with open(file, "rb") as f:
        data = f.read()
    magic_number, num_items, rows, cols = struct.unpack(">iiii", data[:16])
    return np.asanyarray(bytearray(data[16:]), dtype=np.uint8).reshape(num_items, -1)

# Turn the labels into a one-hot matrix (60000 * 10)
def make_one_hot(labels, class_num=10):
    result = np.zeros((len(labels), class_num))
    for index, lab in enumerate(labels):  # enumerate() yields (index, value) pairs over a sequence
        result[index][lab] = 1
    return result

def sigmoid(x):
    x = np.clip(x, -100, 10000000)  # prevent np.exp(-x) from overflowing; out-of-range values take the boundary
    return 1 / (1 + np.exp(-x))

def softmax(x):
    # x has shape (batch_size, num_classes)
    max_x = np.max(x, axis=1, keepdims=True)    # (batch_size, 1): the max of each row
    ex = np.exp(x - max_x)                      # element-wise exponential of the shifted matrix
    sum_ex = np.sum(ex, axis=1, keepdims=True)  # row-wise sum; keepdims=True keeps the 2-D shape
    result = ex / sum_ex
    result = np.clip(result, 1e-100, 1)         # keep outputs away from 0 so log() stays finite
    return result

def get_datas():
    train_datas = load_images("data/train-images.idx3-ubyte") / 255  # (60000, 784)
    train_label = make_one_hot(load_labels("data/train-labels.idx1-ubyte"), 10)  # (60000, 10)

    test_datas = load_images("data/t10k-images.idx3-ubyte") / 255
    test_label = load_labels("data/t10k-labels.idx1-ubyte")

    return train_datas, train_label, test_datas, test_label

class Linear:
    def __init__(self, in_num, out_num):
        self.weight = np.random.normal(0, 1, size=(in_num, out_num))
        self.u = 0.9   # momentum coefficient
        self.vt = 0    # velocity, accumulated across steps

    def forward(self, x):
        self.x = x
        return self.x @ self.weight

    def backward(self, G):
        delta_weight = self.x.T @ G
        delta_x = G @ self.weight.T

        # ----------------------------SGD----------------------------
        # self.weight -= lr * delta_weight  # plain gradient-descent (SGD) update
        # ----------------------------MSGD----------------------------
        self.vt = self.u * self.vt - lr * delta_weight
        self.weight = self.weight + self.vt

        return delta_x

    def __call__(self, x):  # lets the layer be called like a function
        return self.forward(x)

class Sigmoid:
    def forward(self, x):
        self.r = sigmoid(x)
        return self.r

    def backward(self, G):
        return G * self.r * (1 - self.r)

    def __call__(self, x):
        return self.forward(x)

class Softmax:
    def forward(self, x):
        self.r = softmax(x)
        return self.r

    def backward(self, G):  # G here is the one-hot label
        return (self.r - G) / self.r.shape[0]  # the 0th dimension is batch_size

    def __call__(self, x):
        return self.forward(x)

class MyModel:
    def __init__(self, layers):
        self.layers = layers

    def forward(self, x, label=None):
        for layer in self.layers:
            x = layer(x)
        self.x = x
        if label is not None:
            self.label = label
            loss = -np.sum(label * np.log(x)) / x.shape[0]  # mean cross-entropy per sample
            return loss

    def backward(self):
        G = self.label
        for layer in self.layers[::-1]:
            G = layer.backward(G)

    def __call__(self, *args):  # *args: variable number of arguments, forwarded unchanged
        return self.forward(*args)


if __name__ == "__main__":
    train_datas, train_label, test_datas, test_label = get_datas()

    # hyperparameters
    epoch = 100
    batch_size = 600  # how many images are processed at once
    lr = 0.05
    hidden_num = 256  # hidden-layer size

    model = MyModel([
        Linear(784, hidden_num),
        Sigmoid(),
        Linear(hidden_num, 10),
        Softmax()
    ])

    batch_times = int(np.ceil(len(train_datas) / batch_size))  # np.ceil rounds up

    for e in range(epoch):
        for batch_index in range(batch_times):

            x = train_datas[batch_index * batch_size : (batch_index + 1) * batch_size]  # take batch_size rows at a time
            batch_label = train_label[batch_index * batch_size : (batch_index + 1) * batch_size]

            # forward
            loss = model.forward(x, batch_label)
            # if batch_index % 100 == 0:
            #     print(f"loss={loss:.3f}")

            # backward && update the weights
            model.backward()

        # compute accuracy on the test set
        x = test_datas
        model.forward(x)

        pre = np.argmax(model.x, axis=1)  # index of the max in each row; pre has 10000 entries
        acc = np.sum(pre == test_label) / 10000

        print(f"{'*'*20} epoch={e} {'*'*20} \nacc={acc:.3f}")

Related Links

bilibili
GitHub