前言

模拟 \(f = kx + b\),其中 \(f\)、\(x\) 是已知,\(k\)、\(b\) 是未知。就是利用历史数据训练出 \(k\)、\(b\),在未来给出 \(x\) 就能直接解出 \(f\)。

线性回归只能确定一条直线,不适用于波动很大的情况。

公式:

\[pre = kx+ b\] \[Loss=(pre-Label)^2=(kx+b-Label)^2\] \[{ \frac{\partial Loss} {\partial k} }=2 * (kx+b-Label) * x\] \[{ \frac{\partial Loss} {\partial b} }=2*(kx+b-Label)\]

更新公式:

\[k=k-\frac{\partial Loss} {\partial k}*lr\] \[b=b-\frac{\partial Loss} {\partial b}*lr\]

不断循环上述公式,得到稳定的 \(k\)、\(b\)。

复现

理解

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import numpy as np  

# Scale both series down before training (years are divided by 2022, prices
# by the largest price, 60000); otherwise the learning rate would have to be
# very small to keep the updates from overflowing.
years = np.arange(2000, 2022) / 2022  # years 2000 through 2021
prices = np.array([10000,11000,12000,13000,14000,12000,13000,16000,18000,20000,19000,22000,24000,23000,26000,35000,30000,40000,45000,52000,50000,60000]) / 60000

# Hyper-parameters and initial guesses for the line pre = k * x + b.
epoch = 100000
k = 1
b = 1
lr = 0.1

# Per-sample gradient descent on the squared error (k * x + b - label) ** 2.
for _ in range(epoch):
    for x, label in zip(years, prices):
        pre = k * x + b
        error = pre - label
        loss = error ** 2  # kept for reference; not used by the update

        # Partial derivatives of the loss with respect to k and b.
        delta_k = 2 * error * x
        delta_b = 2 * error

        k -= delta_k * lr
        b -= delta_b * lr

print(f"k={k},b={b}")

# Interactive prediction loop (runs until interrupted).
while True:
    # Scale the entered year exactly like the training data ...
    year = float(input("请输入年份:")) / 2022
    # ... and scale the prediction back to the original price range.
    print("预测房价:", (k * year + b) * 60000)

输出: k=11.583227003108236, b=-10.707528453584919
请输入年份:2022
预测房价: 52541.91297139904
请输入年份:2023
预测房价: 52885.62890620945

但是年份在归一化时因为除以 2022 导致数据都是 0.9……,差距很小,所以在有限的 epoch 中无法很好地预估。

将 years/2022 改为 years = (years-2000)/22,可以让归一化后的年份均匀分布在 [0, 1) 区间内:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import numpy as np  

# Scale both series before training; otherwise the learning rate would have
# to be very small to keep the updates from overflowing.
# Years 2000 through 2021 are shifted to start at 0 and divided by 22, so the
# inputs are spread evenly instead of all sitting close to 1.
years = (np.arange(2000, 2022) - 2000) / 22
prices = np.array([10000,11000,12000,13000,14000,12000,13000,16000,18000,20000,19000,22000,24000,23000,26000,35000,30000,40000,45000,52000,50000,60000]) / 60000

# Hyper-parameters and initial guesses for the line pre = k * x + b.
epoch = 10000

k = 1
b = 1
lr = 0.1

# Per-sample gradient descent on the squared error (k * x + b - label) ** 2.
for _ in range(epoch):
    for x, label in zip(years, prices):
        pre = k * x + b
        error = pre - label
        loss = error ** 2  # kept for reference; not used by the update

        # Partial derivatives of the loss with respect to k and b.
        delta_k = 2 * error * x
        delta_b = 2 * error

        k -= delta_k * lr
        b -= delta_b * lr

print(f"k={k},b={b}")

# Interactive prediction loop (runs until interrupted).
while True:
    # Apply the same shift-and-scale used for the training years ...
    year = (float(input("请输入年份:")) - 2000) / 22
    # ... and scale the prediction back to the original price range.
    print("预测房价:", (k * year + b) * 60000)

输出: k=0.9958495409320027, b=-0.029087073213425022
请输入年份:2022
预测房价: 58005.74806311466
请输入年份:2035
预测房价: 93313.14087797658

利用 dataset+dataloader 整合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np  

class DataSet:
    """Holds the training samples and the batching configuration.

    Iterating over a ``DataSet`` creates a fresh ``DataLoader`` each time, so
    every ``for`` loop over the dataset performs one full pass over the data.

    Args:
        years: Input feature values, one per sample.
        prices: Target values, aligned with ``years``.
        k: Initial slope (stored only; not used by this class).
        b: Initial intercept (stored only; not used by this class).
        lr: Learning rate (stored only; not used by this class).
        batch_size: Number of samples the loader returns per step.
        shuffle: Whether the loader shuffles the sample order on each pass.
    """

    def __init__(self, years, prices, k, b, lr, batch_size, shuffle=True):
        self.years = years
        self.prices = prices
        self.k = k
        self.b = b
        self.lr = lr
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        """Return a new loader that walks this dataset once."""
        return DataLoader(self)

    def __len__(self):
        """Return the number of samples held by this dataset."""
        # Use the instance's own data rather than a module-level ``years``
        # variable, so the class works outside the demo script as well.
        return len(self.years)


class DataLoader:
    """Single-pass iterator that yields mini-batches from a dataset.

    The dataset must provide ``__len__`` and the attributes ``years``,
    ``prices``, ``batch_size`` and ``shuffle``. ``years`` and ``prices`` are
    indexed with a list of integer positions, so they are assumed to be NumPy
    arrays (or something with the same indexing behaviour).
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position of the next unread sample in self.index
        self.index = list(range(len(self.dataset)))
        if self.dataset.shuffle:
            np.random.shuffle(self.index)

    def __iter__(self):
        """Return self so the loader satisfies the iterator protocol."""
        return self

    def __next__(self):
        """Return the next ``(years, prices)`` batch.

        The final batch may hold fewer than ``batch_size`` samples.

        Raises:
            StopIteration: When every sample has been returned.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        step = self.dataset.batch_size
        ind = self.index[self.cursor:self.cursor + step]
        x = self.dataset.years[ind]
        y = self.dataset.prices[ind]
        self.cursor += step

        return x, y

if __name__ == "__main__":

    # Hyper-parameters and initial guesses for the line predict = k * x + b.
    epoch = 10000
    k = 1
    b = 1
    lr = 0.1
    batch_size = 2
    shuffle = True

    # Years 2000 through 2021, shifted to start at 0 and divided by 22.
    years = (np.arange(2000, 2022) - 2000) / 22
    # Prices divided by the largest price.
    prices = np.array(
        [10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000, 22000, 24000, 23000, 26000, 35000,
         30000, 40000, 45000, 52000, 50000, 60000]) / 60000

    dataset = DataSet(years, prices, k, b, lr, batch_size, shuffle)

    # Mini-batch gradient descent: each iteration over the dataset is one pass.
    for _ in range(epoch):
        for year, price in dataset:

            predict = k * year + b
            residual = predict - price
            loss = residual ** 2  # kept for reference; not used by the update

            # Per-sample partial derivatives of the squared error.
            delta_k = 2 * residual * year
            delta_b = 2 * residual

            # The gradients of the batch are summed (not averaged).
            k -= np.sum(delta_k) * lr
            b -= np.sum(delta_b) * lr

    print(f"k={k},b={b}")

    # Interactive prediction loop (runs until interrupted).
    while True:
        # Apply the same shift-and-scale used for the training years.
        year = (float(input("请输入年份:")) - 2000) / 22
        # Scale the prediction back to the original price range.
        print("预测房价:", (k * year + b) * 60000)

输出: k=0.7869334616968109, b=0.0493809221566964
请输入年份:2023
预测房价: 52325.04519947447
请输入年份:2035
预测房价: 78079.23121864282

多特征

模拟 \(f = ax + by + c\),即在上面的例题中增加楼层变量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import numpy as np  

class DataSet:
    """Holds two input features, the targets and the batching configuration.

    Iterating over a ``DataSet`` creates a fresh ``DataLoader`` each time, so
    every ``for`` loop over the dataset performs one full pass over the data.

    Args:
        years: First input feature, one value per sample.
        floors: Second input feature, aligned with ``years``.
        prices: Target values, aligned with ``years``.
        lr: Learning rate (stored only; not used by this class).
        batch_size: Number of samples the loader returns per step.
        shuffle: Whether the loader shuffles the sample order on each pass.
    """

    def __init__(self, years, floors, prices, lr, batch_size, shuffle=True):
        # Input features and target.
        self.years, self.floors = years, floors
        self.prices = prices
        # Training / batching configuration.
        self.lr = lr
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        """Return the number of samples (the length of ``years``)."""
        return len(self.years)

    def __iter__(self):
        """Return a new loader that walks this dataset once."""
        return DataLoader(self)


class DataLoader:
    """Single-pass iterator that yields mini-batches from a dataset.

    The dataset must provide ``__len__`` and the attributes ``years``,
    ``floors``, ``prices``, ``batch_size`` and ``shuffle``. The three data
    attributes are indexed with a list of integer positions, so they are
    assumed to be NumPy arrays (or something with the same indexing
    behaviour).
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.cursor = 0  # position of the next unread sample in self.index

        self.index = list(range(len(self.dataset)))
        if self.dataset.shuffle:
            np.random.shuffle(self.index)

    def __iter__(self):
        """Return self so the loader satisfies the iterator protocol."""
        return self

    def __next__(self):
        """Return the next ``(years, floors, prices)`` batch.

        The final batch may hold fewer than ``batch_size`` samples.

        Raises:
            StopIteration: When every sample has been returned.
        """
        if self.cursor >= len(self.dataset):
            raise StopIteration

        step = self.dataset.batch_size
        ind = self.index[self.cursor:self.cursor + step]
        x = self.dataset.years[ind]
        y = self.dataset.floors[ind]
        z = self.dataset.prices[ind]
        self.cursor += step

        return x, y, z

if __name__ == "__main__":

    # Initial guesses for the plane predict = a * year + b * floor + c.
    a = 1
    b = -1
    c = 0

    # Hyper-parameters.
    epoch = 10000
    batch_size = 2
    lr = 0.1
    shuffle = True

    # Years 2000 through 2021, shifted to start at 0 and divided by 22.
    years = (np.arange(2000, 2022) - 2000) / 22

    # Floors 23 down to 2, divided by 23.
    floors = np.arange(23, 1, -1) / 23

    # Prices divided by the largest price.
    prices = np.array(
        [10000, 11000, 12000, 13000, 14000, 12000, 13000, 16000, 18000, 20000, 19000, 22000, 24000, 23000, 26000, 35000,
         30000, 40000, 45000, 52000, 50000, 60000]) / 60000

    dataset = DataSet(years, floors, prices, lr, batch_size, shuffle)

    # Mini-batch gradient descent: each iteration over the dataset is one pass.
    for _ in range(epoch):
        for year, floor, price in dataset:

            predict = a * year + b * floor + c
            residual = predict - price
            loss = np.sum(residual ** 2)  # kept for reference; not used below

            # Per-sample partial derivatives of the squared error.
            delta_a = 2 * residual * year
            delta_b = 2 * residual * floor
            delta_c = 2 * residual

            # The gradients of the batch are summed (not averaged).
            a -= np.sum(delta_a) * lr
            b -= np.sum(delta_b) * lr
            c -= np.sum(delta_c) * lr

    print(f"k={a},b={b},c={c}")

    # Interactive prediction loop (runs until interrupted).
    while True:
        # Scale the inputs the same way as the training features.
        year = (float(input("请输入年份:")) - 2000) / 22
        floor = float(input("请输入楼层:")) / 23
        # Scale the prediction back to the original price range.
        print("预测房价:", (a * year + b * floor + c) * 60000)

输出: k=0.535609459528955, b=-0.2391682413913525, c=0.31663211120152795
请输入年份:2023
请输入楼层:1
预测房价: 51971.33016381589
请输入年份:2035
请输入楼层:5
预测房价: 67004.69841530433

相关链接

bilibili
GitHub