Introduction
Line fitting means finding, from data $X$ and $Y$, the parameters $k$ and $b$ such that $k \times X + b$ is as close as possible to $Y$. What "closest" means depends on the choice of distance; the usual choice is the Euclidean (L2) distance. This article walks through several line-fitting methods.
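With the L2 distance this becomes the familiar sum-of-squares objective over the $m$ data points:

$$\min_{k,\,b}\; \sum_{i=1}^{m} \bigl(k x_i + b - y_i\bigr)^2$$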
Generating the Data
Generate the data: an ideal line (red) and noisy observations around it (blue points).
```python
import numpy as np
import matplotlib.pyplot as plt

# randomly pick 100 distinct x values from [-2, 35] on a 0.01 grid
X = np.random.choice(range(-200, 3501), 100, replace=False) / 100
k = -3
b = -10
Y_ideal = k * X + b                                   # ideal values
Y_real = Y_ideal + np.random.randn(X.shape[0]) * 20   # observed values with noise
plt.plot(X, Y_ideal, label='ideal', color='r')
plt.scatter(X, Y_real, label='real', color='b')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.savefig(r'E:\particle\imgs\fitLine\1.png')
```
A helper function to display the results:
```python
def show(k, b):
    """Plot the fitted line (green) against the ideal line and the noisy data."""
    plt.plot(X, X * k + b, label='pred', color='g')
    plt.plot(X, Y_ideal, label='ideal', color='r')
    plt.scatter(X, Y_real, label='real', color='b')
    plt.legend()
```
Least Squares
Least squares measures the L2 distance between the observed values and the model's predictions, and obtains the line parameters directly in closed form by solving a linear system.
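Concretely, stack the data into a design matrix $X \in \mathbb{R}^{m \times 2}$ whose second column is all ones, and let $W = (k, b)^\top$. Setting the gradient of $\lVert XW - Y \rVert^2$ to zero yields the normal equations, which is exactly what the code below computes:

$$W = (X^\top X)^{-1} X^\top Y$$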
```python
def leastSquaresMethod(X, Y):
    # append a column of ones so the intercept b is absorbed into W
    X = np.hstack([X[:, None], np.ones((X.shape[0], 1))])  # (m, 2)
    # normal equations: W = (X^T X)^{-1} X^T Y
    W = np.linalg.inv(X.T @ X) @ X.T @ Y
    return W[0], W[1]

k, b = leastSquaresMethod(X, Y_real)
print(k, b)
show(k, b)
```

PyTorch Linear Regression
The least squares method above is simple, but when there is too little data or the dimensionality is too high, computing the matrix inverse becomes infeasible. In that case the parameters can instead be fitted iteratively by gradient descent.
```python
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self, k=1, b=0):
        super().__init__()
        weight = torch.tensor([k]).float()
        bias = torch.tensor([b]).float()
        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias)

    def forward(self, x):
        return x * self.weight + self.bias

def wtf(X, Y):
    net = Net()
    # cast to float32 to match the parameter dtype
    X_pytorch = torch.from_numpy(X).float()
    Y_pytorch = torch.from_numpy(Y).float()
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.002)
    for epoch in range(1000):
        optimizer.zero_grad()
        Y_pred = net(X_pytorch)
        loss = criterion(Y_pred, Y_pytorch)
        # print(loss.item())
        loss.backward()
        optimizer.step()
    return net.weight.item(), net.bias.item()

k, b = wtf(X, Y_real)
print(k, b)
show(k, b)
```

NumPy Linear Regression
Same idea as the PyTorch version, except that the gradients are derived by hand, as sketched below.
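With the mean squared error $L = \frac{1}{m}\sum_{i=1}^{m}(k x_i + b - y_i)^2$, differentiating with respect to the two parameters gives

$$\frac{\partial L}{\partial k} = \frac{2}{m}\sum_{i=1}^{m}(k x_i + b - y_i)\,x_i, \qquad \frac{\partial L}{\partial b} = \frac{2}{m}\sum_{i=1}^{m}(k x_i + b - y_i)$$

which the code accumulates with dot products and then divides by the batch size.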
```python
def linearRegression(X, Y):
    k = 1
    b = 0
    lr = 0.002
    batch_size = X.shape[0]
    for epoch in range(1000):
        Y_pred = X * k + b
        err = (Y_pred - Y) ** 2            # per-sample squared error, shape (m,)
        derrdy = 2 * (Y_pred - Y)          # dL/dY_pred, shape (m,)
        derrdk = derrdy @ X                # dL/dk, summed over the batch
        derrdb = derrdy @ np.ones_like(Y)  # dL/db, summed over the batch
        # the summed gradients scale with the number of samples,
        # so divide by batch_size to get the mean gradient
        k -= lr * derrdk / batch_size
        b -= lr * derrdb / batch_size
    return k, b

k, b = linearRegression(X, Y_real)
print(k, b)
show(k, b)
```

RANSAC
RANSAC initializes the line parameters from a randomly sampled subset of the data, then uses the fitted line to split the points by their deviation into inliers and outliers, and refits the parameters using only the inliers. This alternating process is reminiscent of EM.
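The deviation used here is the perpendicular distance from a point $(x_i, y_i)$ to the line $kx - y + b = 0$:

$$\text{dist}_i = \frac{\lvert k x_i - y_i + b \rvert}{\sqrt{k^2 + 1}}$$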
```python
def ransac(X, Y, d=4, iteration=100):
    n = X.shape[0]
    m = min(10, max(2, n // 10))  # size of the initial random sample
    indexs = np.random.choice(range(n), size=m, replace=False)
    max_num = 0
    res = None
    for i in range(iteration):
        # fit a line to the current consensus set
        k, b = leastSquaresMethod(X[indexs], Y[indexs])
        # perpendicular distance of every point to the line y = kx + b
        dist = np.abs(k * X - Y + b) / np.sqrt(k ** 2 + 1)
        indexs = dist < d         # points within threshold d are inliers
        num_inlier = indexs.sum()
        if num_inlier > max_num:  # keep the model with the largest consensus
            max_num = num_inlier
            res = (k, b)
    return res

k, b = ransac(X, Y_real, d=10, iteration=100)
print(k, b)
show(k, b)
```
