Introduction
Line fitting means finding, from data $X$ and $Y$, the parameters $k$ and $b$ such that $k \times X + b$ is as close as possible to $Y$. What "closest" means depends on the choice of distance; the usual choice is the Euclidean (L2) distance. This article walks through several line-fitting methods.
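With the L2 distance this becomes the familiar sum-of-squares objective over the $m$ data points:

$$\min_{k,\,b}\; \sum_{i=1}^{m} \bigl(k x_i + b - y_i\bigr)^2$$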
Generating the Data
Generate the data: an ideal line (red) and noisy observations around it (blue points).
```python
import numpy as np
import matplotlib.pyplot as plt

# randomly pick 100 distinct x values from [-2, 35] on a 0.01 grid
X = np.random.choice(range(-200, 3501), 100, replace=False) / 100
k = -3
b = -10
Y_ideal = k * X + b                                   # ideal values
Y_real = Y_ideal + np.random.randn(X.shape[0]) * 20   # observed values with noise
plt.plot(X, Y_ideal, label='ideal', color='r')
plt.scatter(X, Y_real, label='real', color='b')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.savefig(r'E:\particle\imgs\fitLine\1.png')
```
A helper function to display the results:
```python
def show(k, b):
    """Plot the fitted line (green) against the ideal line and the noisy data."""
    plt.plot(X, X * k + b, label='pred', color='g')
    plt.plot(X, Y_ideal, label='ideal', color='r')
    plt.scatter(X, Y_real, label='real', color='b')
    plt.legend()
```
Least Squares
Least squares measures the L2 distance between the observed values and the model's predictions, and obtains the line parameters directly in closed form by solving a linear system.
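Concretely, stack the data into a design matrix $X \in \mathbb{R}^{m \times 2}$ whose second column is all ones, and let $W = (k, b)^\top$. Setting the gradient of $\lVert XW - Y \rVert^2$ to zero yields the normal equations, which is exactly what the code below computes:

$$W = (X^\top X)^{-1} X^\top Y$$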
```python
def leastSquaresMethod(X, Y):
    # append a column of ones so the intercept b is absorbed into W
    X = np.hstack([X[:, None], np.ones((X.shape[0], 1))])  # (m, 2)
    # normal equations: W = (X^T X)^{-1} X^T Y
    W = np.linalg.inv(X.T @ X) @ X.T @ Y
    return W[0], W[1]

k, b = leastSquaresMethod(X, Y_real)
print(k, b)
show(k, b)
```

PyTorch Linear Regression
The least squares method above is simple, but when there is too little data or the dimensionality is too high, computing the matrix inverse becomes infeasible. In that case the parameters can instead be fitted iteratively by gradient descent.
```python
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self, k=1, b=0):
        super().__init__()
        weight = torch.tensor([k]).float()
        bias = torch.tensor([b]).float()
        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias)

    def forward(self, x):
        return x * self.weight + self.bias

def wtf(X, Y):
    net = Net()
    # cast to float32 to match the parameter dtype
    X_pytorch = torch.from_numpy(X).float()
    Y_pytorch = torch.from_numpy(Y).float()
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.002)
    for epoch in range(1000):
        optimizer.zero_grad()
        Y_pred = net(X_pytorch)
        loss = criterion(Y_pred, Y_pytorch)
        # print(loss.item())
        loss.backward()
        optimizer.step()
    return net.weight.item(), net.bias.item()

k, b = wtf(X, Y_real)
print(k, b)
show(k, b)
```

NumPy Linear Regression
Same idea as the PyTorch version, except that the gradients are derived by hand, as sketched below.
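With the mean squared error $L = \frac{1}{m}\sum_{i=1}^{m}(k x_i + b - y_i)^2$, differentiating with respect to the two parameters gives

$$\frac{\partial L}{\partial k} = \frac{2}{m}\sum_{i=1}^{m}(k x_i + b - y_i)\,x_i, \qquad \frac{\partial L}{\partial b} = \frac{2}{m}\sum_{i=1}^{m}(k x_i + b - y_i)$$

which the code accumulates with dot products and then divides by the batch size.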
```python
def linearRegression(X, Y):
    k = 1
    b = 0
    lr = 0.002
    batch_size = X.shape[0]
    for epoch in range(1000):
        Y_pred = X * k + b
        err = (Y_pred - Y) ** 2            # per-sample squared error, shape (m,)
        derrdy = 2 * (Y_pred - Y)          # dL/dY_pred, shape (m,)
        derrdk = derrdy @ X                # dL/dk, summed over the batch
        derrdb = derrdy @ np.ones_like(Y)  # dL/db, summed over the batch
        # the summed gradients scale with the number of samples,
        # so divide by batch_size to get the mean gradient
        k -= lr * derrdk / batch_size
        b -= lr * derrdb / batch_size
    return k, b

k, b = linearRegression(X, Y_real)
print(k, b)
show(k, b)
```

RANSAC
RANSAC initializes the line parameters from a randomly sampled subset of the data, then uses the fitted line to split the points by their deviation into inliers and outliers, and refits the parameters using only the inliers. This alternating process is reminiscent of EM.
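The deviation used here is the perpendicular distance from a point $(x_i, y_i)$ to the line $kx - y + b = 0$:

$$\text{dist}_i = \frac{\lvert k x_i - y_i + b \rvert}{\sqrt{k^2 + 1}}$$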
```python
def ransac(X, Y, d=4, iteration=100):
    n = X.shape[0]
    m = min(10, max(2, n // 10))  # size of the initial random sample
    indexs = np.random.choice(range(n), size=m, replace=False)
    max_num = 0
    res = None
    for i in range(iteration):
        # fit a line to the current consensus set
        k, b = leastSquaresMethod(X[indexs], Y[indexs])
        # perpendicular distance of every point to the line y = kx + b
        dist = np.abs(k * X - Y + b) / np.sqrt(k ** 2 + 1)
        indexs = dist < d         # points within threshold d are inliers
        num_inlier = indexs.sum()
        if num_inlier > max_num:  # keep the model with the largest consensus
            max_num = num_inlier
            res = (k, b)
    return res

k, b = ransac(X, Y_real, d=10, iteration=100)
print(k, b)
show(k, b)
```
