本章主要讲述线性回归。

来源：《动手学深度学习》（PyTorch 版）

向量化加速

%matplotlib inline
import math
import time
import numpy as np
import torch
from d2l import torch as d2l

1
2
3

# Two length-n vectors of ones, used below to compare loop-based and
# vectorized addition.
n = 10_000
a, b = torch.ones(n), torch.ones(n)

#基准测试
class Timer: #@save
    """Record the durations of multiple timed runs.

    Timing uses ``time.perf_counter()``, a monotonic high-resolution clock
    meant for measuring short intervals. ``time.time()`` is a wall clock
    whose resolution can be too coarse for very fast operations (they then
    report an elapsed time of exactly 0) and which may jump if the system
    clock is adjusted.
    """

    def __init__(self):
        # Elapsed time of every completed start()/stop() pair, in seconds.
        self.times = []
        # Timing begins immediately on construction.
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        self.tik = time.perf_counter()

    def stop(self):
        """Stop the timer, record the elapsed seconds, and return them."""
        self.times.append(time.perf_counter() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the average recorded time.

        Raises ZeroDivisionError if no run has been recorded yet.
        """
        return sum(self.times) / len(self.times)

    def Sum(self):
        """Return the total of all recorded times."""
        return sum(self.times)

    def cumsum(self):
        """Return the running (cumulative) totals of the recorded times as a list."""
        # NumPy computes the cumulative sum; tolist() converts the result
        # back to a plain Python list.
        return np.array(self.times).cumsum().tolist()

# Benchmark 1: add the two vectors one element at a time in a Python for-loop.
c = torch.zeros(n)
timer = Timer()  # constructing the Timer also starts it
for i in range(n):
    c[i] = a[i] + b[i]
print(f'{timer.stop(): .5f} sec')  # elapsed time of the loop
print(c)
print(a)
print(b)
# Only one run has been recorded so far, so the cumulative list holds a
# single entry equal to the total.
print(timer.cumsum(),timer.Sum())

 0.12451 sec
tensor([2., 2., 2.,  ..., 2., 2., 2.])
tensor([1., 1., 1.,  ..., 1., 1., 1.])
tensor([1., 1., 1.,  ..., 1., 1., 1.])
[0.1245124340057373] 0.1245124340057373

1
2
3

# Benchmark 2: the same addition as a single vectorized tensor operation.
timer.start()
d = a + b
# Bare expression: in a notebook its value is displayed as the cell output.
f'{timer.stop() : .9f} sec'

' 0.000000000 sec'

正态分布与平方损失

def normal(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at x.

    Args:
        x: Point(s) at which to evaluate the density; a scalar or a NumPy
            array (the result then has the same shape).
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        The density value(s) 1/sqrt(2*pi*sigma^2) * exp(-(x-mu)^2 / (2*sigma^2)).
    """
    variance = sigma ** 2
    norm_const = 1 / np.sqrt(2 * math.pi * variance)
    exponent = (-0.5 / variance) * (x - mu) ** 2
    return norm_const * np.exp(exponent)

# Evaluation grid for the density curves.
x = np.arange(-7, 7, 0.01)

# (mean, standard deviation) pairs, one per curve
params = [(0, 1), (0, 2), (3, 1)]
d2l.plot(
    x,
    [normal(x, mu, sigma) for mu, sigma in params],
    xlabel='x',
    ylabel='p(x)',
    figsize=(4.5, 2.5),
    legend=[f'mean {mu}, std {sigma}' for mu, sigma in params],
)

正如我们所看到的，改变均值会使曲线沿x轴平移，增大方差则会使分布更分散、峰值降低。均方误差损失函数能用于线性回归的一个原因是：我们假设观测中包含噪声，且噪声服从正态分布。

小结
1.机器学习模型中的关键要素是训练数据集、损失函数、优化算法、以及模型本身。
2.向量化使数学表达更简洁，同时计算更快。
3.最小化目标函数和执行极大似然估计等价
4.线性回归模型也是一个简单的神经网络

当数据集较小时，解析解可能比随机梯度下降更好。然而，在大型数据集上，计算解析解可能会非常耗时，或者存在多个局部最小的情况。此外，当矩阵 $X^\top X$ 不可逆时，解析解不存在。在这种情况下，需要使用正则化或数值优化方法。

由于使用了绝对值函数作为损失函数，梯度在接近驻点（即梯度接近零的点）时，梯度不会平滑地趋向于零，而是存在突变。当使用SGD算法，不断更新参数时，可能导致模型无法稳定收敛。

解决该问题的方法：

使用平滑的损失函数，可使用MSE、Smooth L1损失函数等。
调整学习率，逐渐减小学习率，使得在驻点附近的参数更新更加稳定
使用动量法或自适应学习率优化算法

从0开始线性回归

%matplotlib inline 
import torch
import random
from d2l import torch as d2l

def synthetic_data(w, b, num_examples):  #@save
    """Generate a synthetic dataset following y = Xw + b + noise.

    Args:
        w: 1-D weight tensor; its length sets the number of feature columns.
        b: Bias added to every label.
        num_examples: Number of samples (rows) to generate.

    Returns:
        A tuple (x, y): x has shape (num_examples, len(w)) with entries drawn
        from a standard normal distribution; y holds the noisy labels
        reshaped into a single column.
    """
    num_features = len(w)
    x = torch.normal(0, 1, (num_examples, num_features))
    clean = torch.matmul(x, w) + b
    # Gaussian observation noise with standard deviation 0.01.
    noise = torch.normal(0, 0.01, clean.shape)
    noisy = clean + noise
    # -1 lets reshape infer the row count, turning the labels into a column.
    return x, noisy.reshape((-1, 1))

# Ground-truth parameters of the linear model used to synthesize the data.
true_w, true_b = torch.tensor([2, -3.4]), 4.2
features, labels = synthetic_data(true_w, true_b, num_examples=1000)
# Each row of features is one two-feature sample; each row of labels holds
# the single label value for that sample.
print('features:', features[0], '\nlabels:', labels[0])

features: tensor([-1.5115, -2.4289]) 
labels: tensor([9.4437])

绘制 features 的第二个特征与 labels 的散点图，可以直观地观察到两者之间的线性关系

1
2

# Configure the figure size through the d2l helper before plotting.
d2l.set_figsize()
# detach() returns a tensor sharing the same data but cut off from autograd;
# numpy() then exposes it as an array for matplotlib. The third positional
# argument of scatter is the marker size.
d2l.plt.scatter(
    features[:, 1].detach().numpy(),
    labels.detach().numpy(),
    1,
)

<matplotlib.collections.PathCollection at 0x230fd7fe130>

读取数据集

def data_iter(batch_size, features, labels):
    """Yield shuffled minibatches of (features, labels).

    Args:
        batch_size: Number of examples per minibatch.
        features: Indexable collection of examples; its length sets the
            dataset size.
        labels: Labels aligned with features along the first dimension.

    Yields:
        Tuples (features[idx], labels[idx]) where idx is a tensor of up to
        batch_size shuffled indices; the final batch is shorter when the
        dataset size is not a multiple of batch_size.
    """
    total = len(features)
    order = list(range(total))
    # Visit the examples in random order, with no fixed sequence.
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the last batch simply
        # contains the remaining total - start indices.
        picked = torch.tensor(order[start:start + batch_size])
        yield features[picked], labels[picked]

yield关键字用于定义生成器函数。生成器函数是一种特殊的迭代器，它允许你在函数执行过程中暂停和恢复，从而节省内存。当你调用一个生成器函数时，它会返回一个生成器对象，而不是直接执行函数体。要获取生成器的下一个值，你需要使用next()函数或者在循环中使用for语句。在本例中，每次yield返回features[batch_indices]和labels[batch_indices]，这两个张量分别包含了当前批次的特征和标签。

batch_size = 10
# Pull just the first minibatch from the generator and display it.
X, y = next(data_iter(batch_size, features, labels))
print(X, "\n", y)

tensor([[-1.7161,  0.5275],
        [ 0.5354, -0.6246],
        [ 0.8205,  0.0515],
        [-2.5739,  1.4279],
        [ 0.8007,  0.0255],
        [ 0.4949,  0.1812],
        [-0.4580,  0.1285],
        [-0.7347,  0.9106],
        [-0.7922, -0.7991],
        [-0.6561,  0.4373]]) 
 tensor([[-1.0201],
        [ 7.3861],
        [ 5.6638],
        [-5.8160],
        [ 5.7303],
        [ 4.5611],
        [ 2.8534],
        [-0.3845],
        [ 5.3452],
        [ 1.3960]])

初始化模型参数

# Parameters to learn: weights drawn from a Gaussian with mean 0 and
# standard deviation 0.01, and a bias initialised to zero. Both are tracked
# by autograd.
w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)
b = torch.zeros(1).requires_grad_()

#定义模型

def linreg(X, w, b): #@save
    """Linear regression model: return the prediction Xw + b."""
    projection = X @ w
    return projection + b

#定义损失函数

def squared_loss(y_hat, y): #@save
    """Squared loss per example: (y_hat - y)^2 / 2, with no reduction.

    y is reshaped to y_hat's shape first so the subtraction is element-wise
    rather than broadcast.
    """
    residual = y_hat - y.reshape(y_hat.shape)
    return residual ** 2 / 2

#定义优化算法

def sgd(params, lr, batch_size): #@save
    """Perform one minibatch stochastic gradient descent step in place.

    Args:
        params: Iterable of tensors whose .grad has been populated.
        lr: Learning rate, which sets the size of each update.
        batch_size: Minibatch size; the gradient is divided by it so the
            step size does not depend on the batch size.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for p in params:
            p.sub_(lr * p.grad / batch_size)
            # Reset the gradient so the next backward pass starts from zero.
            p.grad.zero_()

# Training configuration: learning rate, number of epochs, and aliases for
# the model and loss functions used in the loop.
lr = 0.03
num_epochs = 3 # three full passes over the dataset
net = linreg
loss = squared_loss
for epoch in range(num_epochs):
    for X,y in data_iter(batch_size,features,labels):
        l = loss(net(X,w,b),y) # minibatch loss on X and y; net(X,w,b) is the forward pass producing the model output
        # X is a 2-D batch: each row is one sample, each column one feature.
        # l is not a scalar (the original author notes its shape is (batch_size,1)),
        # so its elements are summed and the sum is backpropagated to obtain
        # the gradient with respect to [w,b].
        l.sum().backward()
        sgd([w,b], lr, batch_size) # update the parameters using their gradients
    # Evaluate the loss on the full dataset after each epoch, without
    # tracking gradients.
    with torch.no_grad():
        train_l = loss(net(features,w,b),labels)
        print(f'epoch {epoch+1},loss {float(train_l.mean()): f}')

epoch 1,loss  0.044458
epoch 2,loss  0.000173
epoch 3,loss  0.000050

观道观

深度学习第二篇——线性神经网络

从0开始线性回归