本章主要讲述数据预处理。

来源：《动手学深度学习》（PyTorch版）

数据预处理

# Create a tiny housing dataset and store it as a CSV file
import os
data_file = os.path.join('..', 'data', 'house_tiny.csv')
os.makedirs(os.path.dirname(data_file), exist_ok=True)
with open(data_file, 'w') as f:
    f.writelines((
        'NumRooms,Alley,Price\n',  # header row with the column names
        'NA,Pave,127500\n',  # every following row is one data sample
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ))

1
2
3

import pandas as pd
data=pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000

处理缺失值

# NaN marks a missing value.
# Split data by column position with iloc: column 0 is the input, column 2 the output.
inputs = data.iloc[:, 0:1]
outputs = data.iloc[:, 2]
# Fill every missing input with the mean of its own column.
inputs = inputs.fillna(inputs.mean())
print(inputs)

   NumRooms
0       3.0
1       2.0
2       4.0
3       3.0

1
2
3

inputs1=data.iloc[:,1:2]
inputs = pd.concat([inputs,inputs1],axis=1)
inputs

	NumRooms	Alley
0	3.0	Pave
1	2.0	NaN
2	4.0	NaN
3	3.0	NaN

我们将NaN视为一个类别。由于Alley列只有两种类别值Pave和NaN，pandas可以自动将此列转换为两列：Alley_Pave和Alley_nan（值为1表示该样本属于该类别，为0表示不属于）。

# One-hot encode the categorical column; dummy_na=True gives NaN its own column.
# dtype=int turns the boolean indicators that get_dummies returns into 0/1 integers.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1

转换为张量格式

1
2
3

import torch
X,y = torch.tensor(inputs.values),torch.tensor(outputs.values)
X,y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

线性代数

#标量
x = torch.tensor(3.0)
y = torch.tensor(2.0)
x + y, x * y, x / y, x**y

(tensor(5.), tensor(6.), tensor(1.5000), tensor(9.))

1
2
3

#向量
x = torch.arange(4)
x

tensor([0, 1, 2, 3])

x[3]  # access any element by its index

tensor(3)

len(x)  # number of elements in the vector

x.shape  # shape of the vector

torch.Size([4])

矩阵

# Build a 5x4 matrix holding the integers 0..19, then display it
A = torch.arange(20).reshape(5, 4)
A

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15],
        [16, 17, 18, 19]])

1
2

#矩阵转置
A.T

tensor([[ 0,  4,  8, 12, 16],
        [ 1,  5,  9, 13, 17],
        [ 2,  6, 10, 14, 18],
        [ 3,  7, 11, 15, 19]])

# A symmetric matrix equals its own transpose
B = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])
B, B == B.T

(tensor([[1, 2, 3],
         [2, 0, 4],
         [3, 4, 5]]),
 tensor([[True, True, True],
         [True, True, True],
         [True, True, True]]))

张量

##当我们处理图像时，张量将变得更加重要：图像以n维数组形式出现，其中3个轴分别对应高度、宽度，以及一个用于表示颜色通道（红色、绿色和蓝色）的通道轴。

# A 3-axis tensor; len() reports the size of the first axis
X = torch.arange(24).reshape(2, 3, 4)
X, len(X)

(tensor([[[ 0,  1,  2,  3],
          [ 4,  5,  6,  7],
          [ 8,  9, 10, 11]],

         [[12, 13, 14, 15],
          [16, 17, 18, 19],
          [20, 21, 22, 23]]]),
 2)

张量算法的基本性质

1
2
3

A = torch.arange(20,dtype=torch.float32).reshape(5,4)
B = A.clone() # 通过分配新内存，将A的一个副本分配给B
A, A + B # 对于相同形状的矩阵，按元素的二元运算结果都将是形状相同的张量

(tensor([[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.],
         [16., 17., 18., 19.]]),
 tensor([[ 0.,  2.,  4.,  6.],
         [ 8., 10., 12., 14.],
         [16., 18., 20., 22.],
         [24., 26., 28., 30.],
         [32., 34., 36., 38.]]))

A * B  # elementwise multiplication, known as the Hadamard product

tensor([[  0.,   1.,   4.,   9.],
        [ 16.,  25.,  36.,  49.],
        [ 64.,  81., 100., 121.],
        [144., 169., 196., 225.],
        [256., 289., 324., 361.]])

1
2
3

a = 2
X = torch.arange(24).reshape(2,3,4)
a + X, (a * X).shape

(tensor([[[ 2,  3,  4,  5],
          [ 6,  7,  8,  9],
          [10, 11, 12, 13]],

         [[14, 15, 16, 17],
          [18, 19, 20, 21],
          [22, 23, 24, 25]]]),
 torch.Size([2, 3, 4]))

降维

x = torch.arange(4, dtype=torch.float32)
x, x.sum()  # sum() adds up all elements

(tensor([0., 1., 2., 3.]), tensor(6.))

A.shape, A.sum()

(torch.Size([5, 4]), tensor(190.))

1
2
3

#指定某个轴降低维度
A_sum_axis0=A.sum(axis=0)
A_sum_axis0,A_sum_axis0.shape

(tensor([40., 45., 50., 55.]), torch.Size([4]))

# Summing along axis 1 removes that axis from the result
A_sum_axis1 = A.sum(axis=1)
A_sum_axis1, A_sum_axis1.shape

(tensor([ 6., 22., 38., 54., 70.]), torch.Size([5]))

# Summing over both the rows and the columns adds up every element of the matrix
A.sum(axis=[0, 1])  # same result as A.sum()

tensor(190.)

A.mean(), A.sum() / A.numel()

(tensor(9.5000), tensor(9.5000))

A.mean(axis=0), A.sum(axis=0) / A.shape[0]

(tensor([ 8.,  9., 10., 11.]), tensor([ 8.,  9., 10., 11.]))

非降维求和

# keepdims=True keeps the summed axis with size 1, so the result still has two axes
sum_A = A.sum(axis=1, keepdims=True)
sum_A

tensor([[ 6.],
        [22.],
        [38.],
        [54.],
        [70.]])

A / sum_A

tensor([[0.0000, 0.1667, 0.3333, 0.5000],
        [0.1818, 0.2273, 0.2727, 0.3182],
        [0.2105, 0.2368, 0.2632, 0.2895],
        [0.2222, 0.2407, 0.2593, 0.2778],
        [0.2286, 0.2429, 0.2571, 0.2714]])

A.cumsum(axis=0)  # cumulative sum along axis 0

tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  6.,  8., 10.],
        [12., 15., 18., 21.],
        [24., 28., 32., 36.],
        [40., 45., 50., 55.]])

点积

y = torch.ones(4, dtype=torch.float32)
x, y, torch.dot(x, y)

(tensor([0., 1., 2., 3.]), tensor([1., 1., 1., 1.]), tensor(6.))

# The dot product can also be computed as an elementwise product followed by a sum
torch.sum(x * y)

tensor(6.)

矩阵-向量积

# Matrix-vector product from linear algebra: (m x n) times (n x 1) gives (m x 1)
A.shape, x.shape, torch.mv(A, x)

(torch.Size([5, 4]), torch.Size([4]), tensor([ 14.,  38.,  62.,  86., 110.]))

矩阵-矩阵乘法

B = torch.ones(4, 3)
torch.mm(A, B)  # matrix-matrix product, not the elementwise Hadamard product

tensor([[ 6.,  6.,  6.],
        [22., 22., 22.],
        [38., 38., 38.],
        [54., 54., 54.],
        [70., 70., 70.]])

范数

u = torch.tensor([3.0, -4.0])
torch.norm(u)  # L2 norm

tensor(5.)

L1范数是对每个元素取绝对值后求和；L2范数是对每个元素取平方后求和，再开平方；一般地，Lp范数是对每个元素的绝对值取p次方后求和，再开p次方，即 $\|\mathbf{x}\|_p = \left(\sum_i |x_i|^p\right)^{1/p}$。

torch.abs(u).sum()  # L1 norm

tensor(7.)

弗罗贝尼乌斯范数（Frobenius范数）是矩阵所有元素平方和的平方根，即 $\|\mathbf{X}\|_F = \sqrt{\sum_i \sum_j x_{ij}^2}$。

torch.norm(torch.ones((4, 9)))  # Frobenius norm of a 4x9 matrix of ones

tensor(6.)

微积分

%matplotlib inline
import numpy as np
from matplotlib_inline import backend_inline
from d2l import torch as d2l
def f(x):
    """Evaluate the polynomial 3*x**2 - 4*x at ``x``."""
    quadratic_term = 3 * x ** 2
    linear_term = 4 * x
    return quadratic_term - linear_term

def numerical_lim(f,x,h):
    """Return the forward difference quotient of ``f`` at ``x`` with step ``h``."""
    rise = f(x + h) - f(x)
    return rise / h
# Print the difference quotient of f at x=1 for five step sizes,
# multiplying the step h by 0.1 after every pass.
h=0.1
for i in range(5):
    # NOTE(review): "numerocal" in the message looks like a typo for "numerical";
    # the string is left untouched here because it is runtime output.
    print(f'h={h:.5f},numerocal limit = {numerical_lim(f,1,h):.5f}')
    h*=0.1

h=0.10000,numerocal limit = 2.30000
h=0.01000,numerocal limit = 2.03000
h=0.00100,numerocal limit = 2.00300
h=0.00010,numerocal limit = 2.00030
h=0.00001,numerocal limit = 2.00003

use_svg_display函数指定matplotlib包输出svg图表以获得更清晰的图像

@save是一个特殊的标记，会将对应的函数、类或语句保存在d2l包中

1
2
3

def use_svg_display(): #@save
    """Select 'svg' as the figure format of the inline backend (for Jupyter display)."""
    backend_inline.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5,2.5)): #@save
    """Switch to SVG output and set the figure size used by ``d2l.plt``.

    Args:
        figsize: value assigned to ``d2l.plt.rcParams['figure.figsize']``;
            defaults to ``(3.5, 2.5)``.
    """
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize

#@save
def set_axes(axes,xlabel,ylabel,xlim,ylim,xscale,yscale,legend):
    """Apply labels, scales, limits, an optional legend and a grid to ``axes``."""
    # Setters run in this fixed order: labels first, then scales, then limits.
    settings = (
        ("set_xlabel", xlabel),
        ("set_ylabel", ylabel),
        ("set_xscale", xscale),
        ("set_yscale", yscale),
        ("set_xlim", xlim),
        ("set_ylim", ylim),
    )
    for setter_name, value in settings:
        getattr(axes, setter_name)(value)
    # A falsy legend (None or an empty list) means no legend call is made.
    if legend:
        axes.legend(legend)
    axes.grid()

#@save
def plot(X, Y=None,xlabel=None,ylabel=None,legend=None,xlim=None,
        ylim=None,xscale='linear',yscale='linear',
        fmts=('-','m--','g-.','r:'),figsize=(3.5,2.5),axes=None):
    """Plot one or more curves on a single axes.

    Args:
        X: x-values, or the y-values themselves when ``Y`` is None.
        Y: optional y-values; a single curve or a list of curves.
        xlabel, ylabel, xlim, ylim, xscale, yscale, legend: passed on
            unchanged to ``set_axes``; ``legend=None`` is replaced by ``[]``.
        fmts: format strings paired with the curves through ``zip``, so at
            most ``len(fmts)`` curves are drawn.
        figsize: passed on to ``set_figsize``.
        axes: axes to draw on; when falsy, ``d2l.plt.gca()`` is used.
    """
    if legend is None:
        legend = []
    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()
    
    # True when X has a single axis: either it exposes ndim == 1, or it is a
    # list whose first element has no __len__ (``and`` binds tighter than ``or``).
    def has_one_axis(X):
        return (hasattr(X,"ndim") and X.ndim == 1 or isinstance(X,list)
               and not hasattr(X[0],"__len__"))
    # Wrap a single curve in a list so one curve and many curves share a code path.
    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only one data argument was given: use it as Y and pair each curve
        # with an empty x sequence.
        X, Y = [[]]*len(X),X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        # Repeat the x list len(Y) times -- presumably so that one shared x
        # series can be paired with every curve in Y.
        X = X * len(Y)
    # Clear the axes before drawing (``cla`` as provided by the axes object).
    axes.cla()
    for x,y,fmt in zip(X,Y,fmts):
        if len(x):
            axes.plot(x,y,fmt)
        else:
            # Empty x: plot y on its own.
            axes.plot(y,fmt)
    set_axes(axes,xlabel,ylabel,xlim,ylim,xscale,yscale,legend)

# Plot f together with the line 2*x - 3, labelled as the tangent at x=1
x = np.arange(0, 3, 0.1)
plot(x, [f(x), 2 * x - 3], 'x', 'f(x)', legend=['f(x)', 'Tangent line (x=1)'])

import matplotlib.pyplot as plt
def f(x):
    """Evaluate x**3 - 1/x at ``x`` (``x`` must be non-zero)."""
    cube = x ** 3
    reciprocal = 1 / x
    return cube - reciprocal
def f_tangent(f,x,point,h=1e-4):
    """Evaluate at ``x`` the tangent line of ``f`` taken at ``point``.

    The slope is approximated by the forward difference quotient
    ``(f(point + h) - f(point)) / h``.

    Args:
        f: function to differentiate.
        x: value(s) at which the tangent line is evaluated.
        point: abscissa at which the tangent touches ``f``.
        h: step of the difference quotient; defaults to ``1e-4``.

    Returns:
        ``slope * (x - point) + f(point)``.
    """
    # Evaluate f(point) once; it is needed for both the slope and the intercept.
    f_at_point = f(point)
    grad = (f(h + point) - f_at_point) / h
    return grad * (x - point) + f_at_point
# Sample x from 0.1 up to (not including) 2.0 in steps of 0.01;
# the range presumably starts above 0 because f divides by x.
x=np.arange(0.1,2.0,0.01)
y=f(x)
# Tangent line of f taken at the point x=1, evaluated on the same grid
y_tangent=f_tangent(f,x,1)
# Draw the curve and its tangent line, then add legend, title and axis labels
plt.plot(x,y,label="f(x)")
plt.plot(x,y_tangent,label="Tangent line at x=1")
plt.legend()
plt.title('Graph of f(x) and its tangent line at x=1')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()

1
2
3

#利用代码验证
#requires_grad设置为true，它将开始追踪在其上的所有操作，这样就可以利用链式法则进行梯度传播
x = torch.arange(2.0,requires_grad=True) # 将梯度附加到其想要计算的偏导数的变量

#例一
#grad_fn 该属性即创建该Tensor的Function,就是说该Tensor是不是通过某些运算得到的，若是，则grad_fn返回与这些运算相关的对象，否则是None
x = torch.ones(2,2,requires_grad=True)
print(x)
print(x.grad_fn)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
None

#例二
x = torch.ones(2,2,requires_grad=True)
y = x + 2
print(y)
print(y.grad_fn)

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x000002278E2D5D00>

is_leaf说明
注意x是直接创建的，所以它没有grad_fn；而y是通过一个加法操作创建的，所以它有一个grad_fn。
像x这种直接创建的张量称为叶子节点，叶子节点对应的grad_fn是None。

print(x.is_leaf, y.is_leaf)

True False

#另一种改变requires_grad的方法
a = torch.randn(2,2)
print(a)
a=((a*3)/(a-1))
print(a)
print(a.requires_grad)
a.requires_grad_(True)
y=(a*a).sum()
print(y)
print(y.requires_grad)

tensor([[ 0.5309, -0.5969],
        [-0.9133, -1.4638]])
tensor([[-3.3949,  1.1214],
        [ 1.4320,  1.7824]])
False
tensor(18.0100, grad_fn=<SumBackward0>)
True

#使用backward()计算梯度
#注意在使用y.backward()时，若y是标量，则可以直接调用；否则需要传入一个与y形状相同的tensor作为gradient参数
x = torch.ones(2,2,requires_grad=True)
y = x+2
z = y*y*3
out = z.mean()
print(z)
print(out)
out.backward()
print(x.grad)

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>)
tensor(27., grad_fn=<MeanBackward0>)
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])

grad.zero_()说明
x.grad.zero_()用来清除x的梯度，也就是把x.grad原地重新赋值为0。需要注意的是，只有当x已经被求过一次梯度时才能调用它；在此之前x.grad是None，调用会报错。

#再来一次反向传播，注意grad是累加的
out2 = x.sum()
print(out2)
out2.backward()
print(x.grad)

out3 = x.sum()
x.grad.zero_()
out3.backward()
print(x.grad)

tensor(4., grad_fn=<SumBackward0>)
tensor([[5.5000, 5.5000],
        [5.5000, 5.5000]])
tensor([[1., 1.],
        [1., 1.]])

观道观

深度学习第一篇——数据预处理