The Backpropagation Algorithm for Neural Networks
This article introduces the training algorithm for fully connected artificial neural networks: backpropagation. Backpropagation is, at its core, gradient descent. An artificial neural network has many parameters, which makes computing the gradient complicated; it was only decades after artificial neural network models were first proposed that researchers devised backpropagation to solve the problem of training the deeper layers' parameters. This article explains the algorithm's principle and implementation in detail. It is part of our machine learning series.
1. Notation and Representation
Figure 1.1 A multi-layer fully connected neural network
Figure 1.2 A neuron

Training Process
For each sample presented to the neural network, update all weights once using Eq. [2.3]; training is complete when the error of every sample falls below a preset threshold. At this point you may wonder: shouldn't E be the mean of the squared error norms over all training samples (the mean squared error)? If we treat a sample's squared error norm as a random variable, then the average over all samples is an unbiased estimate of its expectation; but a single sample's squared error norm is also an unbiased estimate of that expectation, just a cruder one (by the law of large numbers). Training on one sample's error at a time saves computation and supports online learning (training on samples as they arrive).
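As a minimal sketch of this per-sample scheme (grad_E and sample_error are hypothetical stand-ins for the gradient of Eq. [2.3] and the sample's squared error norm; they are not code from this article):

# A minimal per-sample gradient-descent loop (a sketch, not the article's
# implementation). grad_E(w, x, y) and sample_error(w, x, y) are
# hypothetical stand-ins for Eq. [2.3]'s gradient and the sample's
# squared error norm.
def train_per_sample(w, samples, grad_E, sample_error, eta=0.1, threshold=1e-5):
    while True:
        worst = 0.0
        for x, y in samples:
            w = w - eta * grad_E(w, x, y)              # update on a single sample
            worst = max(worst, sample_error(w, x, y))  # track the largest sample error
        if worst < threshold:                          # stop once every error is small
            return w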




Figure 1.3 How each value in the network influences the loss function
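Figure 1.3 encodes the dependency chain that backpropagation exploits: a weight affects the loss only through the activity level of its own neuron, which in turn affects every neuron of the next layer. In conventional notation (a reconstruction, since the article's own symbols appear only in the figures), the chain rule gives

$$\frac{\partial E}{\partial w_{ij}^{(l)}} = \frac{\partial E}{\partial a_j^{(l)}}\,\frac{\partial a_j^{(l)}}{\partial w_{ij}^{(l)}} = \delta_j^{(l)}\, o_i^{(l-1)}, \qquad \delta_j^{(l)} := \frac{\partial E}{\partial a_j^{(l)}}$$

and, because $a_j^{(l)}$ feeds every neuron of layer $l+1$,

$$\delta_j^{(l)} = f_l'\!\left(a_j^{(l)}\right) \sum_k w_{kj}^{(l+1)}\, \delta_k^{(l+1)}.$$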

- Backpropagation phase:
- Weight-update phase:

The backpropagation and weight-update algorithm is summarized in matrix form as follows (see the equations below):

- Backpropagation phase:
- Weight-update phase:
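A standard matrix-form statement of the two phases, consistent with the implementation below (this is a reconstruction; here $a^{(l)} = W^{(l)} o^{(l-1)} + b^{(l)}$ is layer $l$'s activity level, $o^{(l)} = f_l(a^{(l)})$ its output, and $\odot$ the element-wise product), is:

$$\delta^{(L)} = (y - \hat{y}) \odot f_L'\!\left(a^{(L)}\right), \qquad \delta^{(l)} = \left((W^{(l+1)})^{\mathsf{T}}\, \delta^{(l+1)}\right) \odot f_l'\!\left(a^{(l)}\right)$$

$$W^{(l)} \leftarrow W^{(l)} + \eta\, \delta^{(l)} \left(o^{(l-1)}\right)^{\mathsf{T}}, \qquad b^{(l)} \leftarrow b^{(l)} + \eta\, \delta^{(l)}$$

Because the code propagates the error signal $d = y - \hat{y}$, its $\delta$ equals $-\partial E / \partial a$ for the squared-error loss, which is why the update adds the correction. The implementation below additionally averages the update over the mini-batch, adds an L2 regularization term, and accumulates the updates with momentum.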
Implementation
The author has since written a machine learning library whose ANN implementation improves on the example code in this article:
https://github.com/zhangjuefei/mentat
This section implements, in Python, a backpropagation neural network trained by mini-batch stochastic gradient descent, with momentum and learning-rate decay. The code is as follows:
import numpy as np


class DNN:
    def __init__(self, input_shape, shape, activations, eta=0.1, threshold=1e-5, softmax=False, max_epochs=1000,
                 regularization=0.001, minibatch_size=5, momentum=0.9, decay_power=0.5, verbose=False):
        if not len(shape) == len(activations):
            raise Exception("number of activations must equal number of layers.")

        self.depth = len(shape)
        self.activity_levels = [np.mat([0])] * self.depth  # pre-activation values, one matrix per layer
        self.outputs = [np.mat([0])] * (self.depth + 1)    # layer inputs/outputs (index 0 is the network input)
        self.deltas = [np.mat([0])] * self.depth           # back-propagated error signals

        self.eta = float(eta)
        self.effective_eta = self.eta
        self.threshold = float(threshold)
        self.max_epochs = int(max_epochs)
        self.regularization = float(regularization)
        self.is_softmax = bool(softmax)
        self.verbose = bool(verbose)
        self.minibatch_size = int(minibatch_size)
        self.momentum = float(momentum)
        self.decay_power = float(decay_power)
        self.iterations = 0
        self.epochs = 0

        self.activations = activations
        self.activation_func = []
        self.activation_func_diff = []
        for f in activations:
            if f == "sigmoid":
                self.activation_func.append(np.vectorize(self.sigmoid))
                self.activation_func_diff.append(np.vectorize(self.sigmoid_diff))
            elif f == "identity":
                self.activation_func.append(np.vectorize(self.identity))
                self.activation_func_diff.append(np.vectorize(self.identity_diff))
            elif f == "relu":
                self.activation_func.append(np.vectorize(self.relu))
                self.activation_func_diff.append(np.vectorize(self.relu_diff))
            else:
                raise Exception("unknown activation function {:s}".format(f))

        # small random initial weights and biases
        self.weights = [np.mat([0])] * self.depth
        self.biases = [np.mat([0])] * self.depth
        self.acc_weights_delta = [np.mat([0])] * self.depth  # momentum accumulators
        self.acc_biases_delta = [np.mat([0])] * self.depth

        self.weights[0] = np.mat(np.random.random((shape[0], input_shape)) / 100)
        self.biases[0] = np.mat(np.random.random((shape[0], 1)) / 100)
        for idx in np.arange(1, len(shape)):
            self.weights[idx] = np.mat(np.random.random((shape[idx], shape[idx - 1])) / 100)
            self.biases[idx] = np.mat(np.random.random((shape[idx], 1)) / 100)

    def compute(self, x):
        # forward pass; x holds one sample per column
        result = x
        for idx in np.arange(0, self.depth):
            self.outputs[idx] = result
            al = self.weights[idx] * result + self.biases[idx]
            self.activity_levels[idx] = al
            result = self.activation_func[idx](al)
        self.outputs[self.depth] = result
        return self.softmax(result) if self.is_softmax else result

    def predict(self, x):
        return self.compute(np.mat(x).T).T.A

    def bp(self, d):
        # backward pass: propagate the output error d through the layers
        tmp = d.T
        for idx in np.arange(0, self.depth)[::-1]:
            delta = np.multiply(tmp, self.activation_func_diff[idx](self.activity_levels[idx]).T)
            self.deltas[idx] = delta
            tmp = delta * self.weights[idx]

    def update(self):
        # learning-rate decay: eta / iterations ** decay_power
        self.effective_eta = self.eta / np.power(self.iterations, self.decay_power)

        for idx in np.arange(0, self.depth):
            # current gradient, averaged over the mini-batch, plus L2 regularization
            weights_grad = -self.deltas[idx].T * self.outputs[idx].T / self.deltas[idx].shape[0] + \
                           self.regularization * self.weights[idx]
            biases_grad = -np.mean(self.deltas[idx].T, axis=1) + self.regularization * self.biases[idx]

            # accumulated delta (momentum)
            self.acc_weights_delta[idx] = self.acc_weights_delta[idx] * self.momentum \
                                          - self.effective_eta * weights_grad
            self.acc_biases_delta[idx] = self.acc_biases_delta[idx] * self.momentum \
                                         - self.effective_eta * biases_grad

            self.weights[idx] = self.weights[idx] + self.acc_weights_delta[idx]
            self.biases[idx] = self.biases[idx] + self.acc_biases_delta[idx]

    def fit(self, x, y):
        x = np.mat(x)
        y = np.mat(y)
        loss = []
        self.iterations = 0
        self.epochs = 0
        start = 0
        train_set_size = x.shape[0]

        while True:
            end = start + self.minibatch_size
            minibatch_x = x[start:end].T
            minibatch_y = y[start:end].T

            yp = self.compute(minibatch_x)
            d = minibatch_y - yp

            if self.is_softmax:
                # cross-entropy loss; the small constant guards against log(0)
                loss.append(np.mean(-np.sum(np.multiply(minibatch_y, np.log(yp + 1e-12)), axis=0)))
            else:
                loss.append(np.mean(np.sqrt(np.sum(np.power(d, 2), axis=0))))

            self.iterations += 1
            start = (start + self.minibatch_size) % train_set_size

            if self.iterations % train_set_size == 0:
                self.epochs += 1
                mean_e = np.mean(loss)
                loss = []
                if self.verbose:
                    print("epoch: {:d}. mean loss: {:.6f}. learning rate: {:.8f}".format(self.epochs, mean_e,
                                                                                         self.effective_eta))
                if self.epochs >= self.max_epochs or mean_e < self.threshold:
                    break

            self.bp(d)
            self.update()

    @staticmethod
    def sigmoid(x):
        # the exponent is clipped to avoid overflow for large negative x
        return 1.0 / (1.0 + np.power(np.e, min(-x, 1e2)))

    @staticmethod
    def sigmoid_diff(x):
        return np.power(np.e, min(-x, 1e2)) / (1.0 + np.power(np.e, min(-x, 1e2))) ** 2

    @staticmethod
    def relu(x):
        return x if x > 0 else 0.0

    @staticmethod
    def relu_diff(x):
        return 1.0 if x > 0 else 0.0

    @staticmethod
    def identity(x):
        return x

    @staticmethod
    def identity_diff(x):
        return 1.0

    @staticmethod
    def softmax(x):
        # clip to avoid overflow in exp
        x[x > 1e2] = 1e2
        ep = np.power(np.e, x)
        return ep / np.sum(ep, axis=0)
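As a quick sanity check of the class (a hypothetical toy run, not from the article), it can be trained on a simple function:

# Hypothetical usage example: fit y = x1 + x2 with a tiny network.
import numpy as np

np.random.seed(0)
X = np.random.uniform(-1.0, 1.0, size=(100, 2))
y = (X[:, 0] + X[:, 1]).reshape((100, 1))

net = DNN(input_shape=2, shape=[4, 1], activations=["sigmoid", "identity"],
          eta=0.5, max_epochs=100, minibatch_size=10)
net.fit(X, y)
print(net.predict([[0.5, 0.5]]))  # expected to be close to 1.0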
Let's test the network's fitting capacity by using it to fit the following three functions (as defined in the test code below):

f1(x1, x2) = x1^2 + x2^2
f2(x1, x2) = x1^2 - x2^2
f3(x1, x2) = cos(1.2 x1) cos(1.2 x2)
For each function, 100 data points are generated at randomly drawn inputs. The network takes a 2-dimensional input and produces a 1-dimensional output. Three networks are trained per function, each with 1 hidden layer and 1 output layer; the hidden layers have 3, 5, and 8 neurons respectively. The hidden-layer activation is sigmoid; the output-layer activation is the identity function f(x) = x. Initial learning rate: 0.4; decay exponent: 0.2; momentum: 0.6; 200 training epochs; mini-batch size: 40; L2 regularization with strength 0.0001. The fitting results are shown below:
Figure 4.1 Fitting the three functions
The test code is as follows:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mentat.classification_model import DNN
from mpl_toolkits.mplot3d import Axes3D

np.random.seed(42)

hidden_layer_size = [3, 5, 8]  # neurons per hidden layer (every hidden layer uses the same number)
hidden_layers = 1  # number of hidden layers
hidden_layer_activation_func = "sigmoid"  # hidden-layer activation function
learning_rate = 0.4  # initial learning rate
max_epochs = 200  # number of training epochs
regularization_strength = 0.0001  # L2 regularization strength
minibatch_size = 40  # mini-batch size
momentum = 0.6  # momentum coefficient
decay_power = 0.2  # learning-rate decay exponent


def f1(x):
    return (x[:, 0] ** 2 + x[:, 1] ** 2).reshape((len(x), 1))


def f2(x):
    return (x[:, 0] ** 2 - x[:, 1] ** 2).reshape((len(x), 1))


def f3(x):
    return (np.cos(1.2 * x[:, 0]) * np.cos(1.2 * x[:, 1])).reshape((len(x), 1))


funcs = [f1, f2, f3]

X = np.random.uniform(low=-2.0, high=2.0, size=(100, 2))
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02), np.arange(y_min, y_max, .02))

# models
names = ["{:d} neurons per layer".format(hs) for hs in hidden_layer_size]
classifiers = [
    DNN(input_shape=2, shape=[hs] * hidden_layers + [1],
        activations=[hidden_layer_activation_func] * hidden_layers + ["identity"], eta=learning_rate, threshold=0.001,
        softmax=False, max_epochs=max_epochs, regularization=regularization_strength, verbose=True,
        minibatch_size=minibatch_size, momentum=momentum, decay_power=decay_power)
    for hs in hidden_layer_size
]

figure = plt.figure(figsize=(5 * len(classifiers) + 2, 4 * len(funcs)))
cm = plt.cm.PuOr
cm_bright = ListedColormap(["#DB9019", "#00343F"])
i = 1

for cnt, f in enumerate(funcs):
    zz = f(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    z = f(X)

    # first column: the target surface and the sampled data points
    ax = figure.add_subplot(len(funcs), len(classifiers) + 1, i, projection="3d")
    if cnt == 0:
        ax.set_title("data")
    ax.plot_surface(xx, yy, zz, rstride=1, cstride=1, alpha=0.6, cmap=cm)
    ax.contourf(xx, yy, zz, zdir='z', offset=zz.min(), alpha=0.6, cmap=cm)
    ax.scatter(X[:, 0], X[:, 1], z.ravel(), cmap=cm_bright, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_zlim(zz.min(), zz.max())
    i += 1

    # remaining columns: each network's fitted surface
    for name, clf in zip(names, classifiers):
        print("model: {:s} training.".format(name))
        clf.fit(X, z)
        predict = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

        ax = figure.add_subplot(len(funcs), len(classifiers) + 1, i, projection="3d")
        if cnt == 0:
            ax.set_title(name)
        ax.plot_surface(xx, yy, predict, rstride=1, cstride=1, alpha=0.6, cmap=cm)
        ax.contourf(xx, yy, predict, zdir='z', offset=zz.min(), alpha=0.6, cmap=cm)
        ax.scatter(X[:, 0], X[:, 1], z.ravel(), cmap=cm_bright, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_zlim(zz.min(), zz.max())
        i += 1
        print("model: {:s} train finished.".format(name))

plt.tight_layout()
plt.savefig(
    "pic/dnn_fitting_{:d}_{:.6f}_{:d}_{:.6f}_{:.3f}_{:.3f}.png".format(hidden_layers, learning_rate, max_epochs,
                                                                       regularization_strength, momentum, decay_power))