From 439062d609c69f4fe3bfb6389069ecb95c5b7a46 Mon Sep 17 00:00:00 2001 From: ")s" Date: Mon, 28 Oct 2024 19:38:45 +0800 Subject: [PATCH] Update chapter8.md to v2 --- docs/chapter8/chapter8.md | 198 -------------------------------------- 1 file changed, 198 deletions(-) diff --git a/docs/chapter8/chapter8.md b/docs/chapter8/chapter8.md index a55cd5e..966bb93 100644 --- a/docs/chapter8/chapter8.md +++ b/docs/chapter8/chapter8.md @@ -40,19 +40,16 @@ $h_i(\boldsymbol{x}) \in\{-1,+1\}$ $$ - \begin{aligned} \mathrm{X}&=\sum_{i=1}^{\mathrm{T}} x_i\\ \mathbb{E}(X)&=\sum_{i=1}^{\mathrm{T}}\mathbb{E}(x_i)=(1-\epsilon)T \end{aligned} - $$ 证明过程如下: $$ - \begin{aligned} P(H(x) \neq f(x))=& P(X \leq\lfloor T / 2\rfloor) \\ & \leqslant P(X \leq T / 2) \\ & =P\left[X-(1-\epsilon) T \leqslant \frac{T}{2}-(1-\epsilon) T\right] \\ & =P\left[X- @@ -62,16 +59,13 @@ $$ \\ &=P\left[\frac{1}{\mathrm{T}}\sum_{i=1}^{\mathrm{T}} x_i-\frac{1}{\mathrm{T}} \sum_{i=1}^{\mathrm{T}}\mathbb{E}(x_i) \leqslant -\frac{1}{2}\left(1-2\epsilon\right)\right] \end{aligned} - $$ 根据Hoeffding不等式知 $$ - P\left(\frac{1}{m} \sum_{i=1}^{m} x_{i}-\frac{1}{m} \sum_{i=1}^{m} \mathbb{E}\left(x_{i}\right) \leqslant -\delta\right) \leqslant \exp \left(-2 m \delta^{2}\right) - $$ @@ -79,9 +73,7 @@ $$ $$ - \begin{aligned} P(H(\boldsymbol{x}) \neq f(\boldsymbol{x})) &=\sum_{k=0}^{\lfloor T / 2\rfloor} \left( \begin{array}{c}{T} \\ {k}\end{array}\right)(1-\epsilon)^{k} \epsilon^{T-k} \\ & \leqslant \exp \left(-\frac{1}{2} T(1-2 \epsilon)^{2}\right) \end{aligned} - $$ @@ -110,13 +102,11 @@ $$ $\mathcal{D}(\boldsymbol{x})$, 则式(8.5)可写为: $$ - \begin{aligned} \ell_{\exp }(H \mid \mathcal{D}) & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H(\boldsymbol{x})}\right] \\ & =\sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x}) e^{-f(\boldsymbol{x}) H(\boldsymbol{x})} \\ & =\sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x})\left(e^{-H(\boldsymbol{x})} 
\mathbb{I}(f(\boldsymbol{x})=1)+e^{H(\boldsymbol{x})} \mathbb{I}(f(\boldsymbol{x})=-1)\right) \end{aligned} - $$ @@ -127,9 +117,7 @@ $\mathcal{D}(\boldsymbol{x})=\frac{1}{|D|}$, 其中 $|D|$ 为数据集 $D$ $$ - \ell_{\exp }(H \mid \mathcal{D})=\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H(\boldsymbol{x})}\right]=\frac{1}{|D|} \sum_{\boldsymbol{x} \in D} e^{-f(\boldsymbol{x}) H(\boldsymbol{x})} - $$ @@ -142,7 +130,6 @@ $$ $$ - \begin{aligned} \ell_{\exp }(H | \mathcal{D}) &=\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H(\boldsymbol{x})}\right] \\ &=\sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x}) e^{-f(\boldsymbol{x}) H(\boldsymbol{x})} \\ @@ -150,7 +137,6 @@ $$ &=\sum_{i=1}^{|D|} \left(e^{-H\left(\boldsymbol{x}_{i}\right)} \mathcal{D}\left(\boldsymbol{x}_{i}\right)\mathbb{I}\left(f\left(\boldsymbol{x}_{i}\right)=1\right)+e^{H\left(\boldsymbol{x}_{i}\right)} \mathcal{D}\left(\boldsymbol{x}_{i}\right)\mathbb{I}\left(f\left(\boldsymbol{x}_{i}\right)=-1\right)\right)\\ &=\sum_{i=1}^{|D|} \left(e^{-H\left(\boldsymbol{x}_{i}\right)} P\left(f\left(\boldsymbol{x}_{i}\right)=1 \mid \boldsymbol{x}_{i}\right)+e^{H\left(\boldsymbol{x}_{i}\right)} P\left(f\left(\boldsymbol{x}_{i}\right)=-1 \mid \boldsymbol{x}_{i}\right)\right) \end{aligned} - $$ @@ -161,9 +147,7 @@ $\mathcal{D}(x_i)$表示在数据集$D$中进行一次随机抽样,样本$x_i$ $$ - \frac{\partial e^{-H(\boldsymbol{x})}}{\partial H(\boldsymbol{x})}=-e^{-H(\boldsymbol{x})}\qquad \frac{\partial e^{H(\boldsymbol{x})}}{\partial H(\boldsymbol{x})}=e^{H(\boldsymbol{x})} - $$ @@ -171,9 +155,7 @@ $$ $$ - \frac{\partial \ell_{\exp }(H | \mathcal{D})}{\partial H(\boldsymbol{x})}=-e^{-H(\boldsymbol{x})} P(f(\boldsymbol{x})=1 | \boldsymbol{x})+e^{H(\boldsymbol{x})} P(f(\boldsymbol{x})=-1 | \boldsymbol{x}) - $$ @@ -184,9 +166,7 @@ $$ $$ - \quad-e^{-H(\boldsymbol{x})} P(f(\boldsymbol{x})=1 \mid \boldsymbol{x})+e^{H(\boldsymbol{x})} P(f(\boldsymbol{x})=-1 \mid \boldsymbol{x})=0 - $$ @@ -194,9 +174,7 @@ $$ $$ - 
\quad e^{H(\boldsymbol{x})} P(f(\boldsymbol{x})=-1 \mid \boldsymbol{x})=e^{-H(\boldsymbol{x})} P(f(\boldsymbol{x})=1 \mid \boldsymbol{x}) - $$ @@ -205,9 +183,7 @@ $\frac{e^{H(\boldsymbol{x})}}{P(f(\boldsymbol{x})=-1 \mid \boldsymbol{x})}$: $$ - \quad e^{2 H(\boldsymbol{x})}=\frac{P(f(\boldsymbol{x})=1 \mid \boldsymbol{x})}{P(f(\boldsymbol{x})=-1 \mid \boldsymbol{x})} - $$ @@ -215,9 +191,7 @@ $$ $$ - \quad 2 H(\boldsymbol{x})=\ln \frac{P(f(\boldsymbol{x})=1 \mid \boldsymbol{x})}{P(f(\boldsymbol{x})=-1 \mid \boldsymbol{x})} - $$ @@ -228,13 +202,11 @@ $$ $$ - \begin{aligned} \operatorname{sign}(H(\boldsymbol{x}))&=\operatorname{sign}\left(\frac{1}{2} \ln \frac{P(f(x)=1 | \boldsymbol{x})}{P(f(x)=-1 | \boldsymbol{x})}\right) \\ & =\left\{\begin{array}{ll}{1,} & {P(f(x)=1 | \boldsymbol{x})>P(f(x)=-1 | \boldsymbol{x})} \\ {-1,} & {P(f(x)=1 | \boldsymbol{x})0.5 \sum_{k=1}^{N} \sum_{i=1}^{T} h_{i}^{k}(\boldsymbol{x})} \\ {\text { reject, }} & {\text { otherwise. }} \end{array}\right. - $$ @@ -1008,9 +904,7 @@ $$ $$ - H(\boldsymbol{x})=c_{\underset{j}{ \arg \max} \sum_{i=1}^{T} h_{i}^{j}(\boldsymbol{x})} - $$ @@ -1021,9 +915,7 @@ $$ $$ - H(\boldsymbol{x})=c_{\underset{j}{ \arg \max} \sum_{i=1}^{T} w_i h_{i}^{j}(\boldsymbol{x})} - $$ @@ -1059,9 +951,7 @@ $$ $$ - A\left(h_{i} | \boldsymbol{x}\right)=\left(h_{i}(\boldsymbol{x})-H(\boldsymbol{x})\right)^{2} - $$ @@ -1072,12 +962,10 @@ $$ $$ - \begin{aligned} \bar{A}(h | \boldsymbol{x}) &=\sum_{i=1}^{T} w_{i} A\left(h_{i} | \boldsymbol{x}\right) \\ &=\sum_{i=1}^{T} w_{i}\left(h_{i}(\boldsymbol{x})-H(\boldsymbol{x})\right)^{2} \end{aligned} - $$ @@ -1088,9 +976,7 @@ $$ $$ - E\left(h_{i} | \boldsymbol{x}\right)=\left(f(\boldsymbol{x})-h_{i}(\boldsymbol{x})\right)^{2} - $$ @@ -1101,9 +987,7 @@ $$ $$ - E(H | \boldsymbol{x})=(f(\boldsymbol{x})-H(\boldsymbol{x}))^{2} - $$ @@ -1114,34 +998,28 @@ $$ 由(8.28)知 $$ - \begin{aligned} \bar{A}(h | \boldsymbol{x})&=\sum_{i=1}^{T} w_{i}\left(h_{i}(\boldsymbol{x})-H(\boldsymbol{x})\right)^{2}\\ 
&=\sum_{i=1}^{T} w_{i}(h_i(\boldsymbol{x})^2-2h_i(\boldsymbol{x})H(\boldsymbol{x})+H(\boldsymbol{x})^2)\\ &=\sum_{i=1}^{T} w_{i}h_i(\boldsymbol{x})^2-H(\boldsymbol{x})^2 \end{aligned} - $$ 又因为 $$ - \begin{aligned} & \sum_{i=1}^{T} w_{i} E\left(h_{i} | \boldsymbol{x}\right)-E(H | \boldsymbol{x})\\ &=\sum_{i=1}^{T} w_{i}\left(f(\boldsymbol{x})-h_{i}(\boldsymbol{x})\right)^{2}-(f(\boldsymbol{x})-H(\boldsymbol{x}))^{2}\\ &=\sum_{i=1}^{T} w_{i}h_i(\boldsymbol{x})^2-H(\boldsymbol{x})^{2} \end{aligned} - $$ 所以 $$ - \bar{A}(h | \boldsymbol{x}) =\sum_{i=1}^{T} w_{i} E\left(h_{i} | \boldsymbol{x}\right)-E(H | \boldsymbol{x}) - $$ @@ -1151,9 +1029,7 @@ $$ $$ - \sum_{i=1}^{T} w_{i} \int A\left(h_{i} | \boldsymbol{x}\right) p(\boldsymbol{x}) d \boldsymbol{x}=\sum_{i=1}^{T} w_{i} \int E\left(h_{i} | \boldsymbol{x}\right) p(\boldsymbol{x}) d \boldsymbol{x}-\int E(H | \boldsymbol{x}) p(\boldsymbol{x}) d \boldsymbol{x} - $$ @@ -1177,9 +1053,7 @@ $A_i=\sum_{\boldsymbol{x} \in D} A\left(h_i \mid \boldsymbol{x}\right) p_{\bolds $$ - E_{i}=\int E\left(h_{i} | \boldsymbol{x}\right) p(\boldsymbol{x}) d \boldsymbol{x} - $$ @@ -1190,9 +1064,7 @@ $$ $$ - A_{i}=\int A\left(h_{i} | \boldsymbol{x}\right) p(\boldsymbol{x}) d \boldsymbol{x} - $$ @@ -1203,9 +1075,7 @@ $$ $$ - E=\int E(H | \boldsymbol{x}) p(\boldsymbol{x}) d \boldsymbol{x} - $$ @@ -1216,9 +1086,7 @@ $$ $$ - E=\bar{E}-\bar{A} - $$ @@ -1244,9 +1112,7 @@ $h_i$ 与 $h_j$ 对数据集 $D$ 所有样本预测结果均相同, 此时 $p_1= $$ - p_2=\frac{a+b}{m} \cdot \frac{a+c}{m}+\frac{c+d}{m} \cdot \frac{b+d}{m} - $$ @@ -1263,12 +1129,10 @@ $\frac{c+d}{m} \cdot \frac{b+d}{m}$ 与 $\frac{d}{m}$ 的不同: $$ - \begin{aligned} & \frac{a+b}{m} \cdot \frac{a+c}{m}=p\left(h_i=+1\right) p\left(h_j=+1\right), \frac{a}{m}=p\left(h_i=+1, h_j=+1\right) \\ & \frac{c+d}{m} \cdot \frac{b+d}{m}=p\left(h_i=-1\right) p\left(h_j=-1\right), \frac{d}{m}=p\left(h_i=-1, h_j=-1\right) \end{aligned} - $$ 即 $\frac{a+b}{m} \cdot \frac{a+c}{m}$ 和 @@ -1317,9 +1181,7 @@ $\nabla 
f\left(\boldsymbol{x}_k\right)=\left.\frac{\nabla f(\boldsymbol{x})}{\na $$ - f(\boldsymbol{x}) \approx f\left(\boldsymbol{x}_k\right)+\nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}}\left(\boldsymbol{x}-\boldsymbol{x}_k\right) - $$ @@ -1327,9 +1189,7 @@ $$ $$ - f\left(\boldsymbol{x}_k+\Delta \boldsymbol{x}\right) \approx f\left(\boldsymbol{x}_k\right)+\nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \Delta \boldsymbol{x} - $$ @@ -1354,21 +1214,17 @@ $\boldsymbol{d}_k$ 表示往哪个方向改 变 $\boldsymbol{x}$ 函数值下降 $$ - \left(\alpha_k, \boldsymbol{d}_k\right)=\underset{\alpha, \boldsymbol{d}}{\arg \min } \nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \alpha \boldsymbol{d} - $$ 将以上优化问题分为两步求解, 即 $$ - \begin{gathered} \boldsymbol{d}_k=\underset{\boldsymbol{d}}{\arg \min } \nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \boldsymbol{d} \quad \text { s.t. }\|\boldsymbol{d}\|_2=1 \\ \alpha_k=\underset{\alpha}{\arg \min } \nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \boldsymbol{d}_k \alpha \end{gathered} - $$ 以上求解 $\alpha_k$ 的优化问题明显有问题, 因为对于 @@ -1378,9 +1234,7 @@ $\nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \boldsymbol{d}_k<0$ $$ - \alpha_k=\underset{\alpha}{\arg \min } f\left(\boldsymbol{x}_k+\alpha \boldsymbol{d}_k\right) - $$ @@ -1391,9 +1245,7 @@ $\boldsymbol{d}_k$ 和 $\alpha_k$, 与先求 $\boldsymbol{d}_k$ $$ - \left|\nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \boldsymbol{d}_k\right| \leq\left\|\nabla f\left(\boldsymbol{x}_k\right)\right\|_2\left\|\boldsymbol{d}_k\right\|_2 - $$ @@ -1409,9 +1261,7 @@ $f\left(\boldsymbol{x}_k+\alpha \boldsymbol{d}_k\right)$ 对 $\alpha$ $$ - \frac{\partial f\left(\boldsymbol{x}_k+\alpha \boldsymbol{d}_k\right)}{\partial \alpha}=0 - $$ @@ -1419,13 +1269,11 @@ $$ 解: 对 $f(x)$ 在 $x_k=2$ 处进行一阶 Taylor 展开: $$ - \begin{aligned} f(x) & =f\left(x_k\right)+f^{\prime}\left(x_k\right)\left(x-x_k\right) \\ & =x_k^2+2 x_k\left(x-x_k\right) \\ & =x_k^2+2 x_k \alpha d \end{aligned} - $$ 由于此时自变量为一维, 因此只有两个方向可选, 要么正方向, @@ -1435,27 +1283,21 @@ 
$d_k=-\frac{f^{\prime}\left(x_k\right)}{\left|f^{\prime}\left(x_k\right)\right|} $$ - f\left(x_k+\alpha d_k\right)=f(2-\alpha)=(2-\alpha)^2 - $$ 进而有 $$ - \frac{\partial f\left(x_k+\alpha d_k\right)}{\partial \alpha}=-2(2-\alpha) - $$ 令导数等于 0 , 得 $\alpha_k=2$ 。此时 $$ - \Delta x=\alpha_k d_k=-2 - $$ 则 @@ -1469,13 +1311,11 @@ $\boldsymbol{x}_k=\left[x_k^1, x_k^2\right]^{\mathrm{T}}=[3,4]^{\mathrm{T}}$ 处进行一阶 Taylor 展开: $$ - \begin{aligned} f(\boldsymbol{x}) & =f\left(\boldsymbol{x}_k\right)+\nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}}\left(\boldsymbol{x}-\boldsymbol{x}_k\right) \\ & =\|\boldsymbol{x}_k\|_2^2+2 \boldsymbol{x}_k^{\mathrm{T}}\left(\boldsymbol{x}-\boldsymbol{x}_k\right) \\ & =\|\boldsymbol{x}_k\|_2^2+2 \boldsymbol{x}_k^{\mathrm{T}} \alpha \boldsymbol{d} \end{aligned} - $$ 此时 @@ -1485,13 +1325,11 @@ $\boldsymbol{d}_k=-\frac{\nabla f\left(\boldsymbol{x}_k\right)}{\left\|\nabla f\ $$ - \begin{aligned} f\left(\boldsymbol{x}_k+\alpha \boldsymbol{d}_k\right) & =(3-0.6 \alpha)^2+(4-0.8 \alpha)^2 \\ & =\alpha^2-10 \alpha+25 \\ & =(\alpha-5)^2 \end{aligned} - $$ 因此可得 $\alpha_k=5$ (或对 $\alpha$ 求导, 再令导数等于 @@ -1499,9 +1337,7 @@ $$ $$ - \Delta \boldsymbol{x}=\alpha_k \boldsymbol{d}_k=[-3,-4]^{\mathrm{T}} - $$ @@ -1512,9 +1348,7 @@ $\boldsymbol{d}_k$ 时应该求解如下优化问题: $$ - \boldsymbol{d}_k=\underset{\boldsymbol{d}}{\arg \min } \nabla f\left(\boldsymbol{x}_k\right)^{\mathrm{T}} \boldsymbol{d} \text { s.t. 
}\|\boldsymbol{d}\|_2=C - $$ @@ -1524,9 +1358,7 @@ $\alpha_k$ 应该求解如下优化问题: $$ - \alpha_k=\underset{\alpha}{\arg \min } f\left(\boldsymbol{x}_k+\alpha \boldsymbol{d}_k\right) - $$ @@ -1538,9 +1370,7 @@ AdaBoost 第 $t$ 轮迭代时最小化式(8.5)的指数损失函数 $$ - \ell_{\exp }\left(H_t \mid \mathcal{D}\right)=\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H_t(\boldsymbol{x})}\right]=\sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x}) e^{-f(\boldsymbol{x}) H_t(\boldsymbol{x})} - $$ @@ -1548,13 +1378,11 @@ $$ 处泰勒展开 $$ - \begin{aligned} \ell_{\exp }\left(H_t \mid \mathcal{D}\right) & \approx \sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x})\left(e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}-f(\boldsymbol{x}) e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}\left(H_t(\boldsymbol{x})-H_{t-1}(\boldsymbol{x})\right)\right) \\ & =\sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x})\left(e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}-e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) \alpha_t h_t(\boldsymbol{x})\right) \\ & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}-e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) \alpha_t h_t(\boldsymbol{x})\right] \end{aligned} - $$ 其中 $H_t=H_{t-1}+\alpha_t h_t$ 。注意: $\alpha_t, h_t$ @@ -1563,9 +1391,7 @@ $H_t(\boldsymbol{x})$, 在 $H_{t-1}$ 处一阶导数为 $$ - \left.\frac{\partial e^{-f(\boldsymbol{x}) H_t(\boldsymbol{x})}}{\partial H_t(\boldsymbol{x})}\right|_{H_t(\boldsymbol{x})=H_{t-1}(\boldsymbol{x})}=-f(\boldsymbol{x}) e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} - $$ @@ -1574,14 +1400,12 @@ $g(z)=e^{-f(\boldsymbol{x}) z}$, 对 $g(z)$ 在 $z_0=H_{t-1}(\boldsymbol{x})$ 处泰勒展开, 得 $$ - \begin{aligned} g(z) & \approx g\left(z_0\right)+g^{\prime}\left(z_0\right)\left(z-z_0\right) \\ & =g\left(z_0\right)-f(\boldsymbol{x}) e^{-f(\boldsymbol{x}) z_0}\left(z-z_0\right) \\ & =e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}-e^{-f(\boldsymbol{x}) 
H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x})\left(H_t(\boldsymbol{x})-H_{t-1}(\boldsymbol{x})\right) \\ & =e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})}-e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) \alpha_t h_t(\boldsymbol{x}) \end{aligned} - $$ @@ -1593,21 +1417,17 @@ $$ $$ - h_t=\underset{h}{\arg \min } \sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x})\left(-e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) h(\boldsymbol{x})\right) \quad \text { s.t. } h(\boldsymbol{x}) \in\{-1,+1\} - $$ 将负号去掉, 最小化变为最大化问题 $$ - \begin{aligned} h_t & =\underset{h}{\arg \max } \sum_{\boldsymbol{x} \in D} \mathcal{D}(\boldsymbol{x})\left(e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) h(\boldsymbol{x})\right) \\ & =\underset{h}{\arg \max } \mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[e^{-f(\boldsymbol{x}) H_{t-1}(\boldsymbol{x})} f(\boldsymbol{x}) h(\boldsymbol{x})\right] \quad \text { s.t. } h(\boldsymbol{x}) \in\{-1,+1\} \end{aligned} - $$ 这就是式(8.14)的第 3 个等号的结果, @@ -1624,9 +1444,7 @@ $\ell_{\exp }\left(H_{t-1}+\alpha h_t \mid \mathcal{D}\right)$ $$ - \alpha_t=\underset{\alpha}{\arg \min } \ell_{\exp }\left(H_{t-1}+\alpha h_t \mid \mathcal{D}\right) - $$ @@ -1635,35 +1453,29 @@ $\ell_{\exp }\left(H_{t-1}+\alpha h_t \mid \mathcal{D}\right)$ 求导, 得 $$ - \begin{aligned} \frac{\partial \ell_{\exp }\left(H_{t-1}+\alpha h_t \mid \mathcal{D}\right)}{\partial \alpha} & =\frac{\partial\left(e^{-\alpha} \sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right)+\left(e^\alpha-e^{-\alpha}\right) \sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right) \mathbb{I}\left(f\left(\boldsymbol{x}_i\right) \neq h\left(\boldsymbol{x}_i\right)\right)\right)}{\partial \alpha} \\ & =-e^{-\alpha} \sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right)+\left(e^\alpha+e^{-\alpha}\right) \sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right) \mathbb{I}\left(f\left(\boldsymbol{x}_i\right) \neq 
h\left(\boldsymbol{x}_i\right)\right) \end{aligned} - $$ 令导数等于零, 得 $$ - \begin{aligned} \frac{e^{-\alpha}}{e^\alpha+e^{-\alpha}} & =\frac{\sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right) \mathbb{I}\left(f\left(\boldsymbol{x}_i\right) \neq h\left(\boldsymbol{x}_i\right)\right)}{\sum_{i=1}^{|D|} \mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right)}=\sum_{i=1}^{|D|} \frac{\mathcal{D}_t^{\prime}\left(\boldsymbol{x}_i\right)}{Z_t} \mathbb{I}\left(f\left(\boldsymbol{x}_i\right) \neq h\left(\boldsymbol{x}_i\right)\right) \\ & =\sum_{i=1}^{|D|} \mathcal{D}_t\left(\boldsymbol{x}_i\right) \mathbb{I}\left(f\left(\boldsymbol{x}_i\right) \neq h\left(\boldsymbol{x}_i\right)\right)=\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}_t}\left[\mathbb{I}\left(f\left(\boldsymbol{x}\right) \neq h\left(\boldsymbol{x}\right)\right)\right] \\ & =\epsilon_t \end{aligned} - $$ 对上述等式化简, 得 $$ - \begin{aligned} \frac{e^{-\alpha}}{e^\alpha+e^{-\alpha}}=\frac{1}{e^{2 \alpha}+1} & \Rightarrow e^{2 \alpha}+1=\frac{1}{\epsilon_t} \Rightarrow e^{2 \alpha}=\frac{1-\epsilon_t}{\epsilon_t} \Rightarrow 2 \alpha=\ln \left(\frac{1-\epsilon_t}{\epsilon_t}\right) \\ & \Rightarrow \alpha_t=\frac{1}{2} \ln \left(\frac{1-\epsilon_t}{\epsilon_t}\right) \end{aligned} - $$ 即式(8.11)。 通过以上推导可以发现: AdaBoost @@ -1678,12 +1490,10 @@ $h_t(\boldsymbol{x}) \in\{-1,+1\}$)** $$ - \begin{aligned} \ell\left(H_t \mid \mathcal{D}\right) & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)\right] \\ & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_{t-1}(\boldsymbol{x})+\alpha_t h_t(\boldsymbol{x}), f(\boldsymbol{x})\right)\right] \end{aligned} - $$ 问题时, $f(\boldsymbol{x}) \in \mathbb{R}$, @@ -1693,9 +1503,7 @@ $\operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)=\left(H_t $$ - H(\boldsymbol{x})=\sum_{t=1}^T \alpha_t h_t(\boldsymbol{x}) - $$ 类似于 @@ -1704,13 +1512,11 @@ AdaBoost, 第 $t$ 轮得到 
$\alpha_t, h_t(\boldsymbol{x})$, $$ - \begin{aligned} \ell\left(H_t \mid \mathcal{D}\right) & \approx \mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_{t-1}(\boldsymbol{x}), f(\boldsymbol{x})\right)+\left.\frac{\partial \operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)}{\partial H_t(\boldsymbol{x})}\right|_{H_t(\boldsymbol{x})=H_{t-1}(\boldsymbol{x})}\left(H_t(\boldsymbol{x})-H_{t-1}(\boldsymbol{x})\right)\right] \\ & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_{t-1}(\boldsymbol{x}), f(\boldsymbol{x})\right)+\left.\frac{\partial \operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)}{\partial H_t(\boldsymbol{x})}\right|_{H_t(\boldsymbol{x})=H_{t-1}(\boldsymbol{x})} \alpha_t h_t(\boldsymbol{x})\right] \\ & =\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_{t-1}(\boldsymbol{x}), f(\boldsymbol{x})\right)\right]+\mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\left.\frac{\partial \operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)}{\partial H_t(\boldsymbol{x})}\right|_{H_t(\boldsymbol{x})=H_{t-1}(\boldsymbol{x})} \alpha_t h_t(\boldsymbol{x})\right] \end{aligned} - $$ 注意, 在上式展开中的变量为 $H_t(\boldsymbol{x})$, 且有 @@ -1724,9 +1530,7 @@ $h_t(\boldsymbol{x})$ : $$ - h_t(\boldsymbol{x})=\underset{h}{\arg \min } \mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\left.\frac{\partial \operatorname{err}\left(H_t(\boldsymbol{x}), f(\boldsymbol{x})\right)}{\partial H_t(\boldsymbol{x})}\right|_{H_t(\boldsymbol{x})=H_{t-1}(\boldsymbol{x})} h(\boldsymbol{x})\right] \quad \text { s.t. constraints for } h(\boldsymbol{x}) - $$ @@ -1734,9 +1538,7 @@ $$ $$ - \alpha_t=\underset{\alpha}{\arg \min } \mathbb{E}_{\boldsymbol{x} \sim \mathcal{D}}\left[\operatorname{err}\left(H_{t-1}(\boldsymbol{x})+\alpha h_t(\boldsymbol{x}), f(\boldsymbol{x})\right)\right] - $$