diff --git a/common.tex b/common.tex index 00c430a..d1633a6 100644 --- a/common.tex +++ b/common.tex @@ -18,6 +18,7 @@ \DeclareMathOperator*{\dom}{dom} \DeclareMathOperator*{\range}{range} \DeclareMathOperator*{\diag}{diag} +\DeclareMathOperator*{\Null}{null} \newcommand{\C}{\mathbb{C}} \newcommand{\F}{\mathbb{F}} \newcommand{\N}{\mathbb{N}} @@ -40,12 +41,14 @@ \renewcommand{\vec}[1]{\mathbf{#1}} \newcommand{\mat}[1]{\mathbf{#1}} \newcommand{\matlit}[1]{\begin{bmatrix}#1\end{bmatrix}} -\newcommand{\tran}{^\top} +\newcommand{\tran}{^{\!\top\!}} \newcommand{\inv}{^{-1}} \newcommand{\halfpow}{^{\frac{1}{2}}} \newcommand{\neghalfpow}{^{-\frac{1}{2}}} \renewcommand{\angle}[1]{\langle #1 \rangle} +\newcommand{\bigangle}[1]{\left\langle #1 \right\rangle} \newcommand{\inner}[2]{\angle{#1, #2}} +\newcommand{\biginner}[2]{\bigangle{#1, #2}} \renewcommand{\P}{\mathbb{P}} \newcommand{\pr}[1]{\P(#1)} \newcommand{\prbig}[1]{\P\big(#1\big)} @@ -66,6 +69,7 @@ \newcommand{\tab}{\hspace{0.5cm}} \renewcommand{\a}{\vec{a}} \renewcommand{\b}{\vec{b}} +\newcommand{\e}{\vec{e}} \newcommand{\g}{\vec{g}} \newcommand{\h}{\vec{h}} \renewcommand{\o}{\vec{o}} diff --git a/cs189-calculus-optimization.tex b/cs189-calculus-optimization.tex index 020ea68..dc8e635 100644 --- a/cs189-calculus-optimization.tex +++ b/cs189-calculus-optimization.tex @@ -8,12 +8,14 @@ \subsection{Extrema} Otherwise the problem is \term{constrained} and may be much harder to solve, depending on the nature of the feasible set. Suppose $f : \R^d \to \R$. -A point $\x$ is said to be a \term{local minimum} (resp. \term{local maximum}) of $f$ in $\calX$ if $f(\x) \leq f(\y)$ (resp. $f(\x) \geq f(\y)$) for all $\y$ in some neighborhood $\calN \subseteq \calX$ that contains $\x$. +A point $\x$ is said to be a \term{local minimum} (resp. \term{local maximum}) of $f$ in $\calX$ if $f(\x) \leq f(\y)$ (resp. $f(\x) \geq f(\y)$) for all $\y$ in some neighborhood $N \subseteq \calX$ about $\x$.\footnote{ + A \textbf{neighborhood} about $\x$ is an open set which contains $\x$. +} Furthermore, if $f(\x) \leq f(\y)$ for all $\y \in \calX$, then $\x$ is a \term{global minimum} of $f$ in $\calX$ (similarly for global maximum). If the phrase ``in $\calX$'' is unclear from context, assume we are optimizing over the whole domain of the function. The qualifier \term{strict} (as in e.g. a strict local minimum) means that the inequality sign in the definition is actually a $>$ or $<$, with equality not allowed. -This indicates that the extremum is unique. +This indicates that the extremum is unique within some neighborhood. Observe that maximizing a function $f$ is equivalent to minimizing $-f$, so optimization problems are typically phrased in terms of minimization without loss of generality. This convention (which we follow here) eliminates the need to discuss minimization and maximization separately. @@ -42,9 +44,9 @@ \subsection{The Jacobian} \subsection{The Hessian} The \term{Hessian} matrix of $f : \R^d \to \R$ is a matrix of second-order partial derivatives: \[\nabla^2 f = \matlit{ - \pdv[2]{f}{x_1} & \hdots & \pdv{f}{x_1}{x_n} \\ + \pdv[2]{f}{x_1} & \hdots & \pdv{f}{x_1}{x_d} \\ \vdots & \ddots & \vdots \\ - \pdv{f}{x_n}{x_1} & \hdots & \pdv[2]{f}{x_n}} + \pdv{f}{x_d}{x_1} & \hdots & \pdv[2]{f}{x_d}} \tab\text{i.e.}\tab [\nabla^2 f]_{ij} = {\pdv{f}{x_i}{x_j}}\] Recall that if the partial derivatives are continuous, the order of differentiation can be interchanged (Clairaut's theorem), so the Hessian matrix will be symmetric. 
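+As a quick concrete illustration of these definitions, take $f : \R^2 \to \R$ given by $f(\x) = x_1^2 + x_1x_2 + x_2^2$; then +\[\nabla f(\x) = \matlit{2x_1 + x_2 \\ x_1 + 2x_2}, \tab \nabla^2 f(\x) = \matlit{2 & 1 \\ 1 & 2}\] +and the Hessian is indeed symmetric, in accordance with Clairaut's theorem.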
@@ -91,7 +93,7 @@ \subsection{Taylor's theorem} Then there exists $t \in (0,1)$ such that \[f(\x + \h) = f(\x) + \nabla f(\x + t\h)\tran\h\] Furthermore, if $f$ is twice continuously differentiable, then -\[\nabla f(\x + \h) = \nabla f(\x) + \int_0^1 \nabla^2 f(\x + t\h)\h \dif{t}\] +\[\nabla f(\x + \h) = \nabla f(\x) + \int_0^1 \nabla^2 f(\x + t\h)\h \dd{t}\] and there exists $t \in (0,1)$ such that \[f(\x + \h) = f(\x) + \nabla f(\x)\tran\h + \frac{1}{2}\h\tran\nabla^2f(\x+t\h)\h\] \end{theorem} @@ -150,14 +152,14 @@ \subsection{Conditions for local minima} Furthermore if $\nabla^2 f(\x^*)$ is positive definite, then $\x^*$ is a strict local minimum. \end{proposition} \begin{proof} -Let $\calB$ be an open ball of radius $r > 0$ centered at $\x^*$ which is contained in the neighborhood. +Let $B$ be an open ball of radius $r > 0$ centered at $\x^*$ which is contained in the neighborhood. Applying Taylor's theorem, we have that for any $\h$ with $\|\h\|_2 < r$, there exists $t \in (0,1)$ such that \[f(\x^* + \h) = f(\x^*) + \underbrace{\h\tran\nabla f(\x^*)}_0 + \frac{1}{2}\h\tran\nabla^2 f(\x^* + t\h)\h \geq f(\x^*)\] The last inequality holds because $\nabla^2 f(\x^* + t\h)$ is positive semi-definite (since $\|t\h\|_2 = t\|\h\|_2 < \|\h\|_2 < r$), so $\h\tran\nabla^2 f(\x^* + t\h)\h \geq 0$. Since $f(\x^*) \leq f(\x^* + \h)$ for all directions $\h$ with $\|\h\|_2 < r$, we conclude that $\x^*$ is a local minimum. Now further suppose that $\nabla^2 f(\x^*)$ is strictly positive definite. -Since the Hessian is continuous we can choose another ball $\calB'$ with radius $r' > 0$ centered at $\x^*$ such that $\nabla^2 f(\x)$ is positive definite for all $\x \in \calB'$. +Since the Hessian is continuous we can choose another ball $B'$ with radius $r' > 0$ centered at $\x^*$ such that $\nabla^2 f(\x)$ is positive definite for all $\x \in B'$. Then following the same argument as above (except with a strict inequality now since the Hessian is positive definite) we have $f(\x^* + \h) > f(\x^*)$ for all $\h$ with $0 < \|\h\|_2 < r'$. Hence $\x^*$ is a strict local minimum. \end{proof} @@ -173,3 +175,88 @@ \subsection{Conditions for local minima} \subsection{Convexity} \input{cs189-convexity.tex} + +\subsection{Orthogonal projections} +We now consider a special kind of optimization problem that is particularly well-understood and can often be solved in closed form: given some point $\x$ in an inner product space $V$, find the closest point to $\x$ in a subspace $S$ of $V$. +This process is referred to as \term{projection onto a subspace}. + +The following diagram should make it geometrically clear that, at least in Euclidean space, the solution is intimately related to orthogonality and the Pythagorean theorem: +\begin{center} +\includegraphics[width=0.5\linewidth]{orthogonal-projection} +\end{center} +Here $\y$ is an arbitrary element of the subspace $S$, and $\y^*$ is the point in $S$ such that $\x-\y^*$ is perpendicular to $S$. +The hypotenuse of a right triangle (in this case $\|\x-\y\|$) is always longer than either of the legs (in this case $\|\x-\y^*\|$ and $\|\y^*-\y\|$), and when $\y \neq \y^*$ there always exists such a triangle between $\x$, $\y$, and $\y^*$. + +Our intuition from Euclidean space suggests that the closest point to $\x$ in $S$ has the perpendicularity property described above, and we now show that this is indeed the case. +\begin{proposition} +Suppose $\x \in V$ and $\y^* \in S$.
+Then $\y^*$ is the unique minimizer of $\|\x-\y\|$ over $\y \in S$ if and only if $\x-\y^* \perp S$. +\end{proposition} +\begin{proof} +$(\implies)$ +Suppose $\y^*$ is the unique minimizer of $\|\x-\y\|$ over $\y \in S$. +That is, $\|\x-\y^*\| \leq \|\x-\y\|$ for all $\y \in S$, with equality only if $\y = \y^*$. +Fix $\vec{v} \in S$ and observe that +\begin{align*} +g(t) &:= \|\x-\y^*-t\vec{v}\|^2 \\ +&= \inner{\x-\y^*-t\vec{v}}{\x-\y^*-t\vec{v}} \\ +&= \inner{\x-\y^*}{\x-\y^*} - 2t\inner{\x-\y^*}{\vec{v}} + t^2\inner{\vec{v}}{\vec{v}} \\ +&= \|\x-\y^*\|^2 - 2t\inner{\x-\y^*}{\vec{v}} + t^2\|\vec{v}\|^2 +\end{align*} +must have a minimum at $t = 0$ as a consequence of this assumption. +Thus +\[0 = g'(0) = \left.-2\inner{\x-\y^*}{\vec{v}} + 2t\|\vec{v}\|^2\right|_{t=0} = -2\inner{\x-\y^*}{\vec{v}}\] +giving $\x-\y^* \perp \vec{v}$. +Since $\vec{v}$ was arbitrary in $S$, we have $\x-\y^* \perp S$ as claimed. + +$(\impliedby)$ +Suppose $\x-\y^* \perp S$. +Observe that for any $\y \in S$, $\y^*-\y \in S$ because $\y^* \in S$ and $S$ is closed under subtraction. +Under the hypothesis, $\x-\y^* \perp \y^*-\y$, so by the Pythagorean theorem, +\[\|\x-\y\|^2 = \|\x-\y^*+\y^*-\y\|^2 = \|\x-\y^*\|^2 + \|\y^*-\y\|^2 \geq \|\x - \y^*\|^2\] +and in fact the inequality is strict when $\y \neq \y^*$ since this implies $\|\y^*-\y\|^2 > 0$. +Taking square roots, we conclude that $\|\x-\y\| \geq \|\x-\y^*\|$, with equality only if $\y = \y^*$. +Thus $\y^*$ is the unique minimizer of $\|\x-\y\|$ over $\y \in S$. +\end{proof} +Since a unique minimizer in $S$ can be found for any $\x \in V$, we can define an operator +\[P\x = \argmin_{\y \in S} \|\x-\y\|\] +Observe that $P\y = \y$ for any $\y \in S$, since $\y$ has distance zero from itself and every other point in $S$ has positive distance from $\y$. +Thus $P(P\x) = P\x$ for any $\x$ (i.e., $P^2 = P$) because $P\x \in S$. +The identity $P^2 = P$ is actually one of the defining properties of a \term{projection}, the other being linearity. + +An immediate consequence of the previous result is that $\x - P\x \perp S$ for any $\x \in V$, and conversely that $P$ is the unique operator that satisfies this property for all $\x \in V$. +For this reason, $P$ is known as an \term{orthogonal projection}. + +If we choose an orthonormal basis for the target subspace $S$, it is possible to write down a more specific expression for $P$. +\begin{proposition} +If $\e_1, \dots, \e_m$ is an orthonormal basis for $S$, then +\[P\x = \sum_{i=1}^m \inner{\x}{\e_i}\e_i\] +\end{proposition} +\begin{proof} +Let $\e_1, \dots, \e_m$ be an orthonormal basis for $S$, and suppose $\x \in V$. +Then for all $j = 1, \dots, m$, +\begin{align*} +\biginner{\x-\sum_{i=1}^m \inner{\x}{\e_i}\e_i}{\e_j} &= \inner{\x}{\e_j} - \sum_{i=1}^m \inner{\x}{\e_i}\underbrace{\inner{\e_i}{\e_j}}_{\delta_{ij}} \\ +&= \inner{\x}{\e_j} - \inner{\x}{\e_j} \\ +&= 0 +\end{align*} +We have shown that the claimed expression, call it $\tilde{P}\x$, satisfies $\x - \tilde{P}\x \perp \e_j$ for every element $\e_j$ of the orthonormal basis for $S$. +It follows (by linearity of the inner product) that $\x - \tilde{P}\x \perp S$, so the previous result implies $P = \tilde{P}$. +\end{proof} +The fact that $P$ is a linear operator (and thus a proper projection, as earlier we showed $P^2 = P$) follows readily from this result. + +%Another useful fact about the orthogonal projection operator is that the metric it induces is \term{non-expansive}, i.e. $1$-Lipschitz.
+%\begin{proposition} +%For any $\x \in V$, +%\[\|P\x\| \leq \|\x\|\] +%Thus for any $\x, \xye \in V$, +%\[\|P\x - P\xye\| \leq \|\x-\xye\|\] +%\end{proposition} +%\begin{proof} +%Suppose $\x \in V$. +%Then +%\[\|P\x\|^2 = \inner{P\x}{P\x} = \inner{\x}{P^2\x} = \inner{\x}{P\x} \leq \|\x\|\|P\x\|\] +%using respectively the self-adjointness of $P$, the fact that $P^2 = P$, and the Cauchy-Schwarz inequality. +%If $\|P\x\| = 0$, the inequality holds vacuously; otherwise we can divide both sides by $\|P\x\|$ to obtain $\|P\x\| \leq \|\x\|$. +% +%The second statement follows immediately from the first by linearity of $P$. +%\end{proof} \ No newline at end of file diff --git a/cs189-convexity.tex b/cs189-convexity.tex index f1e1198..914ea9d 100644 --- a/cs189-convexity.tex +++ b/cs189-convexity.tex @@ -93,7 +93,7 @@ \subsubsection{Consequences of convexity} \end{proposition} \begin{proof} Suppose $f$ is convex, and let $\x^*$ be a local minimum of $f$ in $\calX$. -Then for some neighborhood $\calN \subseteq \calX$ about $\x^*$, we have $f(\x) \geq f(\x^*)$ for all $\x \in \calN$. +Then for some neighborhood $N \subseteq \calX$ about $\x^*$, we have $f(\x) \geq f(\x^*)$ for all $\x \in N$. Suppose towards a contradiction that there exists $\xye \in \calX$ such that $f(\xye) < f(\x^*)$. Consider the line segment $\x(t) = t\x^* + (1-t)\xye, ~ t \in [0,1]$, noting that $\x(t) \in \calX$ by the convexity of $\calX$. @@ -101,7 +101,7 @@ \subsubsection{Consequences of convexity} \[f(\x(t)) \leq tf(\x^*) + (1-t)f(\xye) < tf(\x^*) + (1-t)f(\x^*) = f(\x^*)\] for all $t \in (0,1)$. -We can pick $t$ to be sufficiently close to $1$ that $\x(t) \in \calN$; then $f(\x(t)) \geq f(\x^*)$ by the definition of $\calN$, but $f(\x(t)) < f(\x^*)$ by the above inequality, a contradiction. +We can pick $t$ to be sufficiently close to $1$ that $\x(t) \in N$; then $f(\x(t)) \geq f(\x^*)$ by the definition of $N$, but $f(\x(t)) < f(\x^*)$ by the above inequality, a contradiction. It follows that $f(\x^*) \leq f(\x)$ for all $\x \in \calX$, so $\x^*$ is a global minimum of $f$ in $\calX$. \end{proof} @@ -153,7 +153,7 @@ \subsubsection{Showing that a function is convex} Norms are convex. \end{proposition} \begin{proof} -Let $\|\cdot\|$ be a norm on $\R^d$. Then for all $\x, \y \in \R^d$ and $t \in [0,1]$, +Let $\|\cdot\|$ be a norm on a vector space $V$. Then for all $\x, \y \in V$ and $t \in [0,1]$, \[\|t\x + (1-t)\y\| \leq \|t\x\| + \|(1-t)\y\| = |t|\|\x\| + |1-t|\|\y\| = t\|\x\| + (1-t)\|\y\|\] where we have used respectively the triangle inequality, the homogeneity of norms, and the fact that $t$ and $1-t$ are nonnegative. Hence $\|\cdot\|$ is convex. @@ -228,16 +228,16 @@ \subsubsection{Showing that a function is convex} \end{proof} \begin{proposition} -If $f$ is convex, then $g(\vec{x}) \equiv f(A\x + \vec{b})$ is convex for any appropriately-sized $A$ and $\b$. +If $f$ is convex, then $g(\vec{x}) \equiv f(\A\x + \vec{b})$ is convex for any appropriately-sized $\A$ and $\b$. \end{proposition} \begin{proof} Suppose $f$ is convex and $g$ is defined like so. 
Then for all $\x, \y \in \dom g$, \begin{align*} -g(t\x + (1-t)\y) &= f(A(t\x + (1-t)\y) + \b) \\ -&= f(tA\x + (1-t)A\y + \b) \\ -&= f(tA\x + (1-t)A\y + t\b + (1-t)\b) \\ -&= f(t(A\x + \b) + (1-t)(A\y + \b)) \\ -&\leq tf(A\x + \b) + (1-t)f(A\y + \b) & \text{convexity of $f$} \\ +g(t\x + (1-t)\y) &= f(\A(t\x + (1-t)\y) + \b) \\ +&= f(t\A\x + (1-t)\A\y + \b) \\ +&= f(t\A\x + (1-t)\A\y + t\b + (1-t)\b) \\ +&= f(t(\A\x + \b) + (1-t)(\A\y + \b)) \\ +&\leq tf(\A\x + \b) + (1-t)f(\A\y + \b) & \text{convexity of $f$} \\ &= tg(\x) + (1-t)g(\y) \end{align*} Thus $g$ is convex. diff --git a/cs189-linalg.tex b/cs189-linalg.tex index 1c8e4f4..8133a89 100644 --- a/cs189-linalg.tex +++ b/cs189-linalg.tex @@ -27,7 +27,28 @@ \subsubsection{Euclidean space} \[\x + \y = \matlit{x_1 + y_1 \\ \vdots \\ x_n + y_n}, \tab \alpha\x = \matlit{\alpha x_1 \\ \vdots \\ \alpha x_n}\] Euclidean space is used to mathematically represent physical space, with notions such as distance, length, and angles. Although it becomes hard to visualize for $n > 3$, these concepts generalize mathematically in obvious ways. -Tip: even when you're working in more general settings than $\R^n$, it is often useful to visualize vector addition and scalar multiplication in terms of 2D vectors in the plane or 3D vectors in space. +Even when you're working in more general settings than $\R^n$, it is often useful to visualize vector addition and scalar multiplication in terms of 2D vectors in the plane or 3D vectors in space. + +\subsubsection{Subspaces} +Vector spaces can contain other vector spaces. +If $V$ is a vector space, then $S \subseteq V$ is said to be a \term{subspace} of $V$ if +\begin{enumerate}[(i)] +\item $\vec{0} \in S$ +\item $S$ is closed under addition: $\x, \y \in S$ implies $\x+\y \in S$ +\item $S$ is closed under scalar multiplication: $\x \in S, \alpha \in \R$ implies $\alpha\x \in S$ +\end{enumerate} +Note that $V$ is always a subspace of $V$, as is the trivial vector space which contains only $\vec{0}$. + +As a concrete example, a line passing through the origin is a subspace of Euclidean space. + +Some of the most important subspaces are those induced by linear maps. +If $T : V \to W$ is a linear map, we define the \term{nullspace}\footnote{ + It is sometimes called the \term{kernel} by algebraists, but we eschew this terminology because the word ``kernel'' has another meaning in machine learning. +} of $T$ as +\[\Null(T) = \{\x \in V \mid T\x = \vec{0}\}\] +and the \term{range} (or the \term{columnspace} if we are considering the matrix form) of $T$ as +\[\range(T) = \{\y \in W \mid \text{$\exists \x \in V$ such that $T\x = \y$}\}\] +It is a good exercise to verify that the nullspace and range of a linear map are always subspaces of its domain and codomain, respectively. \subsection{Metric spaces} Metrics generalize the notion of distance from Euclidean space (although metric spaces need not be vector spaces). @@ -91,12 +112,12 @@ \subsection{Inner product spaces} Note that any inner product on $V$ induces a norm on $V$: \[\|\x\| = \sqrt{\inner{\x}{\x}}\] -One can verify that the axioms for norms are satisfied under this definition and follow directly from the axioms for inner products. +One can verify that the axioms for norms are satisfied under this definition and follow (almost) directly from the axioms for inner products. 
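+For instance, absolute homogeneity follows from a short calculation: +\[\|\alpha\x\| = \sqrt{\inner{\alpha\x}{\alpha\x}} = \sqrt{\alpha^2\inner{\x}{\x}} = |\alpha|\sqrt{\inner{\x}{\x}} = |\alpha|\|\x\|\] +(The triangle inequality is the one axiom that takes more work; it can be established using the Cauchy-Schwarz inequality.)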
Therefore any inner product space is also a normed space (and hence also a metric space).\footnote{ If an inner product space is complete with respect to the distance metric induced by its inner product, we say that it is a \term{Hilbert space}. } -Two vectors $\x$ and $\y$ are said to be \term{orthogonal} if $\inner{\x}{\y} = 0$. +Two vectors $\x$ and $\y$ are said to be \term{orthogonal} if $\inner{\x}{\y} = 0$; we write $\x \perp \y$ for shorthand. Orthogonality generalizes the notion of perpendicularity from Euclidean space. If two orthogonal vectors $\x$ and $\y$ additionally have unit length (i.e. $\|\x\| = \|\y\| = 1$), then they are described as \term{orthonormal}. @@ -109,11 +130,11 @@ \subsection{Inner product spaces} \subsubsection{Pythagorean Theorem} The well-known Pythagorean theorem generalizes naturally to arbitrary inner product spaces. \begin{theorem} -If $\inner{\x}{\y} = 0$, then +If $\x \perp \y$, then \[\|\x+\y\|^2 = \|\x\|^2 + \|\y\|^2\] \end{theorem} \begin{proof} -Suppose $\inner{\x}{\y} = 0$. Then +Suppose $\x \perp \y$, i.e. $\inner{\x}{\y} = 0$. Then \[\|\x+\y\|^2 = \inner{\x+\y}{\x+\y} = \inner{\x}{\x} + \inner{\y}{\x} + \inner{\x}{\y} + \inner{\y}{\y} = \|\x\|^2 + \|\y\|^2\] as claimed. \end{proof} @@ -159,7 +180,7 @@ \subsection{Eigenthings} \[\x = \A\inv\A\x = \A\inv(\lambda\x) = \lambda\A\inv\x\] Dividing by $\lambda$, which is valid because the invertibility of $\A$ implies $\lambda \neq 0$, gives $\lambda\inv\x = \A\inv\x$. -(iii) The case $k \geq 0$ follows almost immediately by induction on $k$. +(iii) The case $k \geq 0$ follows immediately by induction on $k$. Then the general case $k \in \Z$ follows by combining the $k \geq 0$ case with (ii). \end{proof} @@ -169,12 +190,14 @@ \subsection{Trace} The trace has several nice algebraic properties: \begin{enumerate}[(i)] \item $\tr(\A+\mat{B}) = \tr(\A) + \tr(\mat{B})$ -\item $\tr(\alpha \A) = \alpha\tr(\A)$ +\item $\tr(\alpha\A) = \alpha\tr(\A)$ \item $\tr(\A\tran) = \tr(\A)$ -\item $\tr(\A\mat{B}\mat{C}\mat{D}) = \tr(\mat{B}\mat{C}\mat{D}\A) = \tr(\mat{C}\mat{D}\A\mat{B}) = \tr(\mat{B}\A\mat{D}\mat{C})$ +\item $\tr(\A\mat{B}\mat{C}\mat{D}) = \tr(\mat{B}\mat{C}\mat{D}\A) = \tr(\mat{C}\mat{D}\A\mat{B}) = \tr(\mat{D}\A\mat{B}\mat{C})$ \end{enumerate} The first three properties follow readily from the definition. -The last is known as \term{invariance under cyclic permutations}. Note that the matrices cannot be reordered arbitrarily, for example $\tr(\A\mat{B}\mat{C}\mat{D}) \neq \tr(\mat{B}\A\mat{C}\mat{D})$ in general. +The last is known as \term{invariance under cyclic permutations}. +Note that the matrices cannot be reordered arbitrarily, for example $\tr(\A\mat{B}\mat{C}\mat{D}) \neq \tr(\mat{B}\A\mat{C}\mat{D})$ in general. +Also, there is nothing special about the product of four matrices -- analogous rules hold for more or fewer matrices. Interestingly, the trace of a matrix is equal to the sum of its eigenvalues (repeated according to multiplicity): \[\tr(\A) = \sum_i \lambda_i(\A)\] @@ -192,11 +215,7 @@ \subsection{Determinant} Interestingly, the determinant of a matrix is equal to the product of its eigenvalues (repeated according to multiplicity): \[\det(\A) = \prod_i \lambda_i(\A)\] -\subsection{Special kinds of matrices} -There are several ways matrices can be classified. -Each categorization implies some potentially desirable properties, so it's always good to know what kind of matrix you're dealing with. 
- -\subsubsection{Orthogonal matrices} +\subsection{Orthogonal matrices} A matrix $\mat{Q} \in \R^{n \times n}$ is said to be \term{orthogonal} if its columns are pairwise orthonormal. This definition implies that \[\mat{Q}\tran \mat{Q} = \mat{Q}\mat{Q}\tran = \I\] @@ -206,92 +225,26 @@ \subsubsection{Orthogonal matrices} \[\|\mat{Q}\x\|_2 = \sqrt{(\mat{Q}\x)\tran(\mat{Q}\x)} = \sqrt{\x\tran\x} = \|\x\|_2\] Therefore multiplication by an orthogonal matrix can be considered as a transformation that preserves length, but may rotate or reflect the vector about the origin. -\subsubsection{Symmetric matrices} -A matrix $\A \in \R^{n \times n}$ is said to be \term{symmetric} if it is equal to its own transpose ($\A = \A\tran$). +\subsection{Symmetric matrices} +A matrix $\A \in \R^{n \times n}$ is said to be \term{symmetric} if it is equal to its own transpose ($\A = \A\tran$), meaning that $A_{ij} = A_{ji}$ for all $(i,j)$. This definition seems harmless enough but turns out to have some strong implications. We summarize the most important of these as \begin{theorem} (Spectral Theorem) -Let $\A \in \R^{n \times n}$ be symmetric. -Then there exists an orthonormal basis for $\R^n$ consisting of eigenvectors of $\A$. +If $\A \in \R^{n \times n}$ is symmetric, then there exists an orthonormal basis for $\R^n$ consisting of eigenvectors of $\A$. \end{theorem} -This theorem allows us to factor symmetric matrices as follows: +The practical application of this theorem is a particular factorization of symmetric matrices, referred to as the \term{eigendecomposition} or \term{spectral decomposition}. +Denote the orthonormal basis of eigenvectors $\q_1, \dots, \q_n$ and their eigenvalues $\lambda_1, \dots, \lambda_n$. +Let $\mat{Q}$ be an orthogonal matrix with $\q_1, \dots, \q_n$ as its columns, and $\mat{\Lambda} = \diag(\lambda_1, \dots, \lambda_n)$. +Since by definition $\A\q_i = \lambda_i\q_i$ for every $i$, the following relationship holds: +\[\A\mat{Q} = \mat{Q}\mat{\Lambda}\] +Right-multiplying by $\mat{Q}\tran$, we arrive at the decomposition \[\A = \mat{Q}\mat{\Lambda}\mat{Q}\tran\] -Here $\mat{Q}$ is an orthogonal matrix with the aforementioned orthogonal basis as its columns, and $\mat{\Lambda} = \diag(\lambda_1, \dots, \lambda_n)$, where $\lambda_1, \dots, \lambda_n \in \R$ are the corresponding eigenvalues\footnote{ - The fact that the eigenvalues are real also follows from the symmetry of $\A$. -} of $\A$. -This is referred to as the \term{eigendecomposition} or \term{spectral decomposition} of $\A$. - -\subsubsection{Positive (semi-)definite matrices} -A symmetric matrix $\A$ is \term{positive definite} if for all nonzero $\x \in \R^n$, $\x\tran\A\x > 0$. -Sometimes people write $\A \succ 0$ to indicate that $\A$ is positive definite. -Positive definite matrices have all positive eigenvalues and diagonal entries. - -A symmetric matrix $\A$ is \term{positive semi-definite} if for all $\x \in \R^n$, $\x\tran\A\x \geq 0$. -Sometimes people write $\A \succeq 0$ to indicate that $\A$ is positive semi-definite. -Positive semi-definite matrices have all nonnegative eigenvalues and diagonal entries. - -Positive definite and positive semi-definite matrices will come up very frequently! -Note that since these matrices are also symmetric, the properties of symmetric matrices apply here as well. 
- -As an example of how these matrices arise, the matrix $\A\tran\A$ is positive semi-definite for any $\A \in \R^{m \times n}$, since -\[\x\tran (\A\tran\A)\x = (\A\x)\tran(\A\x) = \|\A\x\|_2^2 \geq 0\] -for any $\x \in \R^n$. - -\subsection{Singular value decomposition} -Singular value decomposition (SVD) is a widely applicable tool in linear algebra. -Its strength stems partially from the fact that \textit{every matrix} $\A \in \R^{m \times n}$ has an SVD (even non-square matrices)! -The decomposition goes as follows: -\[\A = \mat{U}\mat{\Sigma}\mat{V}\tran\] -where $\mat{U} \in \R^{m \times m}$ and $\mat{V} \in \R^{n \times n}$ are orthogonal matrices and $\mat{\Sigma} \in \R^{m \times n}$ is a diagonal matrix with the \term{singular values} of $\A$ (denoted $\sigma_i$) on its diagonal. -The singular values of $\A$ are defined as the square roots of the eigenvalues of $\A\tran\A$ (or equivalently, of $\A\A\tran$). - -By convention, the singular values are given in non-increasing order, i.e. -\[\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_{\min(m,n)} \geq 0\] -Only the first $r$ singular values are nonzero, where $r$ is the rank of $\A$. - -The columns of $\mat{U}$ are called the \term{left-singular vectors} of $\A$, and they are eigenvectors of $\A\A\tran$. -(Try showing this!) -The columns of $\mat{V}$ are called the \term{right-singular vectors} of $\A$, and they are eigenvectors of $\A\tran\A$. - -\subsection{Some useful matrix identities} -\subsubsection{Matrix-vector product as linear combination of matrix columns} -\begin{proposition} -Let $\x \in \R^n$ be a vector and $\A \in \R^{m \times n}$ a matrix with columns $\a_1, \dots, \a_n$. -Then -\[\A\x = \sum_{i=1}^n x_i\a_i\] -\end{proposition} -This identity is extremely useful in understanding linear operators in terms of their matrices' columns. -The proof is very simple (consider each element of $\A\x$ individually and expand by definitions) but it is a good exercise to convince yourself. - -\subsubsection{Sum of outer products as matrix-matrix product} -An \term{outer product} is an expression of the form $\a\b\tran$, where $\a \in \R^m$ and $\b \in \R^n$. -By inspection it is not hard to see that such an expression yields an $m \times n$ matrix such that -\[[\a\b\tran]_{ij} = a_ib_j\] -It is not immediately obvious, but the sum of outer products is actually equivalent to an appropriate matrix-matrix product! -We formalize this statement as -\begin{proposition} -Let $\a_1, \dots, \a_k \in \R^m$ and $\b_1, \dots, \b_k \in \R^n$. Then -\[\sum_{\ell=1}^k \a_\ell\b_\ell\tran = \mat{A}\mat{B}\tran\] -where -\[\mat{A} = \matlit{\a_1 & \cdots & \a_k}, \tab \mat{B} = \matlit{\b_1 & \cdots & \b_k}\] -\end{proposition} -\begin{proof} -For each $(i,j)$, we have -\[\left[\sum_{\ell=1}^k \a_\ell\b_\ell\tran\right]_{ij} = \sum_{\ell=1}^k [\a_\ell\b_\ell\tran]_{ij} = \sum_{\ell=1}^k [\a_\ell]_i[\b_\ell]_j = \sum_{\ell=1}^k A_{i\ell}B_{j\ell}\] -This last expression should be recognized as an inner product between the $i$th row of $\A$ and the $j$th row of $\mat{B}$, or equivalently the $j$th column of $\mat{B}\tran$. -Hence by the definition of matrix multiplication, it is equal to $[\mat{A}\mat{B}\tran]_{ij}$. -\end{proof} -\subsection{Quadratic forms} +\subsubsection{Rayleigh quotients} Let $\A \in \R^{n \times n}$ be a symmetric matrix. -The expression $\x\tran\A\x$ is called a \term{quadratic form} and comes up all the time. 
-It is in some cases helpful to rewrite quadratic forms in terms of the individual elements that make up $\A$ and $\x$: -\[\x\tran\A\x = \sum_{i=1}^n\sum_{j=1}^n A_{ij}x_ix_j\] -This identity is not hard to show, but the derivation is somewhat tedious, so we omit it. -The result can be used, for example, to derive $\nabla_\x(\x\tran\A\x)$, as well as to prove that all the diagonal entries of a positive-definite matrix are positive. +The expression $\x\tran\A\x$ is called a \term{quadratic form}. -\subsubsection{Rayleigh quotients} There turns out to be an interesting connection between the quadratic form of a symmetric matrix and its eigenvalues. This connection is provided by the \term{Rayleigh quotient} \[R_\A(\x) = \frac{\x\tran\A\x}{\x\tran\x}\] @@ -336,36 +289,147 @@ \subsubsection{Rayleigh quotients} with equality if and only if $\x$ is a corresponding eigenvector. \end{theorem} +\subsection{Positive (semi-)definite matrices} +A symmetric matrix $\A$ is \term{positive semi-definite} if for all $\x \in \R^n$, $\x\tran\A\x \geq 0$. +Sometimes people write $\A \succeq 0$ to indicate that $\A$ is positive semi-definite. + +A symmetric matrix $\A$ is \term{positive definite} if for all nonzero $\x \in \R^n$, $\x\tran\A\x > 0$. +Sometimes people write $\A \succ 0$ to indicate that $\A$ is positive definite. +Note that positive definiteness is a strictly stronger property than positive semi-definiteness, in the sense that every positive definite matrix is positive semi-definite but not vice-versa. + +These properties are related to eigenvalues in the following way. +\begin{proposition} +A symmetric matrix is positive semi-definite if and only if all of its eigenvalues are nonnegative, and positive definite if and only if all of its eigenvalues are positive. +\end{proposition} +\begin{proof} +Suppose $A$ is positive semi-definite, and let $\x$ be an eigenvector of $\A$ with eigenvalue $\lambda$. +Then +\[0 \leq \x\tran\A\x = \x\tran(\lambda\x) = \lambda\x\tran\x = \lambda\|\x\|_2^2\] +Since $\x \neq \vec{0}$ (by the assumption that it is an eigenvector), we have $\|\x\|_2^2 > 0$, so we can divide both sides by $\|\x\|_2^2$ to arrive at $\lambda \geq 0$. +If $\A$ is positive definite, the inequality above holds strictly, so $\lambda > 0$. +This proves one direction. + +To simplify the proof of the other direction, we will use the machinery of Rayleigh quotients. +Suppose that $\A$ is symmetric and all its eigenvalues are nonnegative. +Then for all $\x \neq \vec{0}$, +\[0 \leq \lambda_{\min}(\A) \leq R_\A(\x)\] +Since $\x\tran\A\x$ matches $R_\A(\x)$ in sign, we conclude that $\A$ is positive semi-definite. +If the eigenvalues of $\A$ are all strictly positive, then $0 < \lambda_{\min}(\A)$, whence it follows that $\A$ is positive definite. +\end{proof} +As an example of how these matrices arise, consider +\begin{proposition} +Suppose $\A \in \R^{m \times n}$. +Then $\A\tran\A$ is positive semi-definite. +If $\Null(\A) = \{\vec{0}\}$, then $\A\tran\A$ is positive definite. +\end{proposition} +\begin{proof} +For any $\x \in \R^n$, +\[\x\tran (\A\tran\A)\x = (\A\x)\tran(\A\x) = \|\A\x\|_2^2 \geq 0\] +so $\A\tran\A$ is positive semi-definite. + +Note that $\|\A\x\|_2^2 = 0$ implies $\|\A\x\|_2 = 0$, which in turn implies $\A\x = \vec{0}$ (recall that this is a property of norms). +If $\Null(\A) = \{\vec{0}\}$, $\A\x = \vec{0}$ implies $\x = \vec{0}$, so $\x\tran (\A\tran\A)\x = 0$ if and only if $\x = \vec{0}$, and thus $\A\tran\A$ is positive definite. 
+\end{proof} +Positive definite matrices are invertible (since their eigenvalues are nonzero), whereas positive semi-definite matrices might not be. +However, if you already have a positive semi-definite matrix, it is possible to perturb its diagonal slightly to produce a positive definite matrix. +\begin{proposition} +If $\A$ is positive semi-definite and $\epsilon > 0$, then $\A + \epsilon\I$ is positive definite. +\end{proposition} +\begin{proof} +Assuming $\A$ is positive semi-definite and $\epsilon > 0$, we have for any $\x \neq \vec{0}$ that +\[\x\tran(\A+\epsilon\I)\x = \x\tran\A\x + \epsilon\x\tran\I\x = \underbrace{\x\tran\A\x}_{\geq 0} + \underbrace{\epsilon\|\x\|_2^2}_{> 0} > 0\] +as claimed. +\end{proof} +An obvious but frequently useful consequence of the two propositions we have just shown is that $\A\tran\A + \epsilon\I$ is positive definite (and in particular, invertible) for \textit{any} matrix $\A$ and any $\epsilon > 0$. + \subsubsection{The geometry of positive definite quadratic forms} A useful way to understand quadratic forms is by the geometry of their level sets. -Recall that a \term{level set} or \term{isocontour} of a function is the set of all inputs such that the function applied to those inputs yields a given output. +A \term{level set} or \term{isocontour} of a function is the set of all inputs such that the function applied to those inputs yields a given output. Mathematically, the $c$-isocontour of $f$ is $\{\x \in \dom f : f(\x) = c\}$. Let us consider the special case $f(\x) = \x\tran\mat{A}\x$ where $\mat{A}$ is a positive definite matrix. -Since $\mat{A}$ is positive definite, it has a unique matrix square root $\A\halfpow = \mat{Q}\mat{\Lambda}\halfpow\mat{Q}\tran$, where $\mat{Q}\mat{\Lambda}\mat{Q}\tran = \A$ is the eigendecomposition of $\A$ and $\mat{\Lambda}\halfpow = \diag(\sqrt{\lambda_1}, \dots \sqrt{\lambda_d})$. -It is easy to see that this matrix $\A\halfpow$ is positive definite and satisfies $\A\halfpow\A\halfpow = \A$. -Fixing a value $c \geq 0$, the $c$-isocontour of $f$ is the set of $\x \in \R^d$ such that +Since $\mat{A}$ is positive definite, it has a unique matrix square root $\A\halfpow = \mat{Q}\mat{\Lambda}\halfpow\mat{Q}\tran$, where $\mat{Q}\mat{\Lambda}\mat{Q}\tran$ is the eigendecomposition of $\A$ and $\mat{\Lambda}\halfpow = \diag(\sqrt{\lambda_1}, \dots \sqrt{\lambda_n})$. +It is easy to see that this matrix $\A\halfpow$ is positive definite (consider its eigenvalues) and satisfies $\A\halfpow\A\halfpow = \A$. +Fixing a value $c \geq 0$, the $c$-isocontour of $f$ is the set of $\x \in \R^n$ such that \[c = \x\tran\A\x = \x\tran\A\halfpow\A\halfpow\x = \|\A\halfpow\x\|_2^2\] where we have used the symmetry of $\A\halfpow$. Making the change of variable $\z = \A\halfpow\x$, we have the condition $\|\z\|_2 = \sqrt{c}$. That is, the values $\z$ lie on a sphere of radius $\sqrt{c}$. These can be parameterized as $\z = \sqrt{c}\hat{\z}$ where $\hat{\z}$ has $\|\hat{\z}\|_2 = 1$. Then since $\A\neghalfpow = \mat{Q}\mat{\Lambda}\neghalfpow\mat{Q}\tran$, we have -\[\x = \A\neghalfpow\z = \mat{Q}\mat{\Lambda}\neghalfpow\mat{Q}\tran\sqrt{c}\hat{\z} = \mat{Q}(\sqrt{c}\mat{\Lambda}\neghalfpow)\tilde{\z}\] +\[\x = \A\neghalfpow\z = \mat{Q}\mat{\Lambda}\neghalfpow\mat{Q}\tran\sqrt{c}\hat{\z} = \sqrt{c}\mat{Q}\mat{\Lambda}\neghalfpow\tilde{\z}\] where $\tilde{\z} = \mat{Q}\tran\hat{\z}$ also satisfies $\|\tilde{\z}\|_2 = 1$ since $\mat{Q}$ is orthogonal. 
-Using this parameterization, we see that the solution set $\{\x \in \R^d : f(\x) = c\}$ is the image of the unit sphere $\{\tilde{\z} \in \R^d : \|\tilde{\z}\|_2 = 1\}$ under the invertible linear map $\x = \mat{Q}(\sqrt{c}\mat{\Lambda}\neghalfpow)\tilde{\z}$. +Using this parameterization, we see that the solution set $\{\x \in \R^n : f(\x) = c\}$ is the image of the unit sphere $\{\tilde{\z} \in \R^n : \|\tilde{\z}\|_2 = 1\}$ under the invertible linear map $\x = \sqrt{c}\mat{Q}\mat{\Lambda}\neghalfpow\tilde{\z}$. What we have gained with all these manipulations is a clear algebraic understanding of the $c$-isocontour of $f$ in terms of a sequence of linear transformations applied to a well-understood set. -We begin with the unit sphere, then scale every axis $i$ by $\sqrt{c}\lambda_i\neghalfpow$, resulting in an axis-aligned ellipsoid. +We begin with the unit sphere, then scale every axis $i$ by $\lambda_i\neghalfpow$, resulting in an axis-aligned ellipsoid. Observe that the axis lengths of the ellipsoid are proportional to the inverse square roots of the eigenvalues of $\A$. Hence larger eigenvalues correspond to shorter axis lengths, and vice-versa. Then this axis-aligned ellipsoid undergoes a rigid transformation (i.e. one that preserves length and angles, such as a rotation/reflection) given by $\mat{Q}$. The result of this transformation is that the axes of the ellipse are no longer along the coordinate axes in general, but rather along the directions given by the corresponding eigenvectors. -To see this, consider the unit vector $\vec{e}_i \in \R^d$ that has $[\vec{e}_i]_j = \delta_{ij}$. +To see this, consider the unit vector $\vec{e}_i \in \R^n$ that has $[\vec{e}_i]_j = \delta_{ij}$. In the pre-transformed space, this vector points along the axis with length proportional to $\lambda_i\neghalfpow$. But after applying the rigid transformation $\mat{Q}$, the resulting vector points in the direction of the corresponding eigenvector $\q_i$, since -\[\mat{Q}\vec{e}_i = \sum_{j=1}^d [e_i]_j\q_j = \q_i\] +\[\mat{Q}\vec{e}_i = \sum_{j=1}^n [\vec{e}_i]_j\q_j = \q_i\] where we have used the matrix-vector product identity from earlier. In summary: the isocontours of $f(\x) = \x\tran\A\x$ are ellipsoids such that the axes point in the directions of the eigenvectors of $\A$, and the radii of these axes are proportional to the inverse square roots of the corresponding eigenvalues. + +\subsection{Singular value decomposition} +Singular value decomposition (SVD) is a widely applicable tool in linear algebra. +Its strength stems partially from the fact that \textit{every matrix} $\A \in \R^{m \times n}$ has an SVD (even non-square matrices)! +The decomposition goes as follows: +\[\A = \mat{U}\mat{\Sigma}\mat{V}\tran\] +where $\mat{U} \in \R^{m \times m}$ and $\mat{V} \in \R^{n \times n}$ are orthogonal matrices and $\mat{\Sigma} \in \R^{m \times n}$ is a diagonal matrix with the \term{singular values} of $\A$ (denoted $\sigma_i$) on its diagonal. + +By convention, the singular values are given in non-increasing order, i.e. +\[\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_{\min(m,n)} \geq 0\] +Only the first $r$ singular values are nonzero, where $r$ is the rank of $\A$. 
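+As a trivial example, the rank-$1$ matrix +\[\A = \matlit{3 & 0 \\ 0 & 0}\] +has the SVD $\A = \mat{U}\mat{\Sigma}\mat{V}\tran$ with $\mat{U} = \mat{V} = \I$ and $\mat{\Sigma} = \diag(3, 0)$, so its only nonzero singular value is $\sigma_1 = 3$.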
+ +Observe that the SVD factors provide eigendecompositions for $\A\tran\A$ and $\A\A\tran$: +\begin{align*} +\A\tran\A &= (\mat{U}\mat{\Sigma}\mat{V}\tran)\tran\mat{U}\mat{\Sigma}\mat{V}\tran = \mat{V}\mat{\Sigma}\tran\mat{U}\tran\mat{U}\mat{\Sigma}\mat{V}\tran = \mat{V}\mat{\Sigma}\tran\mat{\Sigma}\mat{V}\tran \\ +\A\A\tran &= \mat{U}\mat{\Sigma}\mat{V}\tran(\mat{U}\mat{\Sigma}\mat{V}\tran)\tran = \mat{U}\mat{\Sigma}\mat{V}\tran\mat{V}\mat{\Sigma}\tran\mat{U}\tran = \mat{U}\mat{\Sigma}\mat{\Sigma}\tran\mat{U}\tran +\end{align*} +It follows immediately that the columns of $\mat{V}$ (the \term{right-singular vectors} of $\A$) are eigenvectors of $\A\tran\A$, and the columns of $\mat{U}$ (the \term{left-singular vectors} of $\A$) are eigenvectors of $\A\A\tran$. + +The matrices $\mat{\Sigma}\tran\mat{\Sigma}$ and $\mat{\Sigma}\mat{\Sigma}\tran$ are not necessarily the same size, but both are diagonal with the squared singular values $\sigma_i^2$ on the diagonal (plus possibly some zeros). +Thus the singular values of $\A$ are the square roots of the eigenvalues of $\A\tran\A$ (or equivalently, of $\A\A\tran$)\footnote{ + Recall that $\A\tran\A$ and $\A\A\tran$ are positive semi-definite, so their eigenvalues are nonnegative, and thus taking square roots is always well-defined. +}. + +\subsection{Some useful matrix identities} +\subsubsection{Matrix-vector product as linear combination of matrix columns} +\begin{proposition} +Let $\x \in \R^n$ be a vector and $\A \in \R^{m \times n}$ a matrix with columns $\a_1, \dots, \a_n$. +Then +\[\A\x = \sum_{i=1}^n x_i\a_i\] +\end{proposition} +This identity is extremely useful in understanding linear operators in terms of their matrices' columns. +The proof is very simple (consider each element of $\A\x$ individually and expand by definitions) but it is a good exercise to convince yourself. + +\subsubsection{Sum of outer products as matrix-matrix product} +An \term{outer product} is an expression of the form $\a\b\tran$, where $\a \in \R^m$ and $\b \in \R^n$. +By inspection it is not hard to see that such an expression yields an $m \times n$ matrix such that +\[[\a\b\tran]_{ij} = a_ib_j\] +It is not immediately obvious, but the sum of outer products is actually equivalent to an appropriate matrix-matrix product! +We formalize this statement as +\begin{proposition} +Let $\a_1, \dots, \a_k \in \R^m$ and $\b_1, \dots, \b_k \in \R^n$. Then +\[\sum_{\ell=1}^k \a_\ell\b_\ell\tran = \mat{A}\mat{B}\tran\] +where +\[\mat{A} = \matlit{\a_1 & \cdots & \a_k}, \tab \mat{B} = \matlit{\b_1 & \cdots & \b_k}\] +\end{proposition} +\begin{proof} +For each $(i,j)$, we have +\[\left[\sum_{\ell=1}^k \a_\ell\b_\ell\tran\right]_{ij} = \sum_{\ell=1}^k [\a_\ell\b_\ell\tran]_{ij} = \sum_{\ell=1}^k [\a_\ell]_i[\b_\ell]_j = \sum_{\ell=1}^k A_{i\ell}B_{j\ell}\] +This last expression should be recognized as an inner product between the $i$th row of $\A$ and the $j$th row of $\mat{B}$, or equivalently the $j$th column of $\mat{B}\tran$. +Hence by the definition of matrix multiplication, it is equal to $[\mat{A}\mat{B}\tran]_{ij}$. +\end{proof} + +\subsubsection{Quadratic forms} +Let $\A \in \R^{n \times n}$ be a symmetric matrix, and recall that the expression $\x\tran\A\x$ is called a quadratic form of $\A$. 
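+For example, in the $2 \times 2$ case with $\A = \matlit{a & b \\ b & c}$, +\[\x\tran\A\x = \matlit{x_1 & x_2}\matlit{a & b \\ b & c}\matlit{x_1 \\ x_2} = ax_1^2 + 2bx_1x_2 + cx_2^2\]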
+It is in some cases helpful to rewrite the quadratic form in terms of the individual elements that make up $\A$ and $\x$: +\[\x\tran\A\x = \sum_{i=1}^n\sum_{j=1}^n A_{ij}x_ix_j\] +This identity is valid for any square matrix (need not be symmetric), although quadratic forms are usually only discussed in the context of symmetric matrices. \ No newline at end of file diff --git a/cs189-probability.tex b/cs189-probability.tex index 5621ffc..4faecd8 100644 --- a/cs189-probability.tex +++ b/cs189-probability.tex @@ -1,5 +1,4 @@ Probability theory provides powerful tools for modeling and dealing with uncertainty. -It is used extensively in machine learning, particularly to construct and analyze classifiers. \subsection{Basics} Suppose we have some sort of randomized experiment (e.g. a coin toss, die roll) that has a fixed set of possible outcomes. @@ -9,6 +8,7 @@ \subsection{Basics} The set of events is denoted $\calF$.\footnote{ $\calF$ is required to be a $\sigma$-algebra for technical reasons; see \cite{rigorousprob}. } +The \term{complement} of the event $A$ is another event, $A\comp = \Omega \setminus A$. Then we can define a \term{probability measure} $\P : \calF \to [0,1]$ which must satisfy \begin{enumerate}[(i)] @@ -146,8 +146,8 @@ \subsubsection{Continuous random variables} One is as relative probabilities; even though the probability of each particular value being picked is technically zero, some points are still in a sense more likely than others. One can also think of the density as determining the probability that the variable will lie in a small range about a given value. -Recall that for small $\epsilon$, -\[\pr{x-\nicefrac{\epsilon}{2} \leq X \leq x+\nicefrac{\epsilon}{2}} = \int_{x-\nicefrac{\epsilon}{2}}^{x+\nicefrac{\epsilon}{2}} p(z)\dd{z} \approx \epsilon p(x)\] +This is because, for small $\epsilon > 0$, +\[\pr{x-\epsilon \leq X \leq x+\epsilon} = \int_{x-\epsilon}^{x+\epsilon} p(z)\dd{z} \approx 2\epsilon p(x)\] using a midpoint approximation to the integral. Here are some useful identities that follow from the definitions above: diff --git a/math4ml.bib b/math4ml.bib index 4859025..7603d4b 100644 --- a/math4ml.bib +++ b/math4ml.bib @@ -45,10 +45,10 @@ @book{rice address = "Belmont, California" } -@book{folland, - author = "Gerald B. Folland", - title = "Real Analysis: Modern Techniques and Their Applications (Second Edition)", - year = "1999", - publisher = "John Wiley \& Sons", +@book{afam, + author = "Ward Cheney", + title = "Analysis for Applied Mathematics", + year = "2001", + publisher = "Springer Science+Business Media", address = "New York" } diff --git a/math4ml.pdf b/math4ml.pdf index 92caa73..d0a7b83 100644 Binary files a/math4ml.pdf and b/math4ml.pdf differ diff --git a/math4ml.tex b/math4ml.tex index 97425b5..0b66f94 100644 --- a/math4ml.tex +++ b/math4ml.tex @@ -41,8 +41,8 @@ \section{Notation} $\R^n$ & set (vector space) of $n$-tuples of real numbers, endowed with the usual inner product \\ $\R^{m \times n}$ & set (vector space) of $m$-by-$n$ matrices \\ $\delta_{ij}$ & Kronecker delta, i.e.
$\delta_{ij} = 1$ if $i = j$, $0$ otherwise \\ -$\nabla f(\vec{x})$ & gradient of the function $f$ evaluated at $\x$ \\ -$\nabla^2 f(\vec{x})$ & Hessian of the function $f$ evaluated at $\x$ \\ +$\nabla f(\vec{x})$ & gradient of the function $f$ at $\x$ \\ +$\nabla^2 f(\vec{x})$ & Hessian of the function $f$ at $\x$ \\ $\A\tran$ & transpose of the matrix $\A$ \\ $\Omega$ & sample space \\ $\pr{A}$ & probability of event $A$ \\ @@ -84,7 +84,7 @@ \section{Probability} \newpage \section*{Acknowledgements} -The author would like to thank Michael Franco for suggested clarifications. +The author would like to thank Michael Franco for suggested clarifications, and Chinmoy Saayujya for catching a typo. \bibliography{math4ml} \addcontentsline{toc}{section}{References} diff --git a/measure-probability.pdf b/measure-probability.pdf index 1174dfd..03edd43 100644 Binary files a/measure-probability.pdf and b/measure-probability.pdf differ diff --git a/measure-probability.tex b/measure-probability.tex index 01b2a7f..d183075 100644 --- a/measure-probability.tex +++ b/measure-probability.tex @@ -19,7 +19,7 @@ \section{About} \section{Collections of sets} We would like to assign measures to various subsets of $\R^n$ characterizing their size. Ideally our measure $\mu$ would satisfy -\begin{enumerate} +\begin{enumerate}[(i)] \item For any countable collection of disjoint sets $E_1, E_2, \dots \subseteq \R^n$, \[\mu\bigg(\bigcup_i E_i\bigg) = \sum_i \mu(E_i)\] \item If two sets $E, F \subseteq \R^n$ are such that $E$ can be transformed into $F$ by rigid transformations, then $\mu(E) = \mu(F)$. @@ -42,15 +42,15 @@ \section{Collections of sets} \subsection{Algebras and $\sigma$-algebras} Let $\Omega$ be a non-empty set. Then $\calA \subseteq \calP(\Omega)$ is an algebra on $\Omega$ if -\begin{enumerate} +\begin{enumerate}[(i)] \item $\calA$ is non-empty. -\item If $E \in \calA$, then $E^c = \Omega \setminus E \in \calA$. +\item If $E \in \calA$, then $E\comp = \Omega \setminus E \in \calA$. \item If $E_1, \dots, E_n \in \calA$, then $\bigcup_{i=1}^n E_i \in \calA$. \end{enumerate} The second property states that $\calA$ is \term{closed under complements}. Using de Morgan's laws, properties 2 and 3 collectively imply that $\calA$ is closed under finite intersections as well, since -\[\bigcap_{i=1}^n E_i = \bigg(\bigcup_{i=1}^n E_i^c\bigg)^c\] -Then we must have $\varnothing \in \calA$; since $\calA$ is non-empty there exists some $E \in \calA$, so $E^c \in \calA$, and hence $\varnothing = E \cap E^c \in \calA$. +\[\bigcap_{i=1}^n E_i = \bigg(\bigcup_{i=1}^n E_i\comp\bigg)\comp\] +Then we must have $\varnothing \in \calA$; since $\calA$ is non-empty there exists some $E \in \calA$, so $E\comp \in \calA$, and hence $\varnothing = E \cap E\comp \in \calA$. In light of the desirability of countable additivity, we would like the collection of subsets we consider to be closed under unions of countably many sets, not just finitely many. Thus we need to strengthen condition 3, and arrive at the following definition: a \term{$\sigma$-algebra} is an algebra that is closed under countable unions. @@ -67,7 +67,7 @@ \section{Measures} Let $\Omega$ be a non-empty set and $\calM \subseteq \calP(\Omega)$ a $\sigma$-algebra. The pair $(\Omega, \calM)$ is called a \term{measurable space}, and the elements of $\calM$ are its \term{measurable sets}. 
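+For example, for any non-empty set $\Omega$, the pair $(\Omega, \calP(\Omega))$ is a measurable space, since the power set of $\Omega$ is easily verified to be a $\sigma$-algebra.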
A \term{measure} on $(\Omega, \calM)$ is a function $\mu : \calM \to [0,\infty]$ such that -\begin{enumerate} +\begin{enumerate}[(i)] \item $\mu(\varnothing) = 0$ \item For any countable collection of disjoint sets $\{E_i\} \subseteq \calM$, \[\mu\bigg(\bigdotcup_i E_i\bigg) = \sum_i \mu(E_i)\] @@ -130,7 +130,7 @@ \section{Lebesgue measure} The key tool in constructing Lebesgue measure is the \term{Lebesgue outer measure} $\lambda^* : \calP(\R) \to [0,\infty]$, which is given by \[\lambda^*(E) = \inf\left\{\sum_{k=1}^\infty \ell(I_k) : I_k \in \calI, E \subseteq \bigcup_{k=1}^\infty I_k\right\}\] A set $E \subseteq \R$ is said to be \term{Lebesgue measurable} if for every $A \subseteq \R$, -\[\lambda^*(A) = \lambda^*(A \cap E) + \lambda^*(A \cap E^c)\] +\[\lambda^*(A) = \lambda^*(A \cap E) + \lambda^*(A \cap E\comp)\] It turns out that the set of Lebesgue measurable sets is very large and contains pretty much any reasonable set that one would encounter in practice. However, it is possible\footnote{ assuming the axiom of choice } @@ -243,11 +243,11 @@ \section{Probability} From these axioms, a number of useful rules can be derived. \begin{proposition} -If $A$ is an event, then $\pr{A^c} = 1 - \pr{A}$. +If $A$ is an event, then $\pr{A\comp} = 1 - \pr{A}$. \end{proposition} \begin{proof} Using the countable additivity of $\P$, we have -\[\pr{A} + \pr{A^c} = \pr{A \dotcup A^c} = \pr{\Omega} = 1\] +\[\pr{A} + \pr{A\comp} = \pr{A \dotcup A\comp} = \pr{\Omega} = 1\] which proves the result. \end{proof} diff --git a/orthogonal-projection.png b/orthogonal-projection.png new file mode 100644 index 0000000..3c4a5a5 Binary files a/orthogonal-projection.png and b/orthogonal-projection.png differ