% slides.tex

\documentclass{beamer}

% cmap makes text copyable/searchable in the PDF. Its documentation asks for
% it to be loaded at the very start of the preamble, before fontenc, so it is
% placed directly after \documentclass.
\usepackage{cmap}

% encoding
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
% language
\usepackage[ngerman]{babel}
% beamer theme
\usetheme{default}
\beamertemplatenavigationsymbolsempty
\hypersetup{pdfpagemode=UseNone} % don't show bookmarks on initial view
\setbeamertemplate{footline}[frame number]
\setbeamerfont{footline}{size=\footnotesize}
% fonts
\usepackage{mathpazo}
\usefonttheme{serif}

% math mode formatting etc.
\usepackage{amsmath,amsfonts,amssymb,amsthm,sistyle,xcolor,delim}
\SIstyle{German} % German number formatting for \num
\usepackage{graphicx,tikz,calc,tabularx,hhline}
\usetikzlibrary{backgrounds,calc,decorations.pathreplacing,positioning,shapes.geometric}

% default arrow tip for every TikZ picture in this file
\tikzset{
 >=stealth,
}

% metadata
\title{Training of Hidden Markov models as an instance of the expectation maximization algorithm}
\author{Stefan Majewsky}
\date{22. August 2017}

% NOTE: Arbeit in physischer Form mitbringen, falls Fragen dazu kommen

\begin{document}

\begin{frame}
 \titlepage
\end{frame}

\begin{frame}\frametitle{Caveat emptor!}
 Die Definitionen in diesem Vortrag sind aus Zeitgründen stark verkürzt. In der
 eigentlichen Arbeit wird alles rigoros eingeführt und abgeleitet.
\end{frame}

\begin{frame}<1-3>[label=gliederung]\frametitle{Gliederung}
 Sprachmodelle
 \begin{itemize}
  \item Bigramm-Modell
  \item Hidden-Markov-Modell
 \end{itemize}
 \pause
 EM-Algorithmen
 \begin{itemize}
  \item Baum-Welch-Algorithmus
  \pause
  \item<alert@4> Inside-Outside-EM-Algorithmus
  \item Instanziierung für Hidden-Markov-Modell
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Sprachmodell}
 \begin{itemize}
  \item Ziel: probabilistische Beschreibung einer Sprache
   \[
    \text{Satz } x \mapsto \text{Wahrscheinlichkeit } p(x)
    \vspace{-1em}
   \]
  \pause\item beschrieben durch \emph{Modellparameter} $\omega\in\Omega$
   \[\begin{tikzpicture}[c/.style={rectangle,draw},l/.style={anchor=east},r/.style={anchor=west}]
    \node[l] (corpus)   at (0,+0) { \strut Korpus };
    \node[c] (training) at (2,+0) { \strut Training };
    \node[r] (omega)    at (4,+0) { \strut Parameter $\omega$ };
    \draw[->] (corpus) -- (training);
    \draw[->] (training) -- (omega);
    \node[l] (input)    at (0,-2) { \strut Satz $x$ };
    \node[c] (eval)     at (2,-2) { \strut Evaluierung };
    \node[r] (output)   at (4,-2) { \strut $p_\omega(x)$ };
    \draw[->] (input) -- (eval);
    \draw[->] (eval) -- (output);
    \draw[->] (omega.south) |- ($(eval.north)!0.5!(omega.south)$) -| (eval.north);
   \end{tikzpicture}\]
 \end{itemize}
 \scriptsize Grafik nach: H. Vogler. \emph{Maschinelles Übersetzen natürlicher Sprachen.}\\Vorlesung, TU Dresden, Wintersemester 2015/2016.
\end{frame}

\begin{frame}\frametitle{Bigramm-Modell}
 Idee:
 \begin{itemize}
  \item jedes Wort individuell betrachten
  \item Wahrscheinlichkeit hängt nur vom vorherigen Wort ab
   \pause
   \[\begin{tikzpicture}[node distance=10mm]
    \node at (0,0) {~};
    \node (x0) at (1.3,0) { \strut \# };
    \node (x1) at (3.35,0) { \strut Alice };
    \node (x2) at (5.8,0) { \strut sees };
    \node (x3) at (8.05,0) { \strut Bob };
    \node (x4) at (9.7,0) { \strut \# };
    \node at (10,0) {~};
    \draw[decorate,decoration={brace,amplitude=5pt,mirror}] ($(x0.south)!0.3!(x0.south east)+(0,5pt)$) -- ($(x1.south)!0.3!(x1.south west)+(0,5pt)$);
    \draw[decorate,decoration={brace,amplitude=5pt,mirror}] ($(x1.south)!0.3!(x1.south east)+(0,5pt)$) -- ($(x2.south)!0.3!(x2.south west)+(0,5pt)$);
    \draw[decorate,decoration={brace,amplitude=5pt,mirror}] ($(x2.south)!0.3!(x2.south east)+(0,5pt)$) -- ($(x3.south)!0.3!(x3.south west)+(0,5pt)$);
    \draw[decorate,decoration={brace,amplitude=5pt,mirror}] ($(x3.south)!0.3!(x3.south east)+(0,5pt)$) -- ($(x4.south)!0.3!(x4.south west)+(0,5pt)$);
   \end{tikzpicture}\vspace*{-2em}\]
   ~
   \[
    p_b(x) = b(\text{Alice}|\#) \cdot b(\text{sees}|\text{Alice}) \cdot b(\text{Bob}|\text{sees}) \cdot b(\#|\text{Bob})
   \]
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Bigramm-Modell: Training}
 \begin{itemize}
  \item Likelihood eines Korpus\dots
   {\scriptsize\[
    p_b(c) = \prod_x p_b(x)^{c(x)}
   \]}
  \item \dots wird durch empirische Wahrscheinlichkeit maximiert
   {\scriptsize\[
    b(w'|w) = \frac{\operatorname{count}(w\;w')}{\sum_{w''}\operatorname{count}(w\;w'')}
    \vspace*{-1.0em}
   \]}
   \pause
  \item z.~B.~für Korpus: "`Alice sees Bob"', "`Alice likes cake"'
 \end{itemize}
 \begin{center}\begin{tabular}{c|c|c}
  $x$   & \only<2>{\color{white}}$\operatorname{count}(\text{Alice}\;x)$ & \only<-3>{\color{white}}$b(x|\text{Alice})$ \\\hline
  Alice & \only<3->{0} & \only<4->{0}         \\
  sees  & \only<3->{1} & \only<4->{\num{0.5}} \\
  Bob   & \only<3->{0} & \only<4->{0}         \\
  likes & \only<3->{1} & \only<4->{\num{0.5}} \\
  cake  & \only<3->{0} & \only<4->{0}         \\
  $\#$  & \only<3->{0} & \only<4->{0}
 \end{tabular}\end{center}
\end{frame}

\begin{frame}\frametitle{Hidden-Markov-Modell}
 \begin{columns}\column{0.47\linewidth}\centering
  \begin{tikzpicture}
   \tikzstyle{q}=[rectangle,draw,minimum size=4ex]
   \tikzstyle{l}=[auto,node font=\scriptsize,rectangle,inner sep=0.5mm]
   \node[q,double] (q0) at (+0,+0) {\strut \#};
   \node[q]        (q1) at (+2,-2) {\strut noun};
   \node[q]        (q2) at (-2,-2) {\strut verb};
   \draw[->] (q0) edge[bend left=10] (q1);
   \draw[->] (q0) edge[bend left=10] (q2);
   \draw[->] (q1) edge[bend left=10] node[l] { $t(\text{verb}|\text{noun})$ } (q2);
   \draw[->] (q1) edge[bend left=10] (q0);
   \draw[->] (q2) edge[bend left=10] (q0);
   \draw[->] (q2) edge[bend left=10] node[l] { $t(\text{noun}|\text{verb})$ } (q1);
   \draw[->] (q0) edge[loop right] (q0);
   \draw[->] (q1) edge[loop above] (q1);
   \draw[->] (q2) edge[loop above] (q2);

   \only<2->{
    \node[anchor=north,node font=\scriptsize] (e1) at (+1.5,-3.25) {
     \begin{tabular}{c|c}
      $x$ & $e(x|\text{noun})$ \\\hline
      Alice & \num{0.5} \\
      Bob & \num{0.25} \\
      cake & \num{0.25} \\
      likes & 0 \\
      sees & 0
     \end{tabular}
    };
    \draw[->,dashed] (q1.south) -- (e1.north);

    \node[anchor=north,node font=\scriptsize] (e2) at (-1.5,-3.25) {
     \begin{tabular}{c|c}
      $x$ & $e(x|\text{verb})$ \\\hline
      Alice & 0 \\
      Bob & 0 \\
      cake & 0 \\
      likes & \num{0.5} \\
      sees & \num{0.5}
     \end{tabular}
    };
    \draw[->,dashed] (q2.south) -- (e2.north);
   }

   \draw[draw=none] (3.0,-5.5) rectangle (-3.0,0.5);
  \end{tikzpicture}
 \column{0.53\linewidth}
  \begin{itemize}
   \item Idee: Wahrscheinlichkeit jedes Wortes hängt vom Verhalten eines Zustandsautomaten ab
   \pause\pause
   \item Modellparameter: $\omega=(t,e)$ %TODO: kriegt man den Umbruch weg
    \begin{itemize}
     \item Transitions-Wkt.~$t$
     \item Emissions-Wkt.~$e$
    \end{itemize}
  \end{itemize}
 \end{columns}
 \scriptsize\vspace{1em}\[
  p_\omega(x=x_1\cdots x_n) = \sum_{q_1,\ldots,q_n} t(q_1|\#) \cdot e(x_1|q_1) \cdot \Bigl[\prod_{i=2}^n t(q_i|q_{i-1}) \cdot e(x_i|q_i)\Bigr] \cdot t(\#|q_n)
 \]
\end{frame}

\begin{frame}\frametitle{Hidden-Markov-Modell: Training}
 \begin{itemize}
  \item Problem: Korpus enthält nur Sätze, keine Zustände
   \begin{itemize}
    \item optimales $\omega = (t,e)$ nicht direkt ablesbar
   \end{itemize}
  \pause\item Idee: alle möglichen Beiträge zum Ergebnis abschätzen
   \begin{itemize}
    \item \emph{Baum-Welch-Algorithmus}
   \end{itemize}
  \pause\item z.~B.~für Transitionswahrscheinlichkeit von $\#$ nach $q$
   \begin{center}\begin{tikzpicture}
    \tikzstyle{n}=[rectangle,draw,minimum width=2.7cm]
    \node (spacing) at (0,+1){~};
    \node[n] (e) at (0,+0) { \strut Expectation };
    \node[n] (m) at (0,-1.5) { \strut Maximization };
    \draw[->] ($(e.south)!0.5!(e.south west)$) -- ($(m.north)!0.5!(m.north west)$);
    \draw[->] ($(m.north)!0.5!(m.north east)$) -- ($(e.south)!0.5!(e.south east)$);
    \draw[<-] (e.west) -- +(-1.0,0);

    \tikzstyle{f}=[anchor=west,node font=\scriptsize]
    \node[f] at (e.east) { $\qquad\operatorname{count}(q,\#) \leftarrow \sum_x c(x) \cdot P(q_1=q|x)$ };
    \node[f] at (m.east) { $\qquad t(q|\#) \leftarrow \frac{\operatorname{count}(q,\#)}{\sum_{q'} \operatorname{count}(q',\#)} $ };
   \end{tikzpicture}\end{center}
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Warum ein allgemeiner EM-Algorithmus?}
 \begin{center}\begin{tikzpicture}
  \tikzstyle{n}=[rectangle,draw,minimum width=2.7cm]
  \node (spacing) at (0,+1){~};
  \node[n] (e) at (0,+0) { \strut Expectation };
  \node[n] (m) at (0,-1.5) { \strut Maximization };
  \draw[->] ($(e.south)!0.5!(e.south west)$) -- node [auto,swap,node font=\scriptsize] {$\operatorname{count}$} ($(m.north)!0.5!(m.north west)$);
  \draw[->] ($(m.north)!0.5!(m.north east)$) -- node [auto,swap] {$\omega'$} ($(e.south)!0.5!(e.south east)$);
  \draw[<-] (e.west) -- node [auto] {$\omega$} +(-1.0,0);
  \draw[draw=white] (e.east) -- +(1.0,0);
 \end{tikzpicture}\end{center}
 \begin{itemize}
  \item EM = iterative Anpassung des Modellparameters $\omega$
  \pause
  \item Ist der neue Modellparameter wirklich \emph{besser} als der alte?
   \[
    p_{\omega'}(c) \geq p_\omega(c)
   \]\vspace{-1.5em}\pause
  \item EM-Algorithmen nach {\color{blue!75!black}[BSV15]} haben bewiesene Konvergenz-Eigenschaften
 \end{itemize}
 \scriptsize {\color{blue!75!black}[BSV15]} {\color{black!70}Matthias Büchse, Torsten Stüber und Heiko
 Vogler. \emph{A generic inside-outside EM algorithm.} Unpublished Manuscript,
 2015.}
\end{frame}

\againframe<4>{gliederung}

% Symbols for the set of observations (X) and the set of hidden information
% (Y), each with a "not bottom" subscript; \! pulls the subscript closer to
% the letter.
\newcommand*{\nbX}{{X\!}_{\not\bot}}
\newcommand*{\nbY}{{Y\!}_{\not\bot}}
\begin{frame}\frametitle{Inside-Outside-EM-Algorithmus nach [BSV15]}
 \begin{overlayarea}{\linewidth}{2.5cm}\begin{itemize}
  \item $\nbX$: abzählbare Menge der \emph{Beobachtungen} (z.~B.~Sätze)
  \item $\nbY$: abzählbare Menge der \emph{versteckten Informationen} (z.~B.~Syntaxbäume)
  \item<2-> $\pi_1\colon \nbY\to \nbX$ ordnet jedem $y$ eine Beobachtung $x$ zu
 \end{itemize}\end{overlayarea}
 \begin{center}\begin{tikzpicture}
  \draw (0,0) ellipse[x radius=1.5,y radius=1] node {$\nbX$};
  \draw (4,0) ellipse[x radius=1.5,y radius=1] node {$\nbY$};
  \tikzstyle{p}=[circle,fill,inner sep=0.3mm];

  \only<2->{
   % Sample points: x1, x2 sit in the left ellipse, y1..y3 in the right one;
   % every y-point carries an arrow to the x-point it is mapped to.
   % x1 uses a plain 2-D coordinate like its siblings (it used to be written
   % as the 3-D coordinate (1,0,+0.0), which only landed here because z = 0).
   \node[p] (x1) at (1.0,+0.0) {};
   \node[p] (x2) at (0.3,-0.6) {};
   \node[p] (y1) at (3.0,+0.0) {} edge [->,bend left=10] (x1);
   \node[p] (y2) at (3.4,-0.4) {} edge [->,bend left=10] (x2);
   \node[p] (y3) at (4.5,-0.7) {} edge [->,bend left=20] (x1);
   \node at (2.0,-1.0) { $\pi_1$ };
  }

  \draw[draw=none] (-1.5,1) rectangle (5.5,-1.4); % establish common size
 \end{tikzpicture}\end{center}
\end{frame}

\begin{frame}\frametitle{Bsp. Kontextfreie Grammatiken}
 \begin{center}\begin{tikzpicture}
  \draw (0,0) ellipse[x radius=1.5,y radius=1] node {$\nbX$};
  \draw (4,0) ellipse[x radius=1.5,y radius=1] node {$\nbY$};
  \tikzstyle{p}=[circle,fill,inner sep=0.3mm];

  % Sample points: x1, x2 sit in the left ellipse, y1..y3 in the right one;
  % every y-point carries an arrow to the x-point it is mapped to.
  % x1 uses a plain 2-D coordinate like its siblings (it used to be written
  % as the 3-D coordinate (1,0,+0.0), which only landed here because z = 0).
  \node[p] (x1) at (1.0,+0.0) {};
  \node[p] (x2) at (0.3,-0.6) {};
  \node[p] (y1) at (3.0,+0.0) {} edge [->,bend left=10] (x1);
  \node[p] (y2) at (3.4,-0.4) {} edge [->,bend left=10] (x2);
  \node[p] (y3) at (4.5,-0.7) {} edge [->,bend left=20] (x1);
  \node at (2.0,-1.0) { $\pi_1$ };

  \begin{scope}[every node/.style={rectangle,draw}]
   \node (r00) at (+4.0,-3.0) { $\text{S} \to \text{NP}\;\text{VP}$ };
   \node (r11) at (+2.5,-4.0) { $\text{NP} \to \text{Alice}$ };
   \node (r12) at (+5.5,-4.0) { $\text{VP} \to \text{VBP}\;\text{NP}$ };
   \node (r21) at (+4.0,-5.0) { $\text{VBP} \to \text{sees}$ };
   \node (r22) at (+7.0,-5.0) { $\text{NP} \to \text{Bob}$ };
   \draw[->] ($(r00.south)!0.2!(r00.south west)$) -- (r11.north);
   \draw[->] ($(r00.south)!0.2!(r00.south east)$) -- (r12.north);
   \draw[->] ($(r12.south)!0.2!(r12.south west)$) -- (r21.north);
   \draw[->] ($(r12.south)!0.2!(r12.south east)$) -- (r22.north);
  \end{scope}

  \draw[dashed] (r00.north) ++(0,0.15) -- (y2);

  \node (x) at (-0.5,-3.0) { Alice sees Bob };
  \draw[dashed] (x.north) ++(0,0.15) -- (x2);
 \end{tikzpicture}\end{center}
\end{frame}

\begin{frame}\frametitle{Zählinformationen}
 Jedes $y\in \nbY$ ist ein Baum mit Rang, der mit Zählereignissen beschriftet ist.
 \begin{overlayarea}{\linewidth}{2cm}
  \only<2-4>{\begin{itemize}
   \item \strut $C\subseteq A\times B$: \emph{Zählereignisse} (hierbei $A$, $B$ abzählbare Mengen)
   \item \emph{Baum mit Rang:} Anzahl der Nachfolger eines Knotens ist eine Funktion der Beschriftung des Knotens
  \end{itemize}}
  \only<5->{\begin{itemize}
   \item $\Omega$: Menge von \emph{Modellparametern} $\omega$
   \item $q$: \emph{$\Omega$-Wahrscheinlichkeitsmodell} für $A$ und $B$ {\scriptsize(eingeschränkt auf $C$)}
   \item $q_\omega(a|b)$: Wahrscheinlichkeit des Zählereignisses $c = (a,b)$
  \end{itemize}}
 \end{overlayarea}
 \[\begin{tikzpicture}
  \node[draw=none] (setsize) at (+0,-2.5) {};
  \node[draw=none] (setsize) at (+9,-5.5) {};

  \only<3->{\begin{scope}[every node/.style={rectangle,draw}]
   \node (r00) at (+4.0,-3.0) { $\text{S} \to \text{NP}\;\text{VP}$ };
   \node (r11) at (+2.5,-4.0) { $\text{NP} \to \text{Alice}$ };
   \node (r12) at (+5.5,-4.0) { $\text{VP} \to \text{VBP}\;\text{NP}$ };
   \node (r21) at (+4.0,-5.0) { $\text{VBP} \to \text{sees}$ };
   \node (r22) at (+7.0,-5.0) { $\text{NP} \to \text{Bob}$ };
   \draw[->] ($(r00.south)!0.2!(r00.south west)$) -- (r11.north);
   \draw[->] ($(r00.south)!0.2!(r00.south east)$) -- (r12.north);
   \draw[->] ($(r12.south)!0.2!(r12.south west)$) -- (r21.north);
   \draw[->] ($(r12.south)!0.2!(r12.south east)$) -- (r22.north);
  \end{scope}}

  \only<4->{\begin{scope}[every node/.style={ellipse,draw=black}]
   \node[pin=left:{$b\in B$},minimum size=0.8cm] (lb) at (+3.1,-3.0) {};
   \node[pin=right:{$a\in A$},minimum height=0.8cm,minimum width=1.6cm] (la) at (+4.4,-3.0) {};
  \end{scope}}
 \end{tikzpicture}\vspace*{-2em}\]
 \pause\pause\pause\pause
 % Two-line subscript via amsmath's \substack; the plain-TeX \atop used here
 % before is reported by amsmath as a "foreign command".
 \[
  p_\omega(y) = \prod_{\substack{w\,\in\,\operatorname{pos}(y)\\(a,b)\,=\,y(w)}} q_\omega(a|b)
 \]
\end{frame}

\begin{frame}\frametitle{Inside-Outside-EM-Algorithmus (Forts.)}
 Bestandteile einer \emph{Inside-Outside-Information:}
 \begin{itemize}
  \item $X$, $Y$, $\pi_1$, $A$, $B$, $C$, $\Omega$, $q$: wie gesehen
  \pause
  \item $K$: reguläre Baumgrammatik (RTG), die alle $y\in \nbY$ erzeugt
  \item $H(x)$: RTG, die alle $y\in \nbY$ mit $\pi_1(y)=x$ erzeugt
  \item Eindeutigkeit: je RTG genau eine Ableitung für jedes $y$
  \pause
  \item \emph{reguläre Baumgrammatik:} erzeugt Bäume aus Regeln wie
   \[
    q_\text{VP} \to \left(
     \begin{tikzpicture}[baseline=(current bounding box.center)]
      \node[rectangle,draw] (r0) at (+0.0,+0.0) { $\text{VP} \to \text{VBP}\;\text{NP}$ };
      \node                 (r1) at (-0.8,-1.0) { $q_\text{VBP}$ };
      \node                 (r2) at (+0.8,-1.0) { $q_\text{NP}$ };
      \draw[->] ($(r0.south)!0.2!(r0.south west)$) -- (r1.north);
      \draw[->] ($(r0.south)!0.2!(r0.south east)$) -- (r2.north);
     \end{tikzpicture}
    \right)
   \]
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Bsp. Syntaxbaum einer CFG aus Baumgrammatik}
 \begin{center}\begin{tikzpicture}[every node/.style={rectangle,draw,minimum height=1.5em}]
  \draw[draw=none] (-4.1,-2.5) rectangle (4.1,0.5);

  \tikzstyle{gstate}=[color=blue,fill=blue!3]
  \tikzstyle{cevent}=[color=black]
  \node<1>  [gstate] (c00) at (+0.0,+0.0) { $q_\text{S}$ };
  \node<2-> [cevent] (c00) at (+0.0,+0.0) { $\text{S} \to \text{NP}\;\text{VP}$ };
  \node<2>  [gstate] (c11) at (-1.5,-1.0) { $q_\text{NP}$ };
  \node<3-> [cevent] (c11) at (-1.5,-1.0) { $\text{NP} \to \text{Alice}$ };
  \node<2-3>[gstate] (c12) at (+1.5,-1.0) { $q_\text{VP}$ };
  \node<4-> [cevent] (c12) at (+1.5,-1.0) { $\text{VP} \to \text{VBP}\;\text{NP}$ };
  \node<4>  [gstate] (c21) at (+0.0,-2.0) { $q_\text{VBP}$ };
  \node<5-> [cevent] (c21) at (+0.0,-2.0) { $\text{VBP} \to \text{sees}$ };
  \node<4-5>[gstate] (c22) at (+3.0,-2.0) { $q_\text{NP}$ };
  \node<6-> [cevent] (c22) at (+3.0,-2.0) { $\text{NP} \to \text{Bob}$ };
  \only<2->{
   \draw[->] ($(c00.south)!0.2!(c00.south west)$) -- (c11.north);
   \draw[->] ($(c00.south)!0.2!(c00.south east)$) -- (c12.north);
  }\only<4->{
   \draw[->] ($(c12.south)!0.2!(c12.south west)$) -- (c21.north);
   \draw[->] ($(c12.south)!0.2!(c12.south east)$) -- (c22.north);
  }
 \end{tikzpicture}\end{center}
 Beispiele für Regeln:
 \begin{align*}
  q_\text{S} &\to \left(
   \begin{tikzpicture}[baseline=(current bounding box.center)]
    \node[rectangle,draw] (r0) at (+0.0,+0.0) { $\text{S} \to \text{NP}\;\text{VP}$ };
    \node                 (r1) at (-0.8,-1.0) { $q_\text{NP}$ };
    \node                 (r2) at (+0.8,-1.0) { $q_\text{VP}$ };
    \draw[->] ($(r0.south)!0.2!(r0.south west)$) -- (r1.north);
    \draw[->] ($(r0.south)!0.2!(r0.south east)$) -- (r2.north);
   \end{tikzpicture}
  \right) &
  q_\text{NP} &\to
   \begin{tikzpicture}[baseline=(current bounding box.center)]
    \node[rectangle,draw] (r0) at (+0.0,+0.0) { $\text{NP} \to \text{Alice}$ };
    \draw[draw=none] (r0.south) -- +(0,-0.2);
   \end{tikzpicture}
 \end{align*}
\end{frame}

\begin{frame}\frametitle{EM-Algorithmus für Inside-Outside-Informationen}
 \begin{center}\begin{tikzpicture}
  \tikzstyle{n}=[rectangle,draw,minimum width=2.7cm]
  \node[n] (e) at (0,+0.0) { \strut Expectation };
  \node[n] (m) at (0,-1.5) { \strut Maximization };
  \draw[->] ($(e.south)!0.5!(e.south west)$) -- ($(m.north)!0.5!(m.north west)$);
  \draw[->] ($(m.north)!0.5!(m.north east)$) -- ($(e.south)!0.5!(e.south east)$);
  \draw[<-] (e.west) -- +(-1.0,0);

  \node (spacing) at (6.3,0) {};
  \tikzstyle{f}=[anchor=west,node font=\scriptsize]
  \only<1>{
   \node[f] at (e.east) { $\qquad\operatorname{count}(q,\#) \leftarrow \sum_x c(x) \cdot P(q_1=q|x)$ };
   \node[f] at (m.east) { $\qquad t(q|\#) \leftarrow \frac{\operatorname{count}(q,\#)}{\sum_{q'} \operatorname{count}(q',\#)} $ };
  }
  \only<2->{
   \node[f] at (e.east) { $\qquad\operatorname{count}(a,b) \leftarrow \sum_x c(x) \cdot \chi_{\omega,x}(a,b)$ };
   \node[f] at (m.east) { $\qquad q_{\omega'}(a|b) \leftarrow \frac{\operatorname{count}(a,b)}{\sum_{a'} \operatorname{count}(a',b)} $ };
  }
 \end{tikzpicture}\end{center}
 \pause
 \begin{itemize}
  \item $\chi_{\omega,x}(a,b)$ nutzt bekanntes $q_\omega$, um Häufigkeit von $(a,b)$ in der Beobachtung $x$ abzuschätzen
  \item dann $\omega'$ so wählen, dass $q_{\omega'}$ der empirischen Wahrscheinlichkeitsverteilung von $\operatorname{count}(a,b)$ entspricht
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Zur Berechnung von $\chi_{\omega,x}(a,b)$}
 \begin{center}\begin{tikzpicture}
  \node[anchor=south,rectangle] at (0,0) { $q_0$ };
  \only<1-5>{
   \draw (0,0) -- (5,-6) -- (-5,-6) -- cycle;
   \node[coordinate,pin=60:{\scriptsize Ableitungsbaum für $H(x)$}] at ($(0,0)!0.2!(5,-6)$) {};
  }
  \only<6>{
   \draw[thick,blue] (0,0) -- (5,-6) -- (-5,-6) -- cycle;
   \node[coordinate,pin=60:{\color{blue}$\beta_x(q_0)$}] at ($(0,0)!0.2!(5,-6)$) {};
  }
  \node[rectangle,draw] (rule) at (0,-4) { $q\to (a,b)(q_1,q_2,q_3)$ };
  \only<3->{
   \node (q0) at (0,-3) { $q$ };
   \node (q1) at (-1.0,-5) { $q_1$ };
   \node (q2) at (+0.0,-5) { $q_2$ };
   \node (q3) at (+1.0,-5) { $q_3$ };
   \draw[->] (q0.south) -- (rule.north);
   \draw[->] ($(rule.south)+(-1,0)$) -- (q1.north);
   \draw[->] ($(rule.south)+(+0,0)$) -- (q2.north);
   \draw[->] ($(rule.south)+(+1,0)$) -- (q3.north);
  }
  \only<2->{
   \begin{scope}[on background layer]
    \fill[fill=green!20] (rule.north west) rectangle (rule.south east);
   \end{scope}
   \node[anchor=south west,node font=\scriptsize,color=green!60!black] at (rule.north) { $q_\omega(a|b)$ };
   }
  \only<4->{
    \begin{scope}[on background layer]
    \fill[fill=blue!20] ($(rule.south)+(-1,0)$) -- (-0.5,-6) -- (-2.0,-6) -- cycle;
    \fill[fill=blue!10] ($(rule.south)+(+0,0)$) -- (-0.5,-6) -- (+0.5,-6) -- cycle;
    \fill[fill=blue!20] ($(rule.south)+(+1,0)$) -- (+0.5,-6) -- (+2.0,-6) -- cycle;
   \end{scope}
   \node[anchor=south,node font=\scriptsize,color=blue] at (-1.25,-6.0) { $\beta_x(q_1)$ };
   \node[anchor=south,node font=\scriptsize,color=blue] at (+0.00,-6.0) { $\beta_x(q_2)$ };
   \node[anchor=south,node font=\scriptsize,color=blue] at (+1.25,-6.0) { $\beta_x(q_3)$ };
  }
  \only<5->{
   \begin{scope}[on background layer]
    \fill[fill=red!20] (0,0) -- (5,-6) -- (2,-6) -- (rule.north east) -- (q0.south east) -- (q0.south west) -- (rule.north west) -- (-2,-6) -- (-5,-6) -- cycle;
   \end{scope}
   \node[color=red] at (0,-1.5) { $\alpha_x(q)$ };
  }
 \end{tikzpicture}\end{center}
 \scriptsize Grafik nach: H. Vogler. \emph{Maschinelles Übersetzen natürlicher Sprachen.}\\Vorlesung, TU Dresden, Wintersemester 2015/2016.
\end{frame}

\begin{frame}\frametitle{EM-Algorithmus für Inside-Outside-Informationen}
 \begin{center}\begin{tikzpicture}
  \tikzstyle{n}=[rectangle,draw,minimum width=2.7cm]
  \node[n] (e) at (0,+0.0) { \strut Expectation };
  \node[n] (m) at (0,-1.5) { \strut Maximization };
  \draw[->] ($(e.south)!0.5!(e.south west)$) -- ($(m.north)!0.5!(m.north west)$);
  \draw[->] ($(m.north)!0.5!(m.north east)$) -- ($(e.south)!0.5!(e.south east)$);
  \draw[<-] (e.west) -- +(-1.0,0);

  \node (spacing) at (6.3,0) {};
  \tikzstyle{f}=[anchor=west,node font=\scriptsize]
  \node[f] at (e.east) { $\qquad\operatorname{count}(a,b) \leftarrow \sum_x c(x) \cdot \chi_{\omega,x}(a,b)$ };
  \node[f] at (m.east) { $\qquad q_{\omega'}(a|b) \leftarrow \frac{\operatorname{count}(a,b)}{\sum_{a'} \operatorname{count}(a',b)} $ };
 \end{tikzpicture}\end{center}\vspace{1em}
 \[
  \chi_{\omega,x}(a,b) = \frac{
   \sum_{q \to (a,b)(q_1,\ldots,q_k)} \alpha_x(q) \cdot q_\omega(a|b) \cdot \beta_x(q_1) \cdots \beta_x(q_k)
  }{\beta_x(q_0)}
 \]
 \pause
 \begin{center}
  \textbf{Ist der Baum-Welch-Algorithmus eine Instanz hiervon?}
 \end{center}
\end{frame}

\begin{frame}\frametitle{Versteckte Information für Hidden-Markov-Modell}
 \begin{center}\begin{tikzpicture}
  \draw (0,0) ellipse[x radius=1.5,y radius=1] node {$\nbX$};
  \draw (4,0) ellipse[x radius=1.5,y radius=1] node {$\nbY$};
  \tikzstyle{p}=[circle,fill,inner sep=0.3mm];

  % Sample points: x1, x2 sit in the left ellipse, y1..y3 in the right one;
  % every y-point carries an arrow to the x-point it is mapped to.
  % x1 uses a plain 2-D coordinate like its siblings (it used to be written
  % as the 3-D coordinate (1,0,+0.0), which only landed here because z = 0).
  \node[p] (x1) at (1.0,+0.0) {};
  \node[p] (x2) at (0.3,-0.6) {};
  \node[p] (y1) at (3.0,+0.0) {} edge [->,bend left=10] (x1);
  \node[p] (y2) at (3.4,-0.4) {} edge [->,bend left=10] (x2);
  \node[p] (y3) at (4.5,-0.7) {} edge [->,bend left=20] (x1);
  \node at (2.0,-1.0) { $\pi_1$ };

  \node (x) at (-0.5,-3.0) { Alice sees Bob };
  \draw[dashed] (x.north) ++(0,0.15) -- (x2);

  \only<2->{
   \begin{scope}[every node/.style={rectangle,draw,node font=\scriptsize,minimum width=2.5cm}]
    \node (r00) at (+5.0,-3.0) { $\bigl(\text{noun},(\#,T)\bigr)$ };
    \node (r10) at (+5.0,-4.0) { $\bigl(\text{verb},(\text{noun},T)\bigr)$ };
    \node (r11) at (+2.0,-4.0) { $\bigl(\text{Alice},(\text{noun},E)\bigr)$ };
    \node (r20) at (+5.0,-5.0) { $\bigl(\text{noun},(\text{verb},T)\bigr)$ };
    \node (r21) at (+2.0,-5.0) { $\bigl(\text{sees},(\text{verb},E)\bigr)$ };
    \node (r30) at (+5.0,-6.0) { $\bigl(\#,(\text{noun},T)\bigr)$ };
    \node (r31) at (+2.0,-6.0) { $\bigl(\text{Bob},(\text{noun},E)\bigr)$ };
    \draw[->] (r00.south) -- (r10.north);
    \draw[->] (r10.south) -- (r20.north);
    \draw[->] (r20.south) -- (r30.north);
    \draw[->] (r00.south west) -- (r11.north east);
    \draw[->] (r10.south west) -- (r21.north east);
    \draw[->] (r20.south west) -- (r31.north east);
   \end{scope}

   \draw[dashed] (r00.north) ++(0,0.15) -- (y2);

   \begin{scope}[note/.style={color=white},every node/.style={node font=\tiny,note}]
    \only<3>{\tikzset{note/.style={color=black}}}
    \node[anchor=south] at (r00.north east) { $t(\text{noun}|\#)$ };
    \node[anchor=south] at (r10.north east) { $t(\text{verb}|\text{noun})$ };
    \node[anchor=south] at (r20.north east) { $t(\text{noun}|\text{verb})$ };
    \node[anchor=south] at (r30.north east) { $t(\#|\text{noun})$ };
    \node[anchor=south east] at (r11.north east) { $e(\text{Alice}|\text{noun})$ };
    \node[anchor=south east] at (r21.north east) { $e(\text{sees}|\text{verb})$ };
    \node[anchor=south east] at (r31.north east) { $e(\text{Bob}|\text{noun})$ };
   \end{scope}

  }

  \begin{scope}[note/.style={color=white}]
   \only<2-3>{\tikzset{note/.style={color=gray}}}
   \node[anchor=south west,align=left,note,node font=\scriptsize] at (-3.5,-6.3) {
    Knotenbeschriftung: $c = (a,b)$
    \only<3>{\\ Annotation: $q_\omega(a|b)$}
   };
  \end{scope}

  \node[rectangle] (spacing) at (6.85,-6.25) {};
 \end{tikzpicture}\end{center}
\end{frame}

% \gstate{<state>}{<kind>}: grammar state, printed in blue as (<kind>,<state>).
% Note that the two arguments appear in swapped order in the output.
\newcommand*{\gstate}[2]{{\color{blue}(#2,#1)}}
% \cevent{<a>}{<state>}{<kind>}: counting event, printed in dark red as
% (<a>,(<state>,<kind>)) with \big outer parentheses.
\newcommand*{\cevent}[3]{{\color{red!90!black}\bigl(#1,(#2,#3)\bigr)}}
\begin{frame}\frametitle{Beispiel: Ableitung aus der Baumgrammatik $K$}
 \begin{center}\begin{tikzpicture}[every node/.style={rectangle,draw,node font=\scriptsize,minimum width=2.5cm,minimum height=2.0em}]
  \tikzstyle{gstate}=[color=blue,fill=blue!3]
  \tikzstyle{cevent}=[color=red!90!black,fill=red!3]
  \node<1>  [gstate] (r01) at (+0.0,+0) { $(T,\#)$ };
  \node<2-> [cevent] (r01) at (+0.0,+0) { $\bigl(\text{noun},(\#,T)\bigr)$ };
  \node<2>  [gstate] (r11) at (-3.0,-1) { $(E,\text{noun})$ };
  \node<3-> [cevent] (r11) at (-3.0,-1) { $\bigl(\text{Alice},(\text{noun},E)\bigr)$ };
  \node<2-3>[gstate] (r12) at (+0.0,-1) { $(T,\text{noun})$ };
  \node<4-> [cevent] (r12) at (+0.0,-1) { $\bigl(\text{verb},(\text{noun},T)\bigr)$ };
  \node<4>  [gstate] (r21) at (-3.0,-2) { $(E,\text{verb})$ };
  \node<5-> [cevent] (r21) at (-3.0,-2) { $\bigl(\text{sees},(\text{verb},E)\bigr)$ };
  \node<4-5>[gstate] (r22) at (+0.0,-2) { $(T,\text{verb})$ };
  \node<6-> [cevent] (r22) at (+0.0,-2) { $\bigl(\#,(\text{verb},T)\bigr)$ };
  \draw<2->[->] (r01.south) -- (r12.north);
  \draw<2->[->] (r01.south west) -- (r11.north east);
  \draw<4->[->] (r12.south) -- (r22.north);
  \draw<4->[->] (r12.south west) -- (r21.north east);
  \draw[draw=none] (-4.3,0.3) rectangle (1.3,-2.3); % set stable size
 \end{tikzpicture}\end{center}
 \begin{itemize}
  \item Zustände: $\gstate qT$ für $q\in Q\cup\{\#\}$ und $\gstate qE$ für $q\in Q$
  \item Startzustand: $\gstate\#T$; Regeln:
   \begin{align*}
    \gstate qT &\to \cevent{q'}qT\bigl(\gstate{q'}E,\gstate{q'}T\bigr)
    &&\text{für } q\in Q\cup\{\#\}, q'\in Q \\
    \gstate qT &\to \cevent\#qT
    &&\text{für } q\in Q\cup\{\#\} \\
    \gstate qE &\to \cevent vqE
    &&\text{für } q\in Q, v\in V
   \end{align*}
  \item $H(x)$: Zustände erweitert um noch nicht generierte Wörter
 \end{itemize}
\end{frame}

\begin{frame}\frametitle{Zur Instanziierung des IO-EM-Algorithmus}
 \vspace*{-1em}
 \begin{center}\begin{tikzpicture}[every node/.style={node font=\scriptsize}]
  \only<1-2>{\colorlet{highlightcolor}{black}}
  \only<3->{\colorlet{highlightcolor}{blue}}
  \tikzstyle{formula}=[anchor=west,minimum size=0.6cm]
  \node[formula] (x)  at (-2.5, +0.0) { $x = \text{Alice sees Bob}$ };
  \node[formula] (q1) at (-2.5, +2.2) { $q = \text{{\color{highlightcolor}noun verb} noun}$ };
  \node[formula] (q2) at (-2.5, -2.2) { $q = \text{noun {\color{highlightcolor}noun verb}}$ };
  \draw[<-] (x.north) -- ($(q1.south east)!(x.north)!(q1.south west)$);
  \draw[<-] (x.south) -- node[auto]{u.~s.~w.} ($(q2.north east)!(x.south)!(q2.north west)$);

  \only<2->{
   \only<2-3>{\tikzstyle{highlight}=[color=black]}
   \only<4->{\tikzstyle{highlight}=[color=blue,fill=blue!3]}
   \draw[decorate,decoration={brace,amplitude=5pt}] (+0.6,-4.0) -- (+0.6,-0.4);
   \draw[decorate,decoration={brace,amplitude=5pt}] (+0.6,+0.4) -- (+0.6,+4.0);
   \begin{scope}[every node/.style={rectangle,draw,node font=\scriptsize,minimum width=2.5cm}]
    \node (r00) at (+5.0,-0.7) { $\bigl(\text{noun},(\#,T)\bigr)$ };
    \node (r10) at (+5.0,-1.7) { $\bigl(\text{noun},(\text{noun},T)\bigr)$ };
    \node (r11) at (+2.0,-1.7) { $\bigl(\text{Alice},(\text{noun},E)\bigr)$ };
    \node[highlight]
          (r20) at (+5.0,-2.7) { $\bigl(\text{verb},(\text{noun},T)\bigr)$ };
    \node (r21) at (+2.0,-2.7) { $\bigl(\text{sees},(\text{noun},E)\bigr)$ };
    \node (r30) at (+5.0,-3.7) { $\bigl(\#,(\text{verb},T)\bigr)$ };
    \node (r31) at (+2.0,-3.7) { $\bigl(\text{Bob},(\text{verb},E)\bigr)$ };
    \draw[->] (r00.south) -- (r10.north);
    \draw[->] (r10.south) -- (r20.north);
    \draw[->] (r20.south) -- (r30.north);
    \draw[->] (r00.south west) -- (r11.north east);
    \draw[->] (r10.south west) -- (r21.north east);
    \draw[->] (r20.south west) -- (r31.north east);

    \node (s00) at (+5.0,+3.7) { $\bigl(\text{noun},(\#,T)\bigr)$ };
    \node[highlight]
          (s10) at (+5.0,+2.7) { $\bigl(\text{verb},(\text{noun},T)\bigr)$ };
    \node (s11) at (+2.0,+2.7) { $\bigl(\text{Alice},(\text{noun},E)\bigr)$ };
    \node (s20) at (+5.0,+1.7) { $\bigl(\text{noun},(\text{verb},T)\bigr)$ };
    \node (s21) at (+2.0,+1.7) { $\bigl(\text{sees},(\text{verb},E)\bigr)$ };
    \node (s30) at (+5.0,+0.7) { $\bigl(\#,(\text{noun},T)\bigr)$ };
    \node (s31) at (+2.0,+0.7) { $\bigl(\text{Bob},(\text{noun},E)\bigr)$ };
    \draw[->] (s00.south) -- (s10.north);
    \draw[->] (s10.south) -- (s20.north);
    \draw[->] (s20.south) -- (s30.north);
    \draw[->] (s00.south west) -- (s11.north east);
    \draw[->] (s10.south west) -- (s21.north east);
    \draw[->] (s20.south west) -- (s31.north east);
   \end{scope}
  }

  \only<4->{
   \node[color=blue,minimum height=0.6cm] (chi) at (+7.0,+0.0) {
    \only<4>{$\chi_{\omega,x}\bigl(\text{verb},(\text{noun},T)\bigr)$}
    \only<5>{$\operatorname{count}\bigl(\text{verb},(\text{noun},T)\bigr)$}
    \only<6>{$q_{\omega'}\bigl(\text{verb}\!\bigm|\!(\text{noun},T)\bigr)$}
    \only<7>{$t'(\text{verb}\,|\,\text{noun})$}
   };
   \draw[color=blue,->,dashed] (r20.east) ++(0.2,0) -| (chi.south);
   \draw[color=blue,->,dashed] (s10.east) ++(0.2,0) -| (chi.north);
  }

  \draw[draw=none] (0,+4.1) rectangle (8.5,-4.1);
 \end{tikzpicture}\end{center}
\end{frame}

\againframe<3>{gliederung}

\end{document}\endinput

Gliederung/Zeitbudget:

- (3 Min) Idee Sprachmodell: P(x), vllt Bigramm-Modell zur Veranschaulichung
- (5 Min) Hidden Markov Model: H = (Q,V,#,t,e) -> P(x)
- (5 Min) Training: Idee Baum-Welch-Algorithmus bzw. EM allgemein -> gute Konvergenz, verallgemeinerbar?
- (10 Min) EM-Algorithmus aus [BSV15]; nur IO-Info, wrsl. eher phänomenologisch (am Beispiel PCFG) als exakt
- (7 Min) Anwendung auf HMM; Bäume aus [[K]] bzw. [[H(x)]]