\documentclass{article}
\input{preamble}
\usepackage{listings}
\usepackage{geometry}
\usepackage{algorithm2e}
\usepackage{bbm}
\usepackage{enumitem}
\newenvironment{QandA}{\begin{enumerate}[label=\arabic*.]}{\end{enumerate}}
\newenvironment{InnerQandA}{\begin{enumerate}[label=\roman*.]}{\end{enumerate}}
\newenvironment{ListAlph}{\begin{enumerate}[label=(\alph*)]}{\end{enumerate}}
\newenvironment{answer}{\par\normalfont \textbf{Answer:}}{}
\newenvironment{theorem}{\par\normalfont \textbf{Theorem:}}{}
\newenvironment{proof}{\par\normalfont \textbf{Proof:}}{}
% My commands
\newcommand{\R}{\mathbb{R}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\Exp}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\Vari}[1]{\text{Var}\left[ #1 \right]}
\newcommand{\Cov}[1]{\text{Cov}\left[ #1 \right]}
\newcommand{\KL}[2]{\text{KL}\left[#1 \Vert #2 \right]}
\newcommand{\CE}[2]{\text{H}\left[#1, #2 \right]}
\newcommand{\Ent}[1]{\text{H}\left[#1 \right]}
\newcommand{\Jac}{\text{Jac}}
\newcommand{\Indicator}[1]{\mathbbm{1}_{\left\{ #1 \right\}}}
\newcommand{\g}{\vert}
\newcommand{\notsure}{\textcolor{red}{[not sure]}}
\newcommand{\todo}{\textcolor{red}{[TODO]}}
\title{\Huge Answers to Chip Huyen's \\ Machine Learning Interview Questions}
\author{\Large Zafir Stojanovski}
\date{}
\usepackage{titling}
\renewcommand\maketitlehooka{\null\mbox{}\vfill}
\renewcommand\maketitlehookd{\vfill\null}
\begin{document}
\maketitle
\thispagestyle{empty} % Remove page number on the title page
\vfill % Center title and author vertically
\begin{center}
\today
\end{center}
\newpage
\tableofcontents
\setcounter{section}{4}
\newpage
\noindent This document contains my solutions to \href{https://huyenchip.com/ml-interviews-book/}{``Introduction to Machine Learning Interviews''} by Chip Huyen. The section and question numbering has been structured to align with the original book by Chip Huyen in order to maintain consistency.
\\
\\
The \LaTeX \hspace{0.07em} source files for this booklet are open source, and available at: \\ \href{https://github.com/starzmustdie/ml-interview-questions-and-answers}{github.com/starzmustdie/ml-interview-questions-and-answers}.
\\
\\
You can find the compiled PDF document in \href{https://github.com/starzmustdie/ml-interview-questions-and-answers/blob/main/ML_interview_questions_and_answers.pdf}{the GitHub repository} or on \href{https://drive.google.com/file/d/1P4w12EvvFG19f4uVsvai6fC93p7kRphE/view}{Google Drive}.
\\
\\
\textit{Please note}: not all questions have answers, and there's a possibility that my responses may not be entirely accurate. Therefore, it's advisable to consider the information provided cautiously.
\\
\\
For any questions and contributions, you can reach me at:
\begin{itemize}
\item email: \href{mailto:[email protected]}{[email protected]}
\item twitter: \href{https://twitter.com/starzmustdie}{twitter.com/starzmustdie}
\item linkedin: \href{https://www.linkedin.com/in/zafir-stojanovski/}{linkedin.com/in/zafir-stojanovski}
\end{itemize}
\vspace{2em}
\noindent \textbf{If you found this guide helpful, please consider supporting me by \href{https://www.buymeacoffee.com/starzmustdie}{buying me a coffee}}. :)
\newpage
\begin{figure}[t!]
\centering
\includegraphics[width=0.9\linewidth]{img/turing_2.jpg}
\caption{Prompt: ``Alan Turing, preparing for his upcoming Machine Learning Interview, cinematic, analog film''. Source: Clipdrop by stability.ai.}
\label{fig:enter-label}
\end{figure}
\newpage
\clearpage % or \newpage
\section{Math}
\subsection{Algebra and (little) calculus}
\subsubsection{Vectors}
\begin{QandA}
\item Dot product
\begin{InnerQandA}
\item What’s the geometric interpretation of the dot product of two vectors?
\begin{answer}
Multiplication of the length of one vector and the length of the projection of the other vector onto the first one.
\end{answer}
\item Given a vector $u$, find vector $v$ of unit length such that the dot product of $u$ and $v$ is maximum.
\begin{answer}
$$v = \frac{u}{\Vert u \Vert}$$
\end{answer}
\end{InnerQandA}
\item Outer product
\begin{InnerQandA}
\item Given two vectors $a=[3,2,1]$ and $b=[-1,0,1]$. Calculate the outer product $a^T b$?
\begin{answer}
\begin{align*}
a^T b &= \begin{bmatrix}
-3 & 0 & 3\\
-2 & 0 & 2 \\
-1 & 0 & 1
\end{bmatrix}
\end{align*}
\end{answer}
\item Give an example of how the outer product can be useful in ML.
\begin{answer}
The Covariance matrix is a commonly used quantity in Machine Learning algorithms (eg. PCA). Given a dataset $X \in \R^{n \times d}$ with $n$ samples and $d$ features, we calculate the (empirical) covariance as follows:
\begin{align*}
\Cov{X} = \frac{1}{n} \sum_{i=1}^n (x_i - \bar{x}) (x_i - \bar{x})^T
\end{align*}
where $\bar{x}$ is the mean feature vector: $\bar{x} = \frac{1}{n} \sum_{i=1}^n x_i$.
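As a quick numerical sanity check (a NumPy sketch of my own, not from the book), summing outer products of the centered samples reproduces the covariance computed by \texttt{np.cov}:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))    # n=100 samples, d=3 features
x_bar = X.mean(axis=0)           # mean feature vector

# covariance as an average of outer products (x_i - x_bar)(x_i - x_bar)^T
C = sum(np.outer(x - x_bar, x - x_bar) for x in X) / len(X)

assert np.allclose(C, np.cov(X, rowvar=False, bias=True))
\end{lstlisting}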
\end{answer}
\end{InnerQandA}
\item What does it mean for two vectors to be linearly independent?
\begin{answer}
Two vectors $a, b \in \R^n$ are linearly independent iff $a \neq c \cdot b$ for all $c \in \R$, i.e. neither vector is a scalar multiple of the other.
\end{answer}
\item Given two sets of vectors $A = \{ a_1, a_2, \ldots, a_n\}$ and $B = \{b_1, b_2, \ldots, b_m\}$ how do you check if they share the same basis?
\begin{answer}
We should check if every vector in $B$ can be written as a linear combination of vectors in $A$. More precisely, for all $b_i \in B$ it should hold that:
\begin{align*}
b_i = \gamma_1^i a_1 + \gamma_2^i a_2 + \ldots + \gamma_n^i a_n
\end{align*}
where $\gamma_1^i, \gamma_2^i, \ldots, \gamma_n^i$ are some scalars.
\end{answer}
\item Given $n$ vectors, each of $d$ dimensions. What is the dimension of their span?
\begin{answer}
The dimension of their span is the number of linearly independent vectors.
\end{answer}
\item Norms and metrics
\begin{InnerQandA}
\item What's a norm? What is $L_0$, $L_1$, $L_2$, \ldots, $L_p$ norm?
\begin{answer}
A norm is a function $\Vert \cdot \Vert: X \rightarrow \R$, where $X$ is a vector space, that satisfies the following properties:
\begin{enumerate}[label={N.\arabic*.}]
\item Triangle inequality: $\Vert x + y \Vert \le \Vert x \Vert + \Vert y \Vert$
\item Absolute homogeneity: $\Vert cx \Vert = \vert c \vert \Vert x \Vert $, where $c \in \R$
\item $x = 0 \implies \Vert x \Vert = 0 $
\item $\Vert x \Vert = 0 \implies x = 0$
\end{enumerate}
$\Vert \cdot \Vert$ is a semi-norm if only N.1 - N.3 are satisfied. In general, we define the $p$-norm as follows:
\begin{align*}
\Vert x \Vert_p := \left( \sum_{i=1}^d \vert x_i \vert^p \right)^{\frac{1}{p}}
\end{align*}
Finally, we define the two special norms:
\begin{align*}
\Vert x \Vert_\infty := \max_{i} \vert x_i \vert \\
\Vert x \Vert_0 := \sum_{i=1}^d \mathbbm{1}_{\{ x_i \neq 0 \}}
\end{align*}
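To make the definitions concrete, here is a small NumPy illustration (my own sketch) of the common norms:
\begin{lstlisting}[language=Python]
import numpy as np

x = np.array([3.0, -4.0, 0.0])

print(np.linalg.norm(x, 1))       # L1 norm: |3| + |-4| + |0| = 7
print(np.linalg.norm(x, 2))       # L2 norm: sqrt(9 + 16 + 0) = 5
print(np.linalg.norm(x, np.inf))  # L-infinity norm: max_i |x_i| = 4
print(np.linalg.norm(x, 0))       # "L0 norm": number of non-zero entries = 2
\end{lstlisting}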
\end{answer}
\item How do norm and metric differ? Given a norm, make a metric. Given a metric, can we make a norm?
\begin{answer}
A metric on a set $\mathbb{X}$ is a function $d: \mathbb{X} \times \mathbb{X} \rightarrow \R$ that satisfies:
\begin{enumerate}[label={M.\arabic*.}]
\item Non-negativity: $d(x, y) > 0 \text{ if } x \neq y \text{ else } d(x,y) = 0$
\item Triangle inequality: $d(x, y) \le d(x, z) + d(z, y)$
\item Symmetry: $d(x, y) = d(y, x)$
\end{enumerate}
A norm can always induce a metric:
\begin{align*}
d(x, y) := \Vert x - y \Vert
\end{align*}
The other direction does not work in general.
\end{answer}
\end{InnerQandA}
\end{QandA}
\subsubsection{Matrices}
\begin{QandA}
\item Why do we say that matrices are linear transformations?
\begin{answer}
They are called linear transformations because they preserve vector addition and scalar multiplication:
\begin{align*}
A(x+y) = Ax + Ay \\
A(c \cdot x) = c \cdot (Ax)
\end{align*}
\end{answer}
\item What’s the inverse of a matrix? Do all matrices have an inverse? Is the inverse of a matrix always unique?
\begin{answer}
A matrix $A \in \R^{n \times n}$ is called \textit{invertible} if there exists a matrix $B \in \R^{n \times n}$ s.t. $AB = BA = I$. The inverse of a matrix is unique. A square matrix that does not have an inverse is called \textit{singular}. A square matrix is singular iff its determinant is 0. Furthermore, non-square matrices do not have an inverse.
\end{answer}
\item What does the determinant of a matrix represent?
\begin{answer}
The \textit{determinant} of a matrix is a function $\det(\cdot): \R^{n \times n} \rightarrow \R$ which outputs the change in (signed) "volume" represented by the transformation of the standard basis to the columns of $A$.
\end{answer}
\item What happens to the determinant of a matrix if we multiply one of its rows by a scalar?
\begin{answer}
The determinant is \textit{multilinear}: If the $j$-th column/row of a matrix $A$ is written as a linear combination $a_j = r \cdot v + w$ of two column vectors $v$ and $w$, and a scalar $r$, then the determinant of $A$ can be expressed as follows:
\begin{align*}
\vert A \vert &= \vert a_1, \ldots, a_{j-1}, a_j, a_{j+1}, \ldots, a_n \vert \\
&= \vert a_1, \ldots, a_{j-1}, r \cdot v + w, a_{j+1}, \ldots, a_n \vert \\
&= r \cdot \vert a_1, \ldots, v, \ldots, a_n \vert + \vert a_1, \ldots, w, \ldots, a_n \vert
\end{align*}
\end{answer}
\item A $4 \times 4$ matrix has four eigenvalues $3,3,2,-1$. What can we say about the trace and the determinant of this matrix?
\begin{answer}
The trace is the sum of the eigenvalues: $3+3+2-1 = 7$. \\
The determinant is the product of the eigenvalues: $3 \cdot 3 \cdot 2 \cdot (-1) = -18$.
\end{answer}
\item Given the following matrix:
\begin{align*}
\begin{bmatrix}
1 & 4 & -2 \\
-1 & 3 & 2 \\
3 & 5 & -6
\end{bmatrix}
\end{align*}
Without explicitly using the equation for calculating determinants, what can we say about this matrix’s determinant? (Hint: rely on a property of this matrix to determine its determinant)
\begin{answer}
Notice that the third row is a linear combination of the first two: $2 \cdot (1, 4, -2) - (-1, 3, 2) = (3, 5, -6)$. The rows are therefore linearly dependent, so the matrix is singular and its determinant is $0$.
\end{answer}
\item What’s the difference between the Covariance matrix $A^TA$ and the Gram matrix $AA^T$ ?
\begin{answer}
Suppose $A \in \R^{n \times d}$, corresponding to $n$ samples with each having $d$ features. Then, the Covariance matrix $A^TA \in \R^{d \times d}$ captures the "distance" between features, whereas the Gram matrix $AA^T \in \R^{n \times n}$ captures the "distance" between samples.
\end{answer}
\item Given $A \in \R^{n \times m}$ and $b \in \R^n$:
\begin{InnerQandA}
\item Find $x$ such that $Ax = b$.
\begin{answer}
If $A$ is square and has full rank, then: $x = A^{-1}b$.
\end{answer}
\item When does this have a unique solution?
\begin{answer}
In a set of linear simultaneous equations, a unique solution exists if and only if:
\begin{itemize}
\item the number of unknowns and the number of equations are equal
\item all equations are consistent
\item there is no linear dependence between any two or more equations, that is, all equations are independent.
\end{itemize}
\end{answer}
\item Why is it when $A$ has more columns than rows, $Ax=b$ has multiple solutions?
\begin{answer}
Each unknown can be seen as an available degree of freedom. Each equation introduced into the system can be viewed as a constraint that restricts one degree of freedom. \\
Therefore, the critical case occurs when the number of equations and the number of free variables are equal -- for every variable giving a degree of freedom, there exists a corresponding constraint removing a degree of freedom. \\
The underdetermined case, by contrast, occurs when the system has been underconstrained -- that is, when the unknowns outnumber the equations.
\end{answer}
\item Given a matrix $A$ with no inverse. How would you solve the equation $Ax=b$ ? What is the pseudoinverse and how to calculate it?
\begin{answer}
For an arbitrary matrix $A \in \R^{m \times n}$, its pseudoinverse $A^{\#} \in \R^{n \times m}$ is a matrix that satisfies the following criteria:
\begin{itemize}
\item $A A^{\#} A = A$
\item $A^{\#} A A^{\#} = A^{\#}$
\item $(A A^{\#})^T = A A^{\#}$
\item $(A^{\#} A)^T = A^{\#} A$
\end{itemize}
Back to our system of linear equations, regardless of whether $A$ is square and regardless of the rank of $A$, all solutions (if any exist) can be obtained using the pseudoinverse:
\begin{align*}
x = A^{\#} b + (I - A^{\#}A)w
\end{align*}
where $w$ is a vector of free parameters that ranges over all possible $n \times 1$ vectors. A necessary and sufficient condition for any solution(s) to exist is that the potential solution obtained using $w=0$ satisfy $Ax = b$, that is: $A A^{\#}b = b$ (substitute $x = A^\# b$ in $Ax = b$). If this condition does not hold, the equation system is inconsistent and has no solution. If the condition holds, the system is consistent and at least one solution exists. For example, in the above-mentioned case in which $A$ is square and of full rank, $A^{\#}$ simply equals $A^{-1}$ and the general solution equation simplifies to:
\begin{align*}
x = A^{-1}b + (I - A^{-1}A)w = A^{-1}b + (I - I)w = A^{-1}b
\end{align*}
as previously stated, where $w$ has completely dropped out of the solution, leaving only a single solution. In other cases though, $w$ remains and hence an infinitude of potential values of the free parameter vector $w$ give an infinitude of solutions of the equation.
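As an illustration (my own NumPy sketch), \texttt{np.linalg.pinv} computes the Moore--Penrose pseudoinverse, and the formula above generates solutions of an underdetermined system:
\begin{lstlisting}[language=Python]
import numpy as np

A = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])   # more columns than rows: underdetermined
b = np.array([1.0, 2.0])

A_pinv = np.linalg.pinv(A)        # Moore-Penrose pseudoinverse A^#

# consistency check: the system is solvable iff A A^# b == b
assert np.allclose(A @ A_pinv @ b, b)

# the full solution set: x = A^# b + (I - A^# A) w for any w
x0 = A_pinv @ b                   # minimum-norm particular solution (w = 0)
w = np.array([1.0, -2.0, 0.5])    # an arbitrary choice of free parameters
x = x0 + (np.eye(3) - A_pinv @ A) @ w
assert np.allclose(A @ x, b)
\end{lstlisting}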
\end{answer}
\end{InnerQandA}
\item Derivative is the backbone of gradient descent.
\begin{InnerQandA}
\item What does the derivative represent?
\begin{answer}
The derivative of a function measures the sensitivity to change in the function output with respect to a change in the input. \\
Moreover, when it exists, the derivative at a given point is the \textit{slope} of the tangent line to the graph of the function at that point. The tangent line is the best \textit{linear approximation} of the function at that input value. This is the reason why in gradient descent we (slowly) move in the (negative) direction of the derivative.
\end{answer}
\item What’s the difference between derivative, gradient, and Jacobian?
\begin{answer}
\begin{itemize}
\item When $f: \R \rightarrow \R$, we calculate the derivative $\frac{df}{dx}$.
\item When $f: \R^n \rightarrow \R$, we calculate the gradient:
\begin{align*}
\nabla_x f =
\begin{bmatrix}
\frac{\partial f}{ \partial x_1} & \frac{\partial f}{ \partial x_2} & \ldots & \frac{\partial f}{ \partial x_n}
\end{bmatrix}
\end{align*}
\item When $f: \R^n \rightarrow \R^m$, we calculate the Jacobian:
\begin{align*}
\Jac_x(f) =
\begin{bmatrix}
\frac{\partial f_1}{ \partial x_1} & \ldots & \frac{\partial f_1}{ \partial x_n} \\
\vdots & \ddots & \vdots \\
\frac{\partial f_m}{ \partial x_1} & \ldots & \frac{\partial f_m}{ \partial x_n}
\end{bmatrix}
\end{align*}
\end{itemize}
\end{answer}
\end{InnerQandA}
\item Say we have the weights $w \in \R^{d \times m}$ and a mini-batch $x$ of $n$ elements, each element is of the shape $1 \times d$ so that $x \in \R^{n \times d}$. We have the output $y = f(x; w) = xw$. What's the dimension of the Jacobian $\frac{\partial y}{\partial x}$?
\begin{answer}
First, notice that $y \in \R^{n \times m}$. With that said, $\Jac_x(f) \in \R^{(n \times m) \times (n \times d)}$, or equivalently $\Jac_x(f) \in \R^{(n \cdot m) \times (n \cdot d)}$, given that we have reshaped the 4-dim tensor into a 2-dim tensor, i.e. a matrix.
\end{answer}
\item Given a very large symmetric matrix $A$ that doesn't fit into memory, say $A \in \R^{1M \times 1M}$ and a function $f$ that can quickly compute $f(x) = Ax$ for $x \in \R^{1M}$. Find the unit vector $x$ so that $x^T A x$ is minimal. (Hint: Can you frame it as an optimization problem and use gradient descent to find an approximate solution?).
\begin{answer}
Since this is a constrained optimization problem, we can turn it into an unconstrained one by using a Lagrange multiplier:
\begin{align*}
L(x, \lambda) = x^T A x + \lambda (x^T x - 1)
\end{align*}
The critical points of Lagrangians occur at saddle points, rather than at local minima. Unfortunately, methods such as gradient descent which are designed to find local minima (or maxima) cannot be applied directly. For this reason, we must either use an optimization technique that finds stationary points that are not necessarily extrema (such as Newton's method without an extremum-seeking line search), or modify the formulation so that it becomes a minimization problem. \\\\
For the purpose of our problem, let us use the latter solution. First, let us compute the gradients:
\begin{align*}
\frac{\partial L}{ \partial x} &= 2Ax + 2 \lambda x \in \R^n \\
\frac{\partial L}{\partial \lambda} &= x^T x - 1 \in \R
\end{align*}
Next, we stack them into a single column vector:
\begin{align*}
\nabla_{x, \lambda}L =
\begin{bmatrix}
\frac{\partial L}{\partial x} \\
\\
\frac{\partial L}{\partial \lambda}
\end{bmatrix} \in \R^{n + 1}
\end{align*}
Finally, we minimize the (squared) magnitude of the gradient:
\begin{align*}
x^{*}, \lambda^{*} &= \argmin_{x, \lambda} \sqrt{(\nabla_{x, \lambda}L)^T (\nabla_{x, \lambda}L)} \\
&= \argmin_{x, \lambda} \underbrace{(\nabla_{x, \lambda}L)^T (\nabla_{x, \lambda}L)}_{\ge 0}
\end{align*}
At the optimal $x^{*}$, $\lambda^{*}$ we have $ \Vert \nabla_{x, \lambda}L \Vert ^2 \approx 0$, meaning that all partial derivatives are approximately $0$. Therefore, they are a solution to the original optimization problem $L(x, \lambda)$. Unlike the critical points in $L$ however, $x^{*}$ and $\lambda^{*}$ occur at local minima (instead of at saddle points), so numerical optimization techniques (such as gradient descent) can be used to find them.
(Source: \href{https://en.wikipedia.org/wiki/Lagrange_multiplier#Examples}{Wikipedia})
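Alternatively, a simpler practical variant (a NumPy sketch of my own, not the Lagrangian construction above) is projected gradient descent: step along the negative gradient of $x^T A x$ and renormalize onto the unit sphere after every step, using only the black-box matrix-vector product $f$:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
n = 100                              # small stand-in for the 1M-dim problem
M = rng.normal(size=(n, n))
A = (M + M.T) / 2                    # random symmetric test matrix

def f(x):
    return A @ x                     # the only access to A we are allowed

x = rng.normal(size=n)
x /= np.linalg.norm(x)
lr = 1e-3
for _ in range(10000):
    x = x - lr * 2 * f(x)            # gradient of x^T A x is 2 A x
    x /= np.linalg.norm(x)           # project back onto the unit sphere

print(x @ f(x))                      # approximately the smallest eigenvalue
print(np.linalg.eigvalsh(A)[0])      # exact value, for comparison
\end{lstlisting}
Each step effectively applies the matrix $I - 2 \cdot \text{lr} \cdot A$ followed by renormalization, i.e. a power iteration that converges to the eigenvector of the smallest eigenvalue of $A$.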
\end{answer}
\end{QandA}
\subsubsection{Dimensionality reduction}
\begin{QandA}
\item Why do we need dimensionality reduction?
\begin{answer}
\textbf{Curse of dimensionality} -- When the dimensionality increases, the volume of the space increases so fast that the available data become sparse. In order to obtain a reliable result, the amount of data needed often grows exponentially with the dimensionality. Also, organizing and searching data often relies on detecting areas where objects form groups with similar properties; in high dimensional data, however, all objects appear to be sparse and dissimilar in many ways, which prevents common data organization strategies from being efficient.
\end{answer}
\item Eigendecomposition is a common factorization technique used for dimensionality reduction. Is the eigendecomposition of a matrix always unique?
\begin{answer}
The decomposition is not always unique. Suppose $A \in \R^{2 \times 2}$ has two equal eigenvalues $\lambda_1 = \lambda_2 = \lambda$, with corresponding eigenvectors $u_1, u_2$. Then:
\begin{align*}
A u_1 &= \lambda_1 u_1 = \lambda u_1 \\
A u_2 &= \lambda_2 u_2 = \lambda u_2
\end{align*}
Or written in matrix form:
\begin{align*}
A \begin{bmatrix}
u_1 & u_2
\end{bmatrix}
&= \begin{bmatrix}
u_1 & u_2
\end{bmatrix}
\begin{bmatrix}
\lambda & 0 \\
0 & \lambda
\end{bmatrix}
\end{align*}
Notice that we can permute the matrix of eigenvectors (thus obtaining a different factorization):
\begin{align*}
A \begin{bmatrix}
u_2 & u_1
\end{bmatrix}
&= \begin{bmatrix}
u_2 & u_1
\end{bmatrix}
\begin{bmatrix}
\lambda & 0 \\
0 & \lambda
\end{bmatrix}
\end{align*}
But we still end up with the same eigen-properties:
\begin{align*}
A u_2 &= \lambda u_2 \\
A u_1 &= \lambda u_1
\end{align*}
\end{answer}
\item Name some applications of eigenvalues and eigenvectors.
\begin{answer}
Principal Component Analysis (PCA), Spectral Clustering, Low rank factorization / rank-k approximation of a matrix (eg. image compression), Estimating curvature of the loss through the eigenvalues of the Hessian...
\end{answer}
\item We want to do PCA on a dataset of multiple features in different ranges. For example, one is in the range 0-1 and one is in the range 10 - 1000. Will PCA work on this dataset?
\begin{answer}
In PCA we are interested in the components that maximize the variance. If one component (e.g. human height) varies less than another (e.g. weight) because of their respective scales (meters vs. kilos), PCA might determine that the direction of maximal variance more closely corresponds with the ‘weight’ axis, if those features are not scaled. Since a change in height of one meter should be considered much more important than the change in weight of one kilogram, the previous assumption would be incorrect. Therefore, it is important to standardize the features before applying PCA (source: \href{https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html}{scikit}).
\end{answer}
\item Under what conditions can one apply eigendecomposition? What about SVD?
\begin{answer}
Eigendecomposition is possible only for (square) diagonalizable matrices. On the other hand, the Singular Value Decomposition (SVD) always exists (even for non-square matrices).
\end{answer}
\begin{InnerQandA}
\item What is the relationship between SVD and eigendecomposition?
\begin{answer}
Consider $A \in \R^{m \times n}$ of rank $r$. Then, we can factorize $A$ as follows:
\begin{align*}
A = U \Sigma V^T
\end{align*}
where $U \in \R^{m \times m}$ is an orthogonal matrix of left singular vectors, $V \in \R^{n \times n}$ is an orthogonal matrix of right singular vectors, and $\Sigma \in \R^{m \times n}$ is a "diagonal" matrix of singular values s.t. exactly $r$ of the values $\sigma_i := \Sigma_{ii}$ are non-zero. By construction:
\begin{itemize}
\item The left singular vectors of $A$ are the eigenvectors of $AA^T$. From the Spectral Theorem, the eigenvectors (and thus the left singular vectors) are orthonormal.
\item The right singular vectors of $A$ are the eigenvectors of $A^TA$. From the Spectral Theorem, the eigenvectors (and thus the right singular vectors) are orthonormal.
\item If $\lambda$ is an eigenvalue of $AA^T$ (or $A^TA)$, then $\sqrt{\lambda}$ is a singular value of $A$. From the positive-semidefiniteness of $AA^T$ (or $A^TA$), the eigenvalues (and thus the singular values) are non-negative.
\end{itemize}
\end{answer}
\item What’s the relationship between PCA and SVD?
\begin{answer}
Suppose we have data $X \in \R^{n \times d}$ with $n$ samples and $d$ features. Moreover, assume that the data has been centered s.t. the mean of each feature is $0$. Then, we can perform PCA in two main ways:
\begin{itemize}
\item First we compute the covariance matrix $C = \frac{1}{(n-1)} X^T X \in \R^{d \times d}$, and perform eigendecomposition: $C = V L V^T$, with eigenvalues as the diagonal of $L \in \R^{d \times d}$, and eigenvectors as the columns of $V \in \R^{d \times d}$. Then, we stack the $k$ eigenvectors of $V$ corresponding to the top $k$ eigenvalues into a matrix $\tilde{V} \in \R^{d \times k}$. Finally, we obtain the component values as follows: $\tilde{X} = X \tilde{V} \in \R^{n \times k}$.
\item Alternatively, instead of first computing the covariance matrix and then performing eigendecomposition, notice that given the above formulation, we can directly compute SVD on the data matrix $X$, thus obtaining: $X = U \Sigma V^T$. By construction, the right singular vectors in $V$ are the eigenvectors of $X^TX$. Similarly, we stack the $k$ right singular vectors corresponding to the top $k$ singular values into a matrix $\tilde{V} \in \R^{d \times k}$. Finally, we obtain the component values as follows: $\tilde{X} = X \tilde{V} \in \R^{n \times k}$.
\end{itemize}
Even though SVD is slower, it is often considered the preferred method because of its higher numerical accuracy.
(See more \href{https://stats.stackexchange.com/questions/79043/why-pca-of-data-by-means-of-svd-of-the-data}{here})
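The equivalence is easy to check numerically; the following NumPy sketch (my own) runs both routes and confirms the principal directions agree up to sign:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
X = X - X.mean(axis=0)                  # centre the data first

# route 1: eigendecomposition of the covariance matrix
C = X.T @ X / (X.shape[0] - 1)
eigvals, V = np.linalg.eigh(C)          # eigenvalues in ascending order
V_eig = V[:, ::-1][:, :2]               # top-2 eigenvectors

# route 2: SVD of the (centred) data matrix directly
U, S, Vt = np.linalg.svd(X, full_matrices=False)
V_svd = Vt[:2].T                        # top-2 right singular vectors

for i in range(2):                      # columns agree up to sign
    assert (np.allclose(V_eig[:, i], V_svd[:, i])
            or np.allclose(V_eig[:, i], -V_svd[:, i]))

X_tilde = X @ V_svd                     # n x k matrix of component scores
\end{lstlisting}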
\end{answer}
\end{InnerQandA}
\item How does t-SNE (T-distributed Stochastic Neighbor Embedding) work? Why do we need it?
\begin{answer}
t-SNE is a statistical method for visualizing high-dimensional data by giving each datapoint a location in a two or three-dimensional map. Specifically, it models each high-dimensional object by a two- or three-dimensional point in such a way that similar objects are modeled by nearby points and dissimilar objects are modeled by distant points with high probability. \\\\
First, t-SNE constructs a probability distribution over pairs of high-dimensional objects in such a way that similar objects are assigned a higher probability while dissimilar points are assigned a lower probability. In particular, given a set of $N$ high-dimensional objects $x_1, \ldots, x_N$, t-SNE computes:
\begin{align*}
p_{j \mid i} = \frac{\exp(- \Vert x_i - x_j\Vert^2/2\sigma_i^2)}{\sum_{k \neq i} \exp(- \Vert x_i - x_k\Vert^2/2\sigma_i^2)}
\end{align*}
and sets $p_{i \vert i} = 0$. Note that $\sum_j p_{j \vert i} = 1$ for all $i$. Then, it computes:
\begin{align*}
p_{ij} = \frac{p_{j \vert i} + p_{i \vert j}}{2N}
\end{align*}
Note that $p_{ij} = p_{ji}, p_{ii} = 0, \sum_{i,j} p_{ij} = 1$. The similarity of datapoint $x_j$ to datapoint $x_i$ is the conditional probability $p_{j \vert i}$ that $x_i$ would pick $x_j$ as its neighbor if neighbors were picked in proportion to their probability density under a Gaussian centered at $x_i$. \\\\
Second, t-SNE defines a similar probability distribution over the points in the low-dimensional map. In particular, it aims to learn a $d$-dimensional map $y_1, \ldots, y_N$ (where $d$ is typically chosen to be 2 or 3) that reflects the similarities $p_{ij}$ as well as possible. To this end, it measures similarities $q_{ij}$ between two points $y_i$ and $y_j$ using:
\begin{align*}
q_{ij} = \frac{(1 + \Vert y_i - y_j \Vert^2)^{-1}}{\sum_k \sum_{l \neq k} (1 + \Vert y_k - y_l \Vert^2)^{-1}}
\end{align*}
and set $q_{ii} = 0$. Herein a heavy-tailed Student t-distribution (with one-degree of freedom, which is the same as a Cauchy distribution) is used to measure similarities between low-dimensional points in order to allow dissimilar objects to be modeled far apart in the map. \\\\
Finally, the locations $y_i$ in the map are determined by minimizing the Kullback-Leibler divergence of the distribution $P$ from the distribution $Q$:
\begin{align*}
\KL{P}{Q} = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}}
\end{align*}
The minimization of the KL divergence wrt the points $y_i$ is performed using gradient descent. \\\\
While t-SNE plots often seem to display clusters, the visual clusters can be influenced strongly by the chosen parameterization and therefore a good understanding of the parameters for t-SNE is necessary. Such "clusters" can be shown to even appear in non-clustered data, and thus may be false findings. Interactive exploration may thus be necessary to choose parameters and validate results.
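In practice one rarely implements this from scratch; a typical usage sketch (assuming scikit-learn's \texttt{TSNE}) looks as follows, where the perplexity hyperparameter controls the effective number of neighbors (and thereby the $\sigma_i$):
\begin{lstlisting}[language=Python]
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE

X, y = load_digits(return_X_y=True)     # 1797 samples in 64 dimensions

# results can change noticeably with perplexity, so try several values
X_2d = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(X)
print(X_2d.shape)                       # (1797, 2): one 2-D point per sample
\end{lstlisting}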
(Source: \href{https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding}{Wikipedia})
\end{answer}
\end{QandA}
\subsubsection{Calculus and convex optimization}
\begin{QandA}
\item Differentiable functions
\begin{InnerQandA}
\item What does it mean when a function is differentiable?
\begin{answer}
A function $f: U \rightarrow \R$ is said to be differentiable at $a \in U$ if the derivative:
\begin{align*}
f'(a) = \lim_{h \rightarrow 0} \frac{f(a+h) - f(a)}{h}
\end{align*}
exists. This implies that the function is continuous at $a$. Note, however, that the converse does not hold: a continuous function is not necessarily differentiable.
\end{answer}
\item Give an example of when a function doesn’t have a derivative at a point.
\begin{answer}
Any non-continuous function. For example,
\begin{align*}
f(x) =
\begin{cases}
x^2 & \text{if } x \le 0\\
1+x & \text{otherwise}
\end{cases}
\end{align*}
is not differentiable at $x=0$.
\end{answer}
\item Give an example of non-differentiable functions that are frequently used in machine learning. How do we do backpropagation if those functions aren’t differentiable?
\begin{answer}
Some non-differentiable functions commonly used in Machine Learning:
\begin{align*}
f(x) &= \vert x \vert \\
\text{ReLU}(x) &= \begin{cases}
x & \text{if } x \ge 0\\
0 & \text{otherwise}
\end{cases} \\
\text{LeakyReLU}(x) &= \begin{cases}
x & \text{if } x \ge 0\\
\alpha x & \text{otherwise}
\end{cases} \\
\end{align*}
Each of these functions is not differentiable at $x=0$. In theory, since any finite set of points is a set of measure $0$, we have $p(x=0) = 0$ and can therefore disregard the fact that these functions are not differentiable there. In practice however, due to finite precision it can happen that $x=0$ exactly. In these cases we can use a "faux" derivative, usually by picking the left or the right derivative at the given point.
\end{answer}
\end{InnerQandA}
\item Convexity
\begin{InnerQandA}
\item What does it mean for a function to be convex or concave? Draw it.
\begin{answer}
A function is called convex if the line segment between any two points on the graph of the function lies above the graph between the two points. More precisely, the function $f: X \rightarrow \R$ is convex if and only if for all $0 \le t \le 1$ and all $x_1, x_2 \in X$:
\begin{align*}
f(t x_1 + (1-t)x_2) \le t f(x_1) + (1-t)f(x_2)
\end{align*}
The function $f$ is said to be concave if $-f$ is convex.
\end{answer}
\item Why is convexity desirable in an optimization problem?
\begin{answer}
Convexity is desirable because any local minimum of a convex function is also a global minimum.
\end{answer}
\item Show that the cross-entropy loss function is convex.
\begin{answer}
We will prove that $\CE{P}{Q}$ is convex in $Q$, since typically $P$ is the ground-truth probability distribution and $Q$ is the predicted distribution of our model. For the purpose of this task, we have to show two results: the Log sum inequality, and the convexity of the KL divergence.
\begin{theorem}
Let $a_1, \ldots, a_n$ and $b_1, \ldots, b_n$ be non-negative real numbers, and define $a = \sum_{i=1}^n a_i$ and $b = \sum_{i=1}^n b_i$. Then, the log sum inequality states that:
\begin{align*}
\sum_{i=1}^n a_i \log \frac{a_i}{b_i} \ge a \log \frac{a}{b}
\end{align*}
\end{theorem}
\begin{proof}
Let $f(x) = x \log x$. Then, we re-write:
\begin{align*}
\sum_{i=1}^n a_i \log \frac{a_i}{b_i} &= \sum_{i=1}^n b_i f \left(\frac{a_i}{b_i} \right) \\
&= b \sum_{i=1}^n \frac{b_i}{b} f \left( \frac{a_i}{b_i} \right)
\end{align*}
Because $f(x)$ is a convex function, and:
\begin{align*}
\frac{b_i}{b} &\ge 0 \\
\sum_{i=1}^n \frac{b_i}{b} &= 1
\end{align*}
applying Jensen's inequality yields:
\begin{align*}
b \sum_{i=1}^n \frac{b_i}{b} f \left( \frac{a_i}{b_i} \right) & \ge b f \left( \sum_{i=1}^n \frac{b_i}{b} \frac{a_i}{b_i} \right) \\
&= b f \left( \frac{1}{b} \sum_{i=1}^n a_i \right) \\
&= b f \left( \frac{a}{b} \right) \\
&= a \log \frac{a}{b}
\end{align*}
Therefore, we obtain the \textbf{Log sum inequality}:
\begin{align*}
\sum_{i=1}^n a_i \log \frac{a_i}{b_i} \ge a \log \frac{a}{b}
\end{align*}
\end{proof}
\begin{theorem}
The Kullback-Leibler divergence is convex in the pair of probability distributions $(p, q)$, i.e.
\begin{align*}
\KL{\lambda p_1 + (1-\lambda)p_2}{\lambda q_1 + (1-\lambda)q_2} \le \lambda \KL{p_1}{q_1} + (1-\lambda)\KL{p_2}{q_2}
\end{align*}
\end{theorem}
\begin{proof}
The Kullback-Leibler divergence of $P$ from $Q$ is defined as:
\begin{align*}
\KL{P}{Q} = \sum_{x} p(x) \log \frac{p(x)}{q(x)}
\end{align*}
Now:
\begin{align}
&\KL{\lambda p_1 + (1-\lambda)p_2}{\lambda q_1 + (1-\lambda)q_2} \\
= &\sum_{x}\left[ \left[\lambda p_1(x) + (1-\lambda)p_2(x)\right] \cdot \log \frac{\lambda p_1(x) + (1-\lambda)p_2(x)}{\lambda q_1(x) + (1-\lambda)q_2(x)} \right] \\
\le &\sum_x \left[ \lambda p_1(x) \log \frac{\lambda p_1(x)}{\lambda q_1(x)} + (1 - \lambda) p_2(x) \log \frac{(1-\lambda) p_2(x)}{(1-\lambda) q_2(x)} \right] \\
= &\lambda \sum_x p_1(x) \log \frac{ p_1(x)}{q_1(x)} + (1 - \lambda) \sum_x p_2(x) \log \frac{p_2(x)}{q_2(x)} \\
= &\lambda \KL{p_1}{q_1} + (1-\lambda) \KL{p_2}{q_2}
\end{align}
Where in (3) we use the Log sum inequality with:
\begin{align*}
a_1 &= \lambda p_1(x) & a_2 = (1-\lambda) p_2(x)\\
b_1 &= \lambda q_1(x) & b_2 = (1-\lambda) q_2(x)\\
\end{align*}
\end{proof}
Finally, going back to the original task of proving the convexity of $\CE{P}{Q}$ in $Q$, let us first decompose the cross-entropy:
\begin{align*}
\CE{P}{Q} &= \Ent{P} + \KL{P}{Q}
\end{align*}
Since $\Ent{P}$ is constant wrt. $Q$, and we showed that $\KL{P}{Q}$ is convex wrt. both $P$ and $Q$, we can deduce that $\CE{P}{Q}$ is convex wrt. $Q$.
(Source: \href{https://statproofbook.github.io/P/entcross-conv}{The Book of Statistical Proofs})
\end{answer}
\end{InnerQandA}
\item Given a logistic discriminant classifier:
\begin{align*}
p(y=1 \vert x) = \sigma(w^tx)
\end{align*}
where the sigmoid function is given by:
\begin{align*}
\sigma(z) = (1 + \exp(-z))^{-1}
\end{align*}
The logistic loss for training sample $x_i$ with class label $y_i$ is given by:
\begin{align*}
L(y_i, x_i; w) = -y_i \log p(y_i \vert x_i)
\end{align*}
(Note: I believe in the book $y_i$ is missing from the loss $L$)
\begin{InnerQandA}
\item Show that $p(y=-1 \vert x) = \sigma (-w^t x)$.
\begin{answer}
\begin{align*}
p(y=-1 \vert x) &= 1 - p(y=1 \vert x) \\
&= 1 - \frac{1}{1 + \exp(-w^tx)} \\
&= \frac{1 + \exp(-w^tx) - 1}{1 + \exp(-w^tx)} \\
&= \frac{\exp(-w^tx)}{1 + \exp(-w^tx)} \\
&= \frac{1}{\exp(w^tx)(1 + \exp(-w^tx))} \\
&= \frac{1}{\exp(w^tx) + 1} \\
&= \sigma (-w^t x)
\end{align*}
\end{answer}
\item Show that $\nabla_w L (y_i, x_i; w) = -y_i (1 - p(y_i \vert x_i))x_i$.
\begin{answer}
We have the following computation graph:
\begin{align*}
z &= w^t x \\
\sigma(z) &= \frac{1}{1 + \exp(-z)} \\
L(y_i, x_i; w) &= -y_i \log \sigma(z)
\end{align*}
Therefore, using the chain rule we can decompose the derivative as follows:
\begin{align*}
\frac{\partial L}{\partial w} &= \frac{\partial L}{\partial \sigma} \cdot \frac{\partial \sigma}{\partial z} \cdot \frac{\partial z}{\partial w}
\end{align*}
Let's compute each term separately:
\begin{align*}
\frac{\partial L}{\partial \sigma} &= -\frac{y_i}{\sigma(z)} \\
\frac{\partial \sigma(z)}{\partial(z)} &= \frac{\partial}{\partial z} \left( \frac{1}{1 + \exp(-z)} \right) = -\frac{1}{(1 + \exp(-z))^2} (-\exp(-z)) = \frac{1}{1+ \exp(-z)} \cdot \frac{\exp(-z)}{1 + \exp(-z)} \\
&= \frac{1}{1+ \exp(-z)} \cdot \frac{\exp(-z) + 1 - 1}{1 + \exp(-z)} = \frac{1}{1+ \exp(-z)} \cdot \left(
1 - \frac{1}{1+ \exp(-z)} \right) = \sigma(z)(1 - \sigma(z)) \\
\frac{\partial z}{\partial w} &= \frac{\partial}{\partial w} \left( w^t x \right) = x
\end{align*}
Therefore:
\begin{align*}
\frac{\partial L}{\partial w} &= \frac{\partial L}{\partial \sigma} \cdot \frac{\partial \sigma}{\partial z} \cdot \frac{\partial z}{\partial w} = -\frac{y_i}{\sigma(z)} \cdot \sigma(z)(1 - \sigma(z)) \cdot x = -y_i (1 - \sigma(z)) x
\end{align*}
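A quick finite-difference check (my own NumPy sketch) confirms the derived gradient for $y_i = 1$:
\begin{lstlisting}[language=Python]
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
w = rng.normal(size=5)
x = rng.normal(size=5)
y = 1.0

def loss(w):
    return -y * np.log(sigmoid(w @ x))

grad_analytic = -y * (1.0 - sigmoid(w @ x)) * x

eps = 1e-6                              # central finite differences
grad_numeric = np.array([(loss(w + eps * e) - loss(w - eps * e)) / (2 * eps)
                         for e in np.eye(5)])
assert np.allclose(grad_analytic, grad_numeric, atol=1e-6)
\end{lstlisting}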
\end{answer}
\item Show that $\nabla_w L (y_i, x_i; w)$ is convex.
\begin{answer}
\todo
\end{answer}
\end{InnerQandA}
\item Most ML algorithms we use nowadays use first-order derivatives (gradients) to construct the next training iteration.
\begin{InnerQandA}
\item How can we use second-order derivatives for training models?
\begin{answer}
Given a twice differentiable function $f: \R \rightarrow \R$ we seek to solve the optimization problem:
\begin{align*}
\min_{x \in \R} f(x)
\end{align*}
Newton's method approaches this problem in an iterative fashion by constructing a sequence $\left\{ x_k \right\}$ that converges towards the minimizer $x^{*}$ of f. In particular, it performs second-order Taylor approximation of $f$ around $x_k$:
\begin{align*}
f(x_k + t) \approx f(x_k) + f'(x_k)t + \frac{1}{2} f''(x_k)t^2
\end{align*}
The next iterate $x_{k+1}$ is defined so as to minimize this quadratic approximation in $t$, setting $x_{k+1} = x_k + t$. If the second derivative is positive, the quadratic approximation is a convex function of $t$, and its minimum can be found by setting the derivative to zero. More precisely:
\begin{align*}
0 = \frac{d}{dt} \left( f(x_k) + f'(x_k)t + \frac{1}{2} f''(x_k)t^2 \right) = f'(x_k) + f''(x_k)t
\end{align*}
Therefore, the minimum of the quadratic approximation is achieved for:
\begin{align*}
t = -\frac{f'(x_k)}{f''(x_k)}
\end{align*}
Putting everything together, Newton's method performs the following iteration:
\begin{align*}
x_{k+1} = x_k + t = x_k -\frac{f'(x_k)}{f''(x_k)}
\end{align*}
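A minimal sketch of the iteration in Python (my own example function, not from the book):
\begin{lstlisting}[language=Python]
def newton_minimize(f_prime, f_double_prime, x0, n_steps=20):
    """Minimize a 1-D function via Newton steps x <- x - f'(x)/f''(x)."""
    x = x0
    for _ in range(n_steps):
        x = x - f_prime(x) / f_double_prime(x)
    return x

# example: f(x) = x^4 - 3x^2 + 2, so f'(x) = 4x^3 - 6x, f''(x) = 12x^2 - 6
x_min = newton_minimize(lambda x: 4 * x**3 - 6 * x,
                        lambda x: 12 * x**2 - 6,
                        x0=2.0)
print(x_min)                      # ~1.2247 = sqrt(3/2), a local minimizer
\end{lstlisting}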
(Source: \href{https://en.wikipedia.org/wiki/Newton%27s_method_in_optimization}{Wikipedia})
\end{answer}
\item Pros and cons of second-order optimization.
\begin{answer}
The advantage of the Newton optimization method is that in general it converges faster than pure gradient descent, since we perform a higher order local approximation (second order as opposed to first order), and therefore make a more informed choice about the next step in the descent. \\\\
Nevertheless, it comes with several drawbacks as well:
\begin{itemize}
\item Finding the inverse of the Hessian in high dimensions can be an expensive operation. Several approximations exist: quasi-Newton methods, Levenberg-Marquardt algorithm, ...
\item It does not work if the Hessian is not invertible.
\item It may not converge at all, but can enter a cycle having more than 1 point.
\item It can converge to a saddle point instead of to a local minimum.
\end{itemize}
\end{answer}
\item Why don’t we see more second-order optimization in practice?
\begin{answer}
To further highlight the intractability of computing the Hessian, let us note that today's Deep Learning models are on the order of $10^{10}$ (some even $10^{12}$) parameters. Even with pure gradient descent, training these models requires a lot of engineering effort in order to distribute the computation across a multitude of GPUs. Since the size of the Hessian is quadratic in the number of parameters, merely storing it (let alone inverting it) is far beyond what today's hardware can support.
\end{answer}
\end{InnerQandA}
\item How can we use the Hessian (second derivative matrix) to test for critical points?
\begin{answer}
Suppose we have a twice-differentiable $f: \R^n \rightarrow \R$, and let $a \in \R^n$ be a critical point at which the Hessian matrix is invertible. Then:
\begin{itemize}
\item If the Hessian is positive definite (equivalently, has all eigenvalues positive) at $a$, then $f$ attains a local minimum at $a$.
\item If the Hessian is negative definite (equivalently, has all eigenvalues negative) at $a$, then $f$ attains a local maximum at $a$.
\item If the Hessian has both positive and negative eigenvalues, then $a$ is a saddle point for $f$.
\end{itemize}
In those cases not listed above, the test is inconclusive.
(Source: \href{https://en.wikipedia.org/wiki/Second_partial_derivative_test}{Wikipedia})
\end{answer}
\item Jensen’s inequality forms the basis for many algorithms for probabilistic inference, including Expectation-Maximization and variational inference. Explain what Jensen’s inequality is.
\begin{answer}
As stated before, for a given convex function $g$, we had the following property:
\begin{align*}
g(tx_1 + (1-t)x_2) \le tg(x_1) + (1-t)g(x_2)
\end{align*}
Let us generalize this property. Again, suppose we have a convex function $g$, variables $x_1, \ldots, x_n \in I$, and non-negative real numbers $\alpha_1, \ldots, \alpha_n$ s.t. $\sum_i \alpha_i = 1$. Then, by induction we have:
\begin{align*}
g(\alpha_1 x_1 + \ldots + \alpha_n x_n) \le \alpha_1 g(x_1) + \ldots + \alpha_n g(x_n)
\end{align*}
Let's formalize it one step further. Consider a convex function $g$, a discrete random variable $X$ with $n$ possible values $x_1, \ldots, x_n$, and real non-negative values $\alpha_i = p(X=x_i)$. Then, we obtain the general form of Jensen's inequality:
\begin{align*}
g(\Exp{X}) \le \Exp{g(X)}
\end{align*}
(Source: \href{https://www.probabilitycourse.com/chapter6/6_2_5_jensen's_inequality.php}{Introduction to Probability, Statistics, and Random Processes})
\end{answer}
\item Explain the chain rule.
\begin{answer}
The chain rule is a formula that expresses the derivative of the composition of two differentiable functions $f$ and $g$ in terms of the derivatives of $f$ and $g$. More precisely, if $h = f \circ g$ is the function such that $h(x) = f(g(x))$ for every $x$, then the chain rule is:
\begin{align*}
\frac{\partial h}{\partial x} = \frac{\partial f}{\partial g} \cdot \frac{\partial g}{\partial x}
\end{align*}
(Source: \href{https://en.wikipedia.org/wiki/Chain_rule}{Wikipedia})
\end{answer}
\item Let $x \in \R^n$, $L = CrossEntropy(Softmax(x), y)$, in which $y$ is a one-hot vector. Take the derivative of $L$ with respect to $x$.
\begin{answer}
First, let us expand the loss:
\begin{align*}
L &= - \sum_{c=1}^C y_c \log (s_c)
\end{align*}
where $C$ is the total number of classes, and $s_c$ is the $c$-th entry of the softmax output:
\begin{align*}
s_c &= \frac{\exp(x_c)}{\sum_{k=1}^C \exp(x_k)}
\end{align*}
Using the chain rule, we obtain the following:
\begin{align*}
\frac{\partial L}{\partial x_l} = \frac{\partial}{\partial x_l} \left( - \sum_{c=1}^C y_c \log (s_c) \right) = - \sum_{c=1}^C y_c \frac{\partial \log (s_c)}{\partial x_l}
\end{align*}
Now:
\begin{align*}
\frac{\partial}{\partial x_l} \log(s_c) &= \frac{\partial}{\partial x_l} \left( x_c - \log \left( \sum_{k=1}^C \exp(x_k) \right) \right) \\
&= \frac{\partial x_c}{\partial x_l} - \frac{\partial}{\partial x_l} \log \left( \sum_{k=1}^C \exp(x_k) \right) \\
&= \Indicator{c = l} - \frac{1}{\sum_{k=1}^C \exp(x_k)} \cdot \exp(x_l) \\
&= \Indicator{c = l} - s_l
\end{align*}
Therefore:
\begin{align*}
\frac{\partial L}{\partial x_l} &= - \sum_{c=1}^C y_c \frac{\partial \log (s_c)}{\partial x_l} \\
&= \sum_{c=1}^C y_c (s_l - \Indicator{c = l}) \\
&= s_l \sum_{c=1}^C y_c - \sum_{c=1}^C y_c \Indicator{c = l} \\
&= s_l - y_l
\end{align*}
Or equivalently in vector form:
\begin{align*}
\frac{\partial L}{\partial x} = s - y
\end{align*}
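As before, we can validate the neat result $\frac{\partial L}{\partial x} = s - y$ with finite differences (my own NumPy sketch):
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=4)                  # logits
y = np.array([0.0, 0.0, 1.0, 0.0])     # one-hot target

def softmax(x):
    s = np.exp(x - x.max())             # subtract max for numerical stability
    return s / s.sum()

def loss(x):
    return -np.sum(y * np.log(softmax(x)))

grad_analytic = softmax(x) - y          # the result derived above

eps = 1e-6
grad_numeric = np.array([(loss(x + eps * e) - loss(x - eps * e)) / (2 * eps)
                         for e in np.eye(4)])
assert np.allclose(grad_analytic, grad_numeric, atol=1e-6)
\end{lstlisting}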
(Source: \href{https://towardsdatascience.com/derivative-of-the-softmax-function-and-the-categorical-cross-entropy-loss-ffceefc081d1}{Thomas Kurbiel's blog}) \end{answer}
\item Given the function $f(x, y) = 4x^2 - y$ with the constraint $x^2 + y^2 = 1$. Find the function's maximum and minimum values.
\begin{answer}
In order to solve the constrained optimization problem, we form the Lagrangian:
\begin{align*}
L(x, y, \lambda) = 4x^2 - y + \lambda(x^2 + y^2 - 1)
\end{align*}
Calculating the gradient and setting it to zero, we obtain:
\begin{align*}
\nabla_{x, y, \lambda}L = \left( 8x + 2\lambda x, -1 + 2 \lambda y, x^2 + y^2 -1 \right) = (0, 0, 0)
\end{align*}
This results in the following system of equations:
\begin{align*}
\begin{cases}
8x + 2\lambda x = 0 \\
-1 + 2 \lambda y = 0 \\
x^2 + y^2 - 1 = 0
\end{cases}
\end{align*}
Solving the system, we obtain the following stationary points $(x, y, \lambda)$:
\begin{align*}
\left(0, 1, \frac{1}{2} \right), \left(0, -1, -\frac{1}{2} \right), \left(\frac{3\sqrt{7}}{8}, -\frac{1}{8}, -4 \right), \left(\frac{-3\sqrt{7}}{8}, -\frac{1}{8}, -4 \right)
\end{align*}
By evaluating the function $f$ that we wish to optimize at these stationary points, we get:
\begin{align*}
f \left(0, 1\right) &= -1 \\
f \left(0, -1\right) &= 1 \\
f \left(\frac{3\sqrt{7}}{8}, -\frac{1}{8}\right) &= \frac{65}{16} \\
f \left(-\frac{3\sqrt{7}}{8}, -\frac{1}{8}\right) &= \frac{65}{16}
\end{align*}
Therefore, the constrained maximum is $\frac{65}{16}$, and the constrained minimum is $-1$.
(Source: \href{https://en.wikipedia.org/wiki/Lagrange_multiplier}{Wikipedia})
\end{answer}
\end{QandA}
\subsection{Probability and statistics}
\subsubsection{Probability}
\begin{QandA}
\item Given a uniform random variable $X$ in the range of $\left[ 0, 1 \right]$ inclusively. What's the probability that $X = 0.5$?
\begin{answer}
The probability density function (PDF) for a uniform distribution on the range $\left[a, b\right]$ is:
\begin{align*}
p(x) = \frac{1}{b-a}
\end{align*}
Therefore, the probability of interest is:
\begin{align*}
P(X=0.5) = \int_{0.5}^{0.5} p(x) dx = \int_{0.5}^{0.5} \frac{1}{1-0}dx = \int_{0.5}^{0.5} 1dx = 0
\end{align*}
In fact, for any continuous random variable $X$, the probability of obtaining a specific value $c$ is $P(X=c)=0$, since any single point is a set of measure $0$. (Point masses, described via the \href{https://en.wikipedia.org/wiki/Dirac_delta_function}{Dirac delta function}, are the exception, but a distribution containing them is no longer continuous.)
\end{answer}
\item Can the values of PDF be greater than 1? If so, how do we interpret the PDF?
\begin{answer}
The values of the PDF can indeed be greater than 1. All that matters is that the PDF $p(x)$ integrates to 1, that is:
\begin{align*}
\int_{\R} p(x)dx = 1
\end{align*}
Intuitively, you can imagine the PDF as being a `border' to a fluid container of volume $1$. If we squeeze it from the sides, thereby reducing the volume in this area, we have to increase the volume (that is, extend the border) in the middle in order to compensate for the loss of volume on the sides. Nevertheless, the total volume of the container is still exactly $1$.
\end{answer}
\item What’s the difference between multivariate distribution and multi-modal distribution?
\begin{answer}
Multi-modal distribution refers to a distribution that has multiple modes (values that appear `significantly' more often than others). On the other hand, a multivariate distribution refers to a distribution of multiple variables.
(See more \href{https://stats.stackexchange.com/questions/168586/what-is-the-difference-between-multimodal-and-multivariate}{here})
\end{answer}
\item What does it mean for two variables to be independent?
\begin{answer}
In general, continuous random variables $X_1, \ldots, X_n$ admitting a joint density are all independent from each other if and only if:
\begin{align*}
p_{X_1, \ldots, X_n} \left(x_1, \ldots, x_n \right) = p_{X_1}(x_1)\cdots p_{X_n}(x_n)
\end{align*}
(Source: \href{https://www.wikiwand.com/en/Probability_density_function#/Independence}{Wikiwand})
\end{answer}
\item It’s a common practice to assume an unknown variable to be of the normal distribution. Why is that?
\begin{answer}
The Central Limit Theorem (CLT) states that the distribution of the sum of a large number of independent, identically distributed random variables is approximately normal, regardless of the underlying distribution. Because so many things in the universe can be modeled as the sum of a large number of independent random variables, the normal distribution pops up a lot.
(Source: yours truly, \href{https://twitter.com/joaoli13/status/1530959944064851968}{GPT-3})
\end{answer}
\item How would you turn a probabilistic model into a deterministic model?
\begin{answer}
A deterministic mathematical model is meant to yield a single solution describing the outcome of some "experiment" given appropriate inputs. A probabilistic model is, instead, meant to give a distribution of possible outcomes (i.e. it describes all outcomes and gives some measure of how likely each is to occur).
It should be noted that a probabilistic model can be quite useful even for a person who believes the entire universe to be deterministic. This utility arises because even a deterministic process may have so many variables that any model that attempts to account for them all is too cumbersome to work with. For example, a coin toss might be deterministic if one could precisely measure everything about the flip, the coin, the floor, the air currents, the tides, the precise location on earth, etc. In practice, this level of deterministic modeling is impossible, so stochastic models are used instead.
Similarly, deterministic models can be used to great effect even for real-world processes that are clearly stochastic. For example, the heat equation works great in many situations despite the fact that it ignores the "random" motion of the atoms involved. Usually, in these scenarios, the distribution of possible final answers is so sharply peaked (i.e. has such a small variance) that there is no need to complicate the model by forcing it to calculate the distribution rather than just a single value.
(Source: \href{https://www.quora.com/What-is-the-difference-between-probabilistic-and-deterministic-models}{Quora})
\end{answer}
\item Is it possible to transform non-normal variables into normal variables? How?
\begin{answer}
In statistics, a power transform is a family of functions applied to create a monotonic transformation of data using power functions. It is a data transformation technique used to stabilize variance, make the data more normal distribution-like, improve the validity of measures of association (such as the Pearson correlation between variables), and for other data stabilization procedures.\\\\
One popular choice of a power transform is the Box-Cox transformation:
\begin{align*}
y_i^{(\lambda)} = \begin{cases}
\frac{y_i^\lambda - 1}{\lambda} &\text{if }\lambda \neq 0\\
\log y_i &\text{if } \lambda = 0
\end{cases}
\end{align*}
where the parameter $\lambda$ is estimated using the profile likelihood function and using goodness-of-fit tests. Moreover, this transformation only holds for $y_i > 0$. \\\\
On the other hand, the Yeo-Johnson transformation allows also for zero and negative values of $y$. The hyperparameter $\lambda$ can be any real number, where $\lambda=1$ produces the identity transformation. The transformation law reads:
\begin{align*}
y_i^{(\lambda)} = \begin{cases}
((y_i+1)^\lambda - 1)/\lambda &\text{if } \lambda \neq 0, y_i \ge 0 \\
\log (y_i + 1) &\text{if } \lambda = 0, y_i \ge 0 \\
-((-y_i + 1)^{2-\lambda} -1)/(2-\lambda) &\text{if } \lambda \neq 2, y_i < 0 \\
- \log(-y_i + 1) &\text{if } \lambda = 2, y_i < 0
\end{cases}
\end{align*}
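Both transforms are available off the shelf, e.g. in scikit-learn's \texttt{PowerTransformer} (usage sketch below; the fitted $\lambda$ is estimated by maximum likelihood):
\begin{lstlisting}[language=Python]
import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(0)
y = rng.lognormal(mean=0.0, sigma=1.0, size=(1000, 1))  # skewed, positive data

# 'box-cox' requires strictly positive inputs; 'yeo-johnson' also handles <= 0
pt = PowerTransformer(method="box-cox")
y_transformed = pt.fit_transform(y)

print(pt.lambdas_)               # fitted lambda; ~0 here, i.e. a log transform
\end{lstlisting}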
(Source: \href{https://en.wikipedia.org/wiki/Power_transform}{Wikipedia})
\end{answer}
\item When is the t-distribution useful?
\begin{answer}
The Student's t-distribution is any member of a family of continuous probability distributions that arise when estimating the mean of a normally distributed population in situations where the sample size is small and the population's standard deviation is unknown.\\\\
The t-distribution is symmetric and bell-shaped, like the normal distribution. However, the t-distribution has heavier tails, meaning that it allows for producing values that fall far from its mean.\\\\
The t-distribution plays a role in a number of widely used statistical analyses, including Student's t-test for assessing the statistical significance of the difference between two sample means, the construction of confidence intervals for the difference between two population means, and in linear regression analysis. \\\\
Moreover, we previously saw that the Student t-distribution (with one degree of freedom) is used to measure the similarities between low-dimensional points in t-SNE, in order to allow for dissimilar objects to be modeled far apart in the map. Since it has heavier tails than the Gaussian distribution, it penalizes these larger distances less vigorously.
(Source: \href{https://en.wikipedia.org/wiki/Student%27s_t-distribution}{Wikipedia})
\end{answer}
\item Assume you manage an unreliable file storage system that crashed 5 times in the last year, each crash happens independently.
\begin{InnerQandA}
\item What's the probability that it will crash in the next month?
\begin{answer}
We will resort to the Poisson distribution, which is concerned with expressing the probability of a given number of events occurring in a fixed time interval, given that these events happen with a known constant mean rate $\lambda$, and independently of the time since the last event. For this problem, the expected number of events is $\lambda=5$ crashes per year. \\\\
Because the events occur at a constant rate, we can look at what happens in multiples of the fixed time interval. In particular, since we are interested in the number of crashes per month, we can construct a new Poisson random variable $X$ with a mean rate of $\lambda_m = \frac{5}{12}$ crashes per month. With that said, we now resort to calculating the probability that there is going to be at least one crash in the following month:
\begin{align*}
P(X \ge 1) = 1 - P(X=0) = 1 - \frac{\lambda_m^0e^{-\lambda_m}}{0!} = 1 - 0.65924 = 0.34075
\end{align*}
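Numerically (a two-line check of my own):
\begin{lstlisting}[language=Python]
from math import exp

lam_month = 5 / 12                       # expected crashes per month
print(1 - exp(-lam_month))               # P(X >= 1) ~ 0.3408
\end{lstlisting}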
\end{answer}
\item What's the probability that it will crash at any given moment?
\begin{answer}
Even though time is continuous, let us attempt to shrink the time interval of interest down to a single moment. The expected number of crashes in an interval of length $\ell$ years is $\lambda_\ell = 5\ell$, so $\lambda_\ell \rightarrow 0$ as $\ell \rightarrow 0$. Consequently, the probability of at least one crash in the interval, $1 - e^{-\lambda_\ell}$, also tends to $0$: the probability that the system crashes at any one exact moment is $0$.
\end{answer}
\end{InnerQandA}
\item Say you built a classifier to predict the outcome of football matches. In the past, it's made $10$ wrong predictions out of $100$. Assume all predictions are made independently. What's the probability that the next $20$ predictions are all correct?
\begin{answer}
\notsure Assuming binomial distribution, we can calculate the Maximum Likelihood Estimate for the probability of the classifier being right:
\begin{align*}
\hat{p} = \frac{90}{100} = 0.9
\end{align*}
Therefore, the probability that the classifier correctly predicts the next 20 out of 20 games is:
\begin{align*}
P(X=20 \g n=20, p=0.9) = C_{20}^{20} \cdot (0.9)^{20} \cdot (0.1)^{0} = 0.1215
\end{align*}
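The same number via \texttt{scipy} (a sketch of my own):
\begin{lstlisting}[language=Python]
from scipy.stats import binom

p_hat = 90 / 100                          # MLE of the per-game accuracy
print(binom.pmf(20, n=20, p=p_hat))       # ~0.1216
print(0.9 ** 20)                          # identical, since C(20, 20) = 1
\end{lstlisting}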
\end{answer}
\item Given two random variables $X$ and $Y$. We have the values $P(X \g Y)$ and $P(Y)$ for all values of $X$ and $Y$. How would you calculate $P(X)$?
\begin{answer}
\begin{align*}
P(X=x) = \sum_{y \in \mathcal{Y}} P(X=x, Y=y) = \sum_{y \in \mathcal{Y}} P(X=x \g Y=y) P(Y=y)
\end{align*}
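In the discrete case this marginalization is just a matrix-vector product (my own NumPy sketch):
\begin{lstlisting}[language=Python]
import numpy as np

p_y = np.array([0.5, 0.3, 0.2])              # P(Y), 3 states
p_x_given_y = np.array([[0.9, 0.4, 0.1],     # P(X=0 | Y=y); each column
                        [0.1, 0.6, 0.9]])    # P(X=1 | Y=y); sums to 1

p_x = p_x_given_y @ p_y                      # law of total probability
print(p_x, p_x.sum())                        # [0.59 0.41], sums to 1
\end{lstlisting}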
\end{answer}
\item You know that your colleague Jason has two children and one of them is a boy. What’s the probability that Jason has two sons? (Hint: it’s not $\frac{1}{2}$)
\begin{answer}
Define the following variables:
\begin{align*}
&b = \begin{cases}
1: &\text{boy in the family} \\
0: &\text{no boys in the family}
\end{cases}
&g = \begin{cases}
1: &\text{girl in the family} \\
0: &\text{no girls in the family}
\end{cases}
\end{align*}
Given that the 4 possibilities are $\left\{ (B, B), (B, G), (G, B), (G, G) \right\}$, we obtain the following probabilities:
\begin{align*}
&p(b=1) = \frac{3}{4}, p(b=0) = \frac{1}{4} &p(g=1) = \frac{3}{4}, p(g=0) = \frac{1}{4}
\end{align*}
With that said, we are interested in the probability that there are no girls (or equivalently, both children are boys), given that one of the children is a boy. More precisely:
\begin{align*}
p(g=0 \g b = 1) &= \frac{p(b=1 \g g = 0) p(g=0)}{p(b=1)} \\
&= \frac{1 \cdot \frac{1}{4}}{\frac{3}{4}} \\
&= \frac{1}{3}
\end{align*}
(Read more \href{https://en.wikipedia.org/wiki/Boy_or_Girl_paradox}{here})
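A short Monte Carlo simulation (my own sketch) agrees with the $\frac{1}{3}$ answer:
\begin{lstlisting}[language=Python]
import random

random.seed(0)
n, at_least_one_boy, two_boys = 1_000_000, 0, 0
for _ in range(n):
    kids = [random.choice("BG") for _ in range(2)]
    if "B" in kids:                      # condition on at least one boy
        at_least_one_boy += 1
        two_boys += kids.count("B") == 2

print(two_boys / at_least_one_boy)       # ~0.333
\end{lstlisting}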
\end{answer}
\item There are only two electronic chip manufacturers: $A$ and $B$, both manufacture the same amount of chips. $A$ makes defective chips with probability of $30\%$, while $B$ makes defective chips with a probability of $70\%$.
\begin{InnerQandA}
\item If you randomly pick a chip from the store, what is the probability that it is defective?
\begin{answer}
Define the following variables:
\begin{align*}
&c = \begin{cases}
1: &\text{chip is functional} \\
0: &\text{chip is defective}