Bayesian Machine Learning
Sources
Probabilistic Programming and Bayesian Methods for Hackers by Cameron Davidson-Pilon
S. Lall at Stanford
Reza Shadmehr at Johns Hopkins University
Classification
Suppose the data $x \in \mathbb{R}$ is one-dimensional. Assume we have two classes ($\mathcal{C}_1$ and $\mathcal{C}_2$), each with a probability density function (pdf) given by the derivative of its cumulative distribution function (cdf):
$$
\begin{align*}
f_1(x) &= \frac{\partial{F_1(x)}}{\partial x}\\\\
f_2(x) &= \frac{\partial{F_2(x)}}{\partial x}\\\\
\end{align*}
$$
We further assume the two classes are Gaussian distributed with $\mu_1 < \mu_2$. Then an instance $x \in \mathbb{R}$ belongs to one of these two classes:
$$
x \sim
\begin{cases}
\mathcal{N}(\mu_1, \sigma_1^2), \quad \text{if } x \in \mathcal{C}_1\\
\mathcal{N}(\mu_2, \sigma_2^2), \quad \text{if } x \in \mathcal{C}_2
\end{cases}
$$
Since this is a binary classification problem in one-dimensional space, we only need to determine a threshold $\omega$ with $\mu_1 < \omega < \mu_2$. Then
$$
\begin{cases}
\text{if } x < \omega,\quad x \in \mathcal{C}_1 \\
\text{if } x > \omega, \quad x \in \mathcal{C}_2
\end{cases}
$$
The probability of misclassification is then
$$
\begin{align*}
P(\text{error}) &= P(x> \omega, x \in \mathcal{C}_1) + P(x< \omega, x \in \mathcal{C}_2) \\\\
&= P(x> \omega \mid x \in \mathcal{C}_1)P(x \in \mathcal{C}_1) + P(x< \omega \mid x \in \mathcal{C}_2)P(x \in \mathcal{C}_2)\\\\
&= \left(1-F_1(\omega)\right)\,\pi_1 + F_2(\omega)\, \pi_2
\end{align*}
$$
where the class priors are
$$
\begin{align*}
P(x \in \mathcal{C}_1) = \pi_1\\
P(x \in \mathcal{C}_2) = \pi_2
\end{align*}
$$
We choose the threshold to minimize the probability of error:
$$\min_{\omega}P(\text{error}) = \min_{\omega} \left\{ \left(1-F_1(\omega)\right)\,\pi_1 + F_2(\omega)\, \pi_2 \right\}$$
Setting the derivative with respect to $\omega$ to zero gives the condition for the optimal threshold:
$$
\begin{align*}
\frac{\partial P(\text{error})}{\partial \omega} =& -f_1(\omega) \, \pi_1 + f_2(\omega) \, \pi_2 = 0\\\\
&\implies f_1(\omega) \, \pi_1 = f_2(\omega) \, \pi_2
\end{align*}
$$
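For example, if $\sigma_1 = \sigma_2$ and $\pi_1 = \pi_2$, this condition reduces to $f_1(\omega) = f_2(\omega)$ and is satisfied at the midpoint of the means, $\omega = \frac{\mu_1 + \mu_2}{2}$.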
Another way to obtain the equation of the classification boundary is to equate the posterior probabilities. For $x$ on the boundary,
$$
\begin{align*}
P(x \in \mathcal{C}_1 \mid X=x) &= P(x \in \mathcal{C}_2 \mid X=x)\\\\
\frac{P(X=x \mid x \in \mathcal{C}_1)P(x \in \mathcal{C}_1)}{P(X=x)} &= \frac{P(X=x \mid x \in \mathcal{C}_2)P(x \in \mathcal{C}_2)}{P(X=x)}\\\\
f_1(x)\, \pi_1 &= f_2(x)\, \pi_2
\end{align*}
$$
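As a quick numerical check of both derivations, here is a minimal sketch (assuming illustrative values for the means, standard deviations, and priors) that finds the threshold by root-finding and evaluates the resulting error probability:
from scipy.stats import norm
from scipy.optimize import brentq

# illustrative (assumed) class parameters and priors
mu1, sigma1, pi1 = 160, 6.5, 0.5
mu2, sigma2, pi2 = 173, 6.5, 0.5

# boundary condition f1(w)*pi1 = f2(w)*pi2, with mu1 < w < mu2
g = lambda w: norm.pdf(w, mu1, sigma1)*pi1 - norm.pdf(w, mu2, sigma2)*pi2
w = brentq(g, mu1, mu2)

# P(error) = (1 - F1(w))*pi1 + F2(w)*pi2
p_error = (1 - norm.cdf(w, mu1, sigma1))*pi1 + norm.cdf(w, mu2, sigma2)*pi2
print(w, p_error)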
Now let us model the data as multivariate Gaussian, $x \sim \mathcal{N}(\mu, \Sigma)$, with density
$$f(x) = \frac{1}{\sqrt{(2\pi)^d\lvert \Sigma\rvert}}\exp{\left(-\frac{1}{2} (x-\mu)^T\Sigma^{-1}(x-\mu)\right)}$$
Then the equation of the boundary is
$$\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma_1\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_1)^T\Sigma_1^{-1}(x-\mu_1)\right)} \, \pi_1=
\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma_2\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_2)^T\Sigma_2^{-1}(x-\mu_2)\right)} \, \pi_2$$
If the two classes share a covariance matrix, $\Sigma_1 = \Sigma_2 = \Sigma$, the normalizing constants cancel and the quadratic terms drop out:
$$
\begin{align*}
\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_1)^T\Sigma^{-1}(x-\mu_1)\right)} \, \pi_1 &=
\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_2)^T\Sigma^{-1}(x-\mu_2)\right)} \, \pi_2 \\\\
\exp{\left(-\frac{1}{2} (x-\mu_1)^T\Sigma^{-1}(x-\mu_1)\right)} \, \pi_1 &=
\exp{\left(-\frac{1}{2} (x-\mu_2)^T\Sigma^{-1}(x-\mu_2)\right)} \, \pi_2 \\\\
-(x-\mu_1)^T\Sigma^{-1}(x-\mu_1) + 2 \ln \pi_1 &=
-(x-\mu_2)^T\Sigma^{-1}(x-\mu_2) + 2 \ln \pi_2 \\\\
-x^T\Sigma^{-1} x + x^T\Sigma^{-1} \mu_1 + \mu_1^T \Sigma^{-1} x -\mu_1^T\Sigma^{-1} \mu_1 + 2 \ln \pi_1 &=
-x^T\Sigma^{-1} x + x^T\Sigma^{-1} \mu_2 + \mu_2^T \Sigma^{-1} x -\mu_2^T\Sigma^{-1} \mu_2 + 2 \ln \pi_2 \\\\
2(\mu_1 - \mu_2)^T\Sigma^{-1} x &= \mu_1^T\Sigma^{-1} \mu_1 - \mu_2^T\Sigma^{-1} \mu_2 + 2 \ln \frac{\pi_2}{\pi_1}
\end{align*}
$$
The boundary is linear in $x$.
If the covariances differ, $\Sigma_1 \ne \Sigma_2$, the determinants no longer cancel and the quadratic terms survive, so the boundary is quadratic in $x$:
$$
\begin{align*}
\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma_1\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_1)^T\Sigma_1^{-1}(x-\mu_1)\right)} \, \pi_1 &=
\frac{1}{\sqrt{(2\pi)^d\lvert \Sigma_2\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_2)^T\Sigma_2^{-1}(x-\mu_2)\right)} \, \pi_2 \\\\
\frac{1}{\sqrt{\lvert \Sigma_1\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_1)^T\Sigma_1^{-1}(x-\mu_1)\right)} \, \pi_1 &=
\frac{1}{\sqrt{\lvert \Sigma_2\rvert}}\exp{\left(-\frac{1}{2} (x-\mu_2)^T\Sigma_2^{-1}(x-\mu_2)\right)} \, \pi_2 \\\\
-\ln \left(\lvert \Sigma_1\rvert \right) -(x-\mu_1)^T\Sigma_1^{-1}(x-\mu_1) + 2 \ln \pi_1 &=
-\ln \left(\lvert \Sigma_2\rvert \right) -(x-\mu_2)^T\Sigma_2^{-1}(x-\mu_2) + 2 \ln \pi_2 \\\\
-\ln \left(\lvert \Sigma_1\rvert \right) -x^T\Sigma_1^{-1} x + x^T\Sigma_1^{-1} \mu_1 + \mu_1^T \Sigma_1^{-1} x -\mu_1^T\Sigma_1^{-1} \mu_1 + 2 \ln \pi_1 &=
-\ln \left(\lvert \Sigma_2\rvert \right) -x^T\Sigma_2^{-1} x + x^T\Sigma_2^{-1} \mu_2 + \mu_2^T \Sigma_2^{-1} x -\mu_2^T\Sigma_2^{-1} \mu_2 + 2 \ln \pi_2
\end{align*}
$$
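To visualize the two cases, here is a minimal sketch (with assumed means, covariances, and equal priors) that draws the zero level set of the log discriminant $\ln\left(f_1(x)\,\pi_1\right) - \ln\left(f_2(x)\,\pi_2\right)$; with a shared covariance the contour is a straight line, otherwise it is a conic:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

# assumed example parameters
mu1, mu2 = np.array([0.0, 0.0]), np.array([3.0, 2.0])
Sigma = np.array([[2.0, 0.5], [0.5, 1.0]])     # shared covariance
Sigma2 = np.array([[1.0, -0.3], [-0.3, 2.0]])  # a different covariance for class 2
pi1 = pi2 = 0.5

xx, yy = np.meshgrid(np.linspace(-5, 8, 200), np.linspace(-5, 7, 200))
grid = np.dstack([xx, yy])

for S1, S2, title in [(Sigma, Sigma, 'shared covariance: linear boundary'),
                      (Sigma, Sigma2, 'different covariances: quadratic boundary')]:
    # log discriminant; the decision boundary is its zero level set
    h = (multivariate_normal(mu1, S1).logpdf(grid) + np.log(pi1)
         - multivariate_normal(mu2, S2).logpdf(grid) - np.log(pi2))
    plt.contour(xx, yy, h, levels=[0], colors='k')
    plt.title(title)
    plt.show()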
Given the height $x$ of a person, decide whether the person is male ($y=1$) or female ($y=0$).
$$
\begin{align*}
P(y=1 \mid x) &=\frac{P(x \mid y=1)P(y=1)}{P(x)} =\frac{ \underbrace{P(x \mid y=1)}_{\text{likelihood}} \underbrace{P(y=1)}_{\text{prior}}}{\underbrace{P(x)}_{\text{marginal}}} \\
P(y=0 \mid x) &=\frac{P(x \mid y=0)P(y=0)}{P(x)}
\end{align*}
$$
$$
\begin{align*}
\text{If} \; P(y=1 \mid x) > P(y=0 \mid x),\; \text{then} \; \hat{y} = 1 \\
\text{If} \; P(y=1 \mid x) < P(y=0 \mid x),\; \text{then} \; \hat{y} = 0
\end{align*}
$$
$$\sigma_0 = \sigma_1 \qquad \text{and} \qquad P(y=0)=P(y=1)=\frac{1}{2}$$
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
%matplotlib inline

x = np.arange(130, 200)

# class-conditional likelihoods with equal variances
mu0 = 160
mu1 = 173
sigma0 = 6.5
sigma1 = 6.5
L1 = norm.pdf(x, mu0, sigma0)   # P(x|y=0)
L2 = norm.pdf(x, mu1, sigma1)   # P(x|y=1)

# equal priors
prior0 = 1/2
prior1 = 1/2

# marginal and posteriors via Bayes' rule
Px = L1*prior0 + L2*prior1
posterior0 = (L1*prior0)/Px
posterior1 = (L2*prior1)/Px

# posterior variance of the Bernoulli label: p(1-p)
var1 = posterior1 - posterior1**2

plt.figure(figsize=(10,12))
plt.subplot(4,1,1)
plt.plot(x, L1, 'r', x, L2, 'b')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-10, 0.08, 'P(x|y=0)', color='r', fontsize=15)
plt.text(mu1, 0.08, 'P(x|y=1)', color='b', fontsize=15)
plt.subplot(4,1,2)
plt.plot(x, L1*prior0, 'r', x, L2*prior1, 'b', x, Px, 'k')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-14, 0.05, 'P(x|y=0)P(y=0)', color='r', fontsize=15)
plt.text(mu1, 0.05, 'P(x|y=1)P(y=1)', color='b', fontsize=15)
plt.text(mu0+5, 0.06, 'P(x)', fontsize=15)
plt.subplot(4,1,3)
plt.plot(x, posterior0, 'r', x, posterior1, 'b')
plt.axis([135, 195, 0, 1.1])
plt.text(142, 0.8, 'P(y=0|x)', color='r', fontsize=15)
plt.text(180, 0.8, 'P(y=1|x)', color='b', fontsize=15)
plt.subplot(4,1,4)
plt.plot(x, var1, 'k')
plt.axis([135, 195, 0, 1.1])
plt.text(170, 0.4, 'var(y|x)', fontsize=15)
plt.xlabel('x', fontsize=15)
plt.show()
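Note that the bottom panel, $\text{var}(y \mid x) = P(y=1 \mid x)\left(1 - P(y=1 \mid x)\right)$, is the variance of a Bernoulli label; it peaks where the two posteriors cross, i.e. at the decision boundary, where the classification is most uncertain.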
$$\sigma_0 = \sigma_1 \qquad \text{and} \qquad P(y=1) > P(y=0)$$
x = np.arange(130, 200)

mu0 = 160
mu1 = 173
sigma0 = 6.5
sigma1 = 6.5
L1 = norm.pdf(x, mu0, sigma0)   # P(x|y=0)
L2 = norm.pdf(x, mu1, sigma1)   # P(x|y=1)

# unequal priors: y=1 is a priori more probable
prior0 = 1/4
prior1 = 3/4

Px = L1*prior0 + L2*prior1
posterior0 = (L1*prior0)/Px
posterior1 = (L2*prior1)/Px
var1 = posterior1 - posterior1**2

plt.figure(figsize=(10,12))
plt.subplot(4,1,1)
plt.plot(x, L1, 'r', x, L2, 'b')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-10, 0.08, 'P(x|y=0)', color='r', fontsize=15)
plt.text(mu1, 0.08, 'P(x|y=1)', color='b', fontsize=15)
plt.subplot(4,1,2)
plt.plot(x, L1*prior0, 'r', x, L2*prior1, 'b', x, Px, 'k')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-10, 0.05, 'P(x|y=0)P(y=0)', color='r', fontsize=15)
plt.text(mu1, 0.05, 'P(x|y=1)P(y=1)', color='b', fontsize=15)
plt.text(185, 0.03, 'P(x)', fontsize=15)
plt.subplot(4,1,3)
plt.plot(x, posterior0, 'r', x, posterior1, 'b')
plt.axis([135, 195, 0, 1.1])
plt.text(140, 0.8, 'P(y=0|x)', color='r', fontsize=15)
plt.text(180, 0.8, 'P(y=1|x)', color='b', fontsize=15)
plt.subplot(4,1,4)
plt.plot(x, var1, 'k')
plt.axis([135, 195, 0, 1.1])
plt.text(170, 0.4, 'var(y|x)', fontsize=15)
plt.xlabel('x', fontsize=15)
plt.show()
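With unequal priors the crossing point of the two posteriors, and hence the decision boundary, shifts toward the mean of the less probable class.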
$$\sigma_0 \ne \sigma_1 \qquad \text{and} \qquad P(y=0)=P(y=1)=\frac{1}{2}$$
x = np.arange(130, 200)

mu0 = 160
mu1 = 173
# unequal variances
sigma0 = 6.5
sigma1 = 15
L1 = norm.pdf(x, mu0, sigma0)   # P(x|y=0)
L2 = norm.pdf(x, mu1, sigma1)   # P(x|y=1)

# equal priors
prior0 = 1/2
prior1 = 1/2

Px = L1*prior0 + L2*prior1
posterior0 = (L1*prior0)/Px
posterior1 = (L2*prior1)/Px
var1 = posterior1 - posterior1**2

plt.figure(figsize=(10,12))
plt.subplot(4,1,1)
plt.plot(x, L1, 'r', x, L2, 'b')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-10, 0.08, 'P(x|y=0)', color='r', fontsize=15)
plt.text(mu1, 0.08, 'P(x|y=1)', color='b', fontsize=15)
plt.subplot(4,1,2)
plt.plot(x, L1*prior0, 'r', x, L2*prior1, 'b', x, Px, 'k')
plt.axis([135, 195, 0, 0.1])
plt.text(mu0-10, 0.05, 'P(x|y=0)P(y=0)', color='r', fontsize=15)
plt.text(mu1, 0.05, 'P(x|y=1)P(y=1)', color='b', fontsize=15)
plt.text(185, 0.03, 'P(x)', fontsize=15)
plt.subplot(4,1,3)
plt.plot(x, posterior0, 'r', x, posterior1, 'b')
plt.axis([135, 195, 0, 1.1])
plt.text(140, 0.8, 'P(y=0|x)', color='r', fontsize=15)
plt.text(180, 0.8, 'P(y=1|x)', color='b', fontsize=15)
plt.subplot(4,1,4)
plt.plot(x, var1, 'k')
plt.axis([135, 195, 0, 1.1])
plt.text(170, 0.4, 'var(y|x)', fontsize=15)
plt.xlabel('x', fontsize=15)
plt.show()
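With unequal variances the log posterior odds are quadratic in $x$, so there can be two decision boundaries: the narrower class is preferred near its own mean, while the wider class is preferred in both tails.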
Lecture 23 in Learning Theory (Reza Shadmehr, Johns Hopkins University)
%%html
<center><iframe src="https://www.youtube.com/embed/3r5SlvjJptM?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/X1WB6IJqMjM?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/J5uQcuQ_fJ0?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/a357NoXy4Nk?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
# place a Gaussian kernel at each sample and average them
# (a kernel / Parzen-window density estimate)
m = 10
mu = 0
sigma = 5
x = np.random.normal(mu, sigma, [m, 1])   # m samples from N(mu, sigma^2)
xp = np.linspace(-20, 20, 100)            # evaluation grid
y0 = np.zeros([m, 1])

X = []
for i in range(m):
    X.append(norm.pdf(xp, x[i, 0], sigma))   # kernel centered at sample i
X = np.array(X).T
Xnorm = np.sum(X, 1)/m                       # average of the kernels

plt.figure(figsize=(10,6))
plt.plot(x, y0, 'kx')
plt.plot(xp, X, 'b--')
plt.plot(xp, Xnorm, 'r', linewidth=5)
plt.show()
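Each dashed curve is a Gaussian kernel centered at one sample, and their average (red) is a kernel density estimate of the distribution that generated the samples.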
Probability density estimation
Start with prior beliefs, which can be thought of as a summary of opinions.
Given new data, update the prior to produce a new opinion, the posterior.
Iterate: each posterior becomes the prior for the next observation.
$$ P(H,D) = P(H \mid D)P(D) = P(D \mid H)P(H) $$
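Dividing both sides by $P(D)$ gives Bayes' rule:
$$P(H \mid D) = \frac{P(D \mid H)\,P(H)}{P(D)}$$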
%%html
<center><iframe src="https://www.youtube.com/embed/w1u6-_2jQJo?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
To compute the posterior probability given multiple pieces of evidence, assume the evidences are conditionally independent given $H$:
$$
\begin{align*} P(H \mid \underbrace{d_1,d_2,\cdots,d_m}_{\text{multiple evidences}}) &= \frac{P(d_1,d_2,\cdots,d_m \mid H) \; P(H)}{P(d_1,d_2,\cdots,d_m)} \\ \\
&=\frac{P(d_1 \mid H)P(d_2 \mid H)\cdots P(d_m \mid H) \;P(H)}{P(d_1,d_2,\cdots,d_m)} \\ \\
&= \eta \, P(H)\prod\limits_{i=1}^m P(d_i \mid H), \qquad \eta: \text{normalizing constant}
\end{align*}$$
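As a small illustration of the product form (with made-up likelihood tables; all names here are hypothetical), the normalizing constant $\eta$ never has to be computed explicitly, since renormalizing the unnormalized products recovers it; working in log space avoids underflow when $m$ is large:
import numpy as np

prior = np.array([0.5, 0.5])       # P(H) for two hypotheses
p_d1 = np.array([[0.9, 0.2],       # P(d_i = 1 | H), one row per evidence
                 [0.7, 0.4],
                 [0.8, 0.3],
                 [0.6, 0.5],
                 [0.9, 0.1]])
d = np.array([1, 1, 0, 1, 1])      # observed binary evidences

# log P(H) + sum_i log P(d_i | H); eta is recovered by renormalizing
log_post = np.log(prior) + np.where(d[:, None] == 1,
                                    np.log(p_d1),
                                    np.log(1 - p_d1)).sum(axis=0)
posterior = np.exp(log_post - log_post.max())
posterior /= posterior.sum()
print(posterior)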
$$
\begin{align*}P(a,b) &= P(a \mid b) P(b) \\
P(a,b \mid c) & = P(a \mid b,c) P(b \mid c)
\end{align*}$$
$$
\begin{align*}
P(H\mid d_1) &= \frac{P(d_1 \mid H) P(H)}{P(d_1)} = \eta_1 \, P(d_1 \mid H) \underbrace{P(H)}_{\text{prior}} \\
P(H\mid d_1 d_2) &= \frac{P(d_1 d_2\mid H) P(H)}{P(d_1 d_2)} = \frac{P(d_1 \mid d_2, H)P(d_2 \mid H) P(H)}{P(d_1 d_2)} = \frac{P(d_1 \mid H)P(d_2 \mid H) P(H)}{P(d_1 d_2)} = \eta_2 \, P(d_2 \mid H) \underbrace{P(H\mid d_1)}_{\text{acting as a prior}} \\
& \quad \vdots \\ \\
P(H \mid d_1,d_2,\cdots,d_m) &= \eta_m \,P(d_m \mid H) \underbrace{P(H \mid d_1,d_2,\cdots,d_{m-1})}_{\text{acting as a prior}}
\end{align*}
$$
$$P_0(H) = P(H) \implies \; P(H \mid d_1)=P_1(H) \implies \; P(H \mid d_1 d_2) = P_2(H) \implies \; \cdots$$
Example 1: Bernoulli model
The Bernoulli likelihood of a single observation $d$ is
$$P(d \mid \theta) = \theta^d\,(1-\theta)^{1-d}, \qquad d \in \{0,1\}, \qquad \theta \in [0,1]$$
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def normalize(y, x):
    # normalize a density on the grid x so that it integrates to 1
    return y / np.trapz(y, x)

N = 101
theta = np.linspace(0, 1, N)                # grid over the Bernoulli parameter
prior = normalize(np.repeat(1, N), theta)   # flat prior

d = np.random.choice([0,1])                 # a single observation
likelihood = theta**d * (1 - theta)**(1 - d)
posterior = likelihood * prior
posterior = normalize(posterior, theta)
plt.figure(figsize = (8,6))
plt.plot(theta, prior, linestyle = '--', label = 'prior')
plt.plot(theta, posterior, label = 'posterior')
plt.title('observed: %d' % d, fontsize = 15)
plt.xlabel(r'$\theta$', fontsize = 15)
plt.legend(fontsize = 13)
plt.show()
def bernoulli_model(d, theta, prior):
    # one Bayesian update: multiply the prior by the Bernoulli likelihood of d
    likelihood = theta**d * (1 - theta)**(1 - d)
    posterior = likelihood * prior
    return normalize(posterior, theta)
fig, ax = plt.subplots(ncols = 3, nrows = 3, figsize = (14,14))
ax = np.ravel(ax)
prior = normalize(np.repeat(1, N), theta)
for n in range(9):
    observed = np.random.choice([0,1])
    posterior = bernoulli_model(observed, theta, prior)
    ax[n].plot(theta, prior, linestyle = '--')
    ax[n].plot(theta, posterior)
    ax[n].set_title('observed: %d' % observed)
    ax[n].set_xlim([0,1])
    prior = posterior   # the posterior becomes the next prior
plt.show()
prior = normalize(np.repeat(1, N), theta)
observation = []
for _ in range(100):
    observed = np.random.choice([0,1])
    observation.append(observed)
    posterior = bernoulli_model(observed, theta, prior)
    prior = posterior
print(observation,'\n')
print(np.mean(observation))
plt.figure(figsize = (8,6))
plt.plot(theta, posterior)
plt.axvline(0.5, color = 'red', linestyle = '--')
plt.xlabel(r'$\theta$', fontsize = 15)
plt.show()
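With a flat prior, the grid posterior should match the conjugate closed form: after observing $k$ ones in $n$ flips, the posterior is $\text{Beta}(k+1,\, n-k+1)$. A quick check, reusing the variables from the cell above:
k = np.sum(observation)
n = len(observation)
plt.figure(figsize = (8,6))
plt.plot(theta, posterior, label = 'grid posterior')
plt.plot(theta, stats.beta.pdf(theta, k + 1, n - k + 1), 'k--', label = 'Beta(k+1, n-k+1)')
plt.xlabel(r'$\theta$', fontsize = 15)
plt.legend(fontsize = 13)
plt.show()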
Example 2: Gaussian model
h = 4
sigma = 5
N = 201
D = np.linspace(-30, 30, N)
likelihood = stats.norm.pdf(D,h,sigma)
plt.figure(figsize = (8,6))
plt.plot(D, likelihood, color = u'#1f77b4')
plt.axvline(h, color = 'k', linestyle = '--')
plt.xlabel('D', fontsize = 15)
plt.ylabel('P(D|H)', fontsize = 15)
plt.title('Given h = {}'.format(h), fontsize = 15)
plt.show()
H = np.linspace(-30, 30, N)               # grid of hypothesized means
prior = normalize(np.repeat(1, N), H)     # flat prior over H
d = np.random.normal(0, sigma)            # one observation (true mean is 0)
likelihood = stats.norm.pdf(d, H, sigma)  # likelihood of d under each hypothesis
posterior = likelihood * prior
posterior = normalize(posterior, H)
plt.figure(figsize = (8,6))
plt.plot(H, prior, color = u'#1f77b4', linestyle = '--', label = 'prior')
plt.plot(H, posterior, color = u'#ff7f0e', label = 'posterior')
plt.xlim([-15,15])
plt.xlabel('H', fontsize = 15)
plt.ylabel('P(H|D)', fontsize = 15)
plt.title('Given d = {0:1.2f}'.format(d), fontsize = 15)
plt.legend(fontsize = 15)
plt.show()
def Gaussian_model(d, H, prior):
    # one Bayesian update with a Gaussian likelihood (known sigma)
    likelihood = stats.norm.pdf(d, H, sigma)
    posterior = likelihood * prior
    return normalize(posterior, H)
fig, ax = plt.subplots(ncols = 3, nrows = 3, figsize = (14,14))
ax = np.ravel(ax)
for n in range(9):
    observed = np.random.normal(0, sigma)
    posterior = Gaussian_model(observed, H, prior)
    ax[n].plot(H, prior, '--')
    ax[n].plot(H, posterior)
    ax[n].set_title('observed: %1.2f' % observed)
    ax[n].set_ylim([0,0.3])
    ax[n].set_xlim([-15,15])
    prior = posterior
plt.show()
prior = normalize(np.repeat(1, N), H)
observation = []
for _ in range(20):
    d = np.random.normal(0, sigma)
    observation.append(d)
    posterior = Gaussian_model(d, H, prior)
    prior = posterior
print(np.mean(observation),'\n')
plt.figure(figsize = (8,6))
plt.plot(H, posterior, color = u'#ff7f0e')
plt.plot(observation, np.zeros(np.size(observation)), 'k+')
plt.xlim([-15, 15])
plt.xlabel('H', fontsize = 15)
plt.ylabel('P(H|D)', fontsize = 15)
plt.show()
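With a flat prior and known $\sigma$, the posterior over the mean after $n$ observations has the closed form $\mathcal{N}\left(\bar{d},\, \sigma^2/n\right)$, centered at the sample mean. A quick check, reusing the variables from the cell above:
n = len(observation)
plt.figure(figsize = (8,6))
plt.plot(H, posterior, color = u'#ff7f0e', label = 'grid posterior')
plt.plot(H, stats.norm.pdf(H, np.mean(observation), sigma/np.sqrt(n)), 'k--', label = 'closed form')
plt.xlim([-15, 15])
plt.xlabel('H', fontsize = 15)
plt.legend(fontsize = 13)
plt.show()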