Parameter Estimation in Probabilistic Models
Table of Contents
Reconstructing the probability density function from a set of given data samples $y_1, y_2, \cdots, y_m$
We want to recover the underlying probability density function that generated our dataset
$$P \left(y_1,y_2,\cdots,y_m \,;\, \mu,\sigma^2 \right) = \prod_{i=1}^{m} P \left(y_i \,;\, \mu,\sigma^2 \right)$$
A method of estimating the parameters of a probability distribution by maximizing a likelihood function
The parameters are chosen so that, under the assumed statistical model, the observed data is most probable
$$\mathcal{L} = P(D \mid \theta) = P(D \,;\, \theta)$$
$$\theta_{MLE} = \underset{\theta}{\mathrm{argmax}}\;\; P(D \,;\, \theta)$$
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
%matplotlib inline
# MLE of a Gaussian distribution
# estimate mu (sigma is assumed known)
m = 20
mu = 0
sigma = 5

x = np.random.normal(mu, sigma, [m, 1])     # observed samples
xp = np.linspace(-20, 20, 100)
y0 = np.zeros([m, 1])

muhat = [-5, 0, 5, np.mean(x)]              # candidate estimates of mu

plt.figure(figsize=(8, 8))
for i in range(4):
    yp = norm.pdf(xp, muhat[i], sigma)      # candidate density
    y = norm.pdf(x, muhat[i], sigma)        # likelihood of each sample
    logL = np.sum(np.log(y))                # log-likelihood

    plt.subplot(4, 1, i+1)
    plt.plot(xp, yp, 'r')
    plt.plot(x, y, 'bo')
    plt.plot(np.hstack([x, x]).T, np.hstack([y, y0]).T, 'k--')
    plt.title(r'$\hat\mu$ = {0:.2f}'.format(muhat[i]), fontsize=15)
    plt.text(-15, 0.06, np.round(logL, 4), fontsize=15)
    plt.axis([-20, 20, 0, 0.11])

plt.tight_layout()
plt.show()
Compare to the result from the closed-form formula
$$\mu_{MLE}=\frac{1}{m}\sum_{i = 1}^{m}x_i$$
# mean is unknown in this example
# variance is known in this example
m = 10
mu = 0
sigma = 5

x = np.random.normal(mu, sigma, [m, 1])     # observed samples

mus = np.arange(-10, 10.5, 0.5)             # candidate values of mu

LOGL = []
for i in range(np.size(mus)):
    y = norm.pdf(x, mus[i], sigma)          # likelihood of each sample
    logL = np.sum(np.log(y))                # log-likelihood
    LOGL.append(logL)

muhat = np.mean(x)                          # closed-form MLE of mu
print(muhat)

plt.figure(figsize=(10, 6))
plt.plot(mus, LOGL, '.')
plt.title(r'$\log (\prod \mathcal{N}(x \mid \mu , \sigma^2))$', fontsize=20)
plt.xlabel(r'$\hat \mu$', fontsize=15)
plt.grid(alpha=0.3)
plt.show()
# mean is known in this example
# variance is unknown in this example
m = 100
mu = 0
sigma = 3

x = np.random.normal(mu, sigma, [m, 1])     # samples

sigmas = np.arange(1, 10, 0.1)              # candidate values of sigma

LOGL = []
for i in range(sigmas.shape[0]):
    y = norm.pdf(x, mu, sigmas[i])          # likelihood of each sample
    logL = np.sum(np.log(y))                # log-likelihood
    LOGL.append(logL)

sigmahat = np.sqrt(np.var(x))               # closed-form MLE of sigma
print(sigmahat)

plt.figure(figsize=(10, 6))
plt.title(r'$\log (\prod \mathcal{N} (x \mid \mu,\sigma^2))$', fontsize=20)
plt.plot(sigmas, LOGL, '.')
plt.xlabel(r'$\hat \sigma$', fontsize=15)
plt.axis([0, np.max(sigmas), np.min(LOGL), -200])
plt.grid(alpha=0.3)
plt.show()
$$\begin{align*}P\left(y \mid x\right) & \sim \mathcal{N}\left(Cx,R\right)\\
&= \frac{1}{\sqrt{\left(2\pi\right)^n\vert R \vert}}\exp\left(-\frac{1}{2}\left(y-Cx\right)^TR^{-1}\left(y-Cx\right)\right)\end{align*}$$
Maximizing the log-likelihood $\ell(x) = \log P(y \mid x)$ over $x$ amounts to minimizing the quadratic term in the exponent:
$$
\begin{align*}
\left(y-Cx\right)^TR^{-1}\left(y-Cx\right) &= y^TR^{-1}y-y^TR^{-1}Cx-x^TC^TR^{-1}y+x^TC^TR^{-1}Cx \\ \\
\implies \frac{d\ell}{dx} & = 0 = -2C^TR^{-1}y + 2C^TR^{-1}Cx\\
\therefore \;\; \hat{x} &=\left(C^TR^{-1}C\right)^{-1}C^TR^{-1}y
\end{align*}
$$
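A quick numerical sketch of $\hat{x}=\left(C^TR^{-1}C\right)^{-1}C^TR^{-1}y$; the matrix $C$, covariance $R$, and true state below are hypothetical, chosen only for illustration:
x_true = np.array([[2.0], [3.0]])           # hypothetical true state
C = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])                  # assumed measurement matrix
R = np.diag([1.0, 4.0, 0.25])               # assumed noise covariance

w = np.random.multivariate_normal(np.zeros(3), R).reshape(-1, 1)
y = C @ x_true + w                          # noisy measurement

Rinv = np.linalg.inv(R)
xhat = np.linalg.inv(C.T @ Rinv @ C) @ C.T @ Rinv @ y   # weighted least-squares MLE
print(xhat)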
Example of two rulers
x = 5                       # true state (length in this example)
a = 1                       # sigma of ruler a
b = 2                       # sigma of ruler b

YA = []
YB = []
XML = []
for i in range(2000):
    ya = x + np.random.normal(0, a)                     # measurement from ruler a
    yb = x + np.random.normal(0, b)                     # measurement from ruler b
    xml = (1/a**2*ya + 1/b**2*yb)/(1/a**2 + 1/b**2)     # inverse-variance weighted MLE
    YA.append(ya)
    YB.append(yb)
    XML.append(xml)

plt.figure(figsize=(8, 6))
plt.subplot(3, 1, 1), plt.hist(YA, 41, density=True), plt.axis([0, 10, 0, 0.5]),
plt.title(r'$y_a$', fontsize=20), plt.grid(alpha=0.3)
plt.subplot(3, 1, 2), plt.hist(YB, 41, density=True), plt.axis([0, 10, 0, 0.5]),
plt.title(r'$y_b$', fontsize=20), plt.grid(alpha=0.3)
plt.subplot(3, 1, 3), plt.hist(XML, 41, density=True), plt.axis([0, 10, 0, 0.5]),
plt.title(r'$x_{ML}$', fontsize=20), plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
Example of two GPSs
x = np.array([5, 10]).reshape(-1, 1)        # true position
mu = np.array([0, 0])

Ra = np.array([[9, 1],
               [1, 1]])                     # covariance of GPS a
Rb = np.array([[1, 1],
               [1, 9]])                     # covariance of GPS b

Ra_inv = np.linalg.inv(Ra)
Rb_inv = np.linalg.inv(Rb)

YA = []
YB = []
XML = []
for i in range(1000):
    ya = x + np.random.multivariate_normal(mu, Ra).reshape(-1, 1)
    yb = x + np.random.multivariate_normal(mu, Rb).reshape(-1, 1)
    xml = np.linalg.inv(Ra_inv + Rb_inv) @ (Ra_inv @ ya + Rb_inv @ yb)   # fused MLE
    YA.append(ya.T)
    YB.append(yb.T)
    XML.append(xml.T)

YA = np.vstack(YA)
YB = np.vstack(YB)
XML = np.vstack(XML)

plt.figure(figsize=(10, 6))
plt.title('Data Fusion', fontsize=15)
plt.plot(YA[:, 0], YA[:, 1], 'b.', label='Observation 1')
plt.plot(YB[:, 0], YB[:, 1], 'r.', label='Observation 2')
plt.plot(XML[:, 0], XML[:, 1], 'k.', label='MLE')
plt.axis('equal')
plt.grid(alpha=0.3)
plt.legend(fontsize=15)
plt.show()
%%html
<center><iframe src="https://www.youtube.com/embed/52jlBrAcw9Q?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
$$ P(D) = \int P(\theta)P(D \mid \theta) d\theta $$
$$ \theta_{MAP} = \underset{\theta}{\mathrm{argmax}} \;\; \left\{\sum_{i=1}^{m}\log{P \left(d_i \mid \theta \right)} + \log{P(\theta) }\right\} $$
$$ \theta_{MLE} = \underset{\theta}{\mathrm{argmax}} \;\; \left\{\sum_{i=1}^{m}\log{P \left(d_i \mid \theta \right)}\right\} $$
$$ x_i \sim \mathcal{N}(\theta,\sigma^2) $$
$$\therefore \;\theta_{MAP} = \frac{\frac{m}{\sigma^2}}{\frac{m}{\sigma^2}+1}\bar{x}+\frac{1}{\frac{m}{\sigma^2}+1}\mu \;\;\;\text{: look familiar?}$$
Note: prior knowledge
Example) Experiment in class
Suppose that $\theta$ is a random variable with prior $\theta \sim \mathcal{N}(\mu,1^2)$ (our prior knowledge): $\theta$ is unknown, while $\mu$ and $\sigma^2$ are known.
$$ x_i \sim \mathcal{N}(\theta,\sigma^2) $$
# known
mu = 5
sigma = 2
# unknown theta
theta = np.random.normal(mu,1)
x = np.random.normal(theta, sigma)
print('theta = {:.4f}'.format(theta))
print('x = {:.4f}'.format(x))
# MAP estimate with m observations
m = 4
X = np.random.normal(theta, sigma, [m, 1])
xbar = np.mean(X)

# with prior variance 1: theta_MAP = m/(m + sigma^2)*xbar + sigma^2/(m + sigma^2)*mu
theta_MAP = m/(m + sigma**2)*xbar + sigma**2/(m + sigma**2)*mu

print('mu = {}'.format(mu))
print('xbar = {:.4f}'.format(xbar))
print('theta_MAP = {:.4f}'.format(theta_MAP))
# draw theta from the prior, then generate many samples
mu = 5
theta = np.random.normal(mu, 1)
sigma = 2

m = 2000
X = np.random.normal(theta, sigma, [m, 1])

print('theta = {:.4f}'.format(theta))

plt.figure(figsize=(10, 6))
plt.hist(X, 31)
plt.xlim([-5, 15])
plt.show()
m = 50
XMLE = []
XMAP = []
for k in range(m):
    n = k + 1                                                   # number of samples used so far
    xmle = np.mean(X[0:n, 0])                                   # MLE with n samples
    xmap = n/(n + sigma**2)*xmle + sigma**2/(n + sigma**2)*mu   # MAP with prior N(mu, 1)
    XMLE.append(xmle)
    XMAP.append(xmap)

print('theta = {:.4f}'.format(theta))

plt.figure(figsize=(10, 6))
plt.plot(XMLE)
plt.plot(XMAP)
plt.legend(['MLE', 'MAP'])
plt.show()
$$y = Ax + \omega$$
$x$ is what we want to estimate
$y$ is measured
$A$ characterizes sensors or measurements
$\omega$ is sensor noise
Common assumptions
$x \sim \mathcal{N}(\mu_x, \Sigma_x)$ $\rightarrow$ for simplicity $x \sim \mathcal{N}(0, \Sigma_x)$
$\omega \sim \mathcal{N}(\mu_{\omega}, \Sigma_{\omega})$ $\rightarrow$ usually $\omega \sim \mathcal{N}(0, \Sigma_{\omega})$ is assumed
$x$ and $\omega$ are independent
$$\begin{bmatrix} x \\ \omega \end{bmatrix} \sim \mathcal{N}\left( \begin{bmatrix} \mu_x \\ \mu_{\omega}\end{bmatrix}, \begin{bmatrix} \Sigma_{x}& 0\\ 0& \Sigma_{\omega} \end{bmatrix} \right)$$
$$\begin{bmatrix} x \\ y \end{bmatrix} = \begin{bmatrix} I & 0 \\A & I \end{bmatrix} \begin{bmatrix} x \\ \omega \end{bmatrix}$$
$$\Sigma_y = \underbrace{ A\Sigma_xA^T}_{\text{signal covariance}} + \underbrace{\Sigma_{\omega}}_{\text{noise covariance}}, \qquad \Sigma_{xy} = \Sigma_x A^T$$
$$x \mid y \sim \mathcal{N} \left(\mu_x + \Sigma_{xy}\Sigma_{y}^{-1}(y-\mu_y), \; \Sigma_{x} - \Sigma_{xy}\Sigma_{y}^{-1}\Sigma_{yx} \right)$$
$$
\begin{align*}
\hat{x} = \phi(y) &= E[x \mid y]\\
& = \mu_x + \Sigma_{xy}\Sigma_{y}^{-1}(y-\mu_y)\\
& = \mu_x + \Sigma_x A^T(A\Sigma_xA^T + \Sigma_{\omega})^{-1}(y-\mu_y)\\
& = \mu_x + K(y-\mu_y)\\\\
\text{cov}(x \mid y) &= \Sigma_{x} - \Sigma_{xy}\Sigma_{y}^{-1}\Sigma_{yx}\\\\
&= \Sigma_{x} - \Sigma_x A^T (A\Sigma_xA^T + \Sigma_{\omega})^{-1} A \Sigma_x\leq \Sigma_x
\end{align*}
$$
Interpretation
$\mu_x$ is our best prior guess of $x$ (before measurement)
$y - \mu_y$ is the discrepancy between what we actually measure ($y$) and the expected value of $y$
Estimator modifies prior guess by $K$ times this discrepancy
Estimator blends prior information with measurement
Covariance of estimation error is always less than prior covariance of $x$
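A minimal numerical sketch of this estimator; the matrices $A$, $\Sigma_x$, and $\Sigma_\omega$ below are hypothetical, chosen only for illustration, with $\mu_x = 0$ so that $\mu_y = 0$:
A = np.array([[1.0, 0.0],
              [0.0, 1.0],
              [1.0, 1.0]])                  # assumed measurement matrix
Sx = np.diag([4.0, 4.0])                    # assumed prior covariance of x
Sw = np.diag([1.0, 1.0, 1.0])               # assumed noise covariance

Sy = A @ Sx @ A.T + Sw                      # covariance of y
K = Sx @ A.T @ np.linalg.inv(Sy)            # estimator gain

x = np.random.multivariate_normal(np.zeros(2), Sx).reshape(-1, 1)
w = np.random.multivariate_normal(np.zeros(3), Sw).reshape(-1, 1)
y = A @ x + w                               # simulated measurement

xhat = K @ y                                # mu_x + K(y - mu_y) with zero means
cov_post = Sx - K @ A @ Sx                  # posterior covariance (<= Sx)

print(x.T, xhat.T)
print(cov_post)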
$$\text{var}(y_a) = \sigma_a^2\\
\text{var}(y_b) = \sigma_b^2$$
$$
\begin{align*}
\hat x_1 &= y_a\\
\hat{\sigma}_{1}^2 &= \sigma_a^2
\end{align*}
$$
$$
\begin{align*}
E[\hat x] &= \frac{\frac{1}{\sigma_a^2} y_a + \frac{1}{\sigma_b^2}y_b}{\frac{1}{\sigma_a^2} + \frac{1} {\sigma_b^2}} = \frac{\frac{1}{\sigma^2_a}}{\frac{1}{\sigma^2_a} + \frac{1}{\sigma^2_b}}y_a + \frac{\frac{1}{\sigma^2_b}}{\frac{1}{\sigma^2_a} + \frac{1}{\sigma^2_b}}y_b \quad \text{and set } K = \frac{\frac{1}{\sigma_b^2}}{\frac{1}{\sigma_a^2} + \frac{1}{\sigma_b^2}}\\\\
\hat{x}_2 = E[\hat x] &= (1-K) y_a + K y_b\\
&= (1-K) \hat x_1 + K y_b\\
&= \hat x_1 + K (y_b - \hat x_1 )
\end{align*}
$$
$$\underbrace{\hat x_2}_{\text{final estimation}} = \underbrace{\hat x_1}_{\text{prediction}} + K ( \overbrace{ \underbrace{y_b}_{\text{measured}} - \hat x_1}^{\text{prediction error}})$$
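A quick numerical sketch of this recursive update, reusing the two-ruler setup from above (true length 5, $\sigma_a = 1$, $\sigma_b = 2$); the variable names here are illustrative only:
x_true = 5                                  # true length
sa, sb = 1, 2                               # sigma of ruler a and ruler b

ya = x_true + np.random.normal(0, sa)       # first measurement
yb = x_true + np.random.normal(0, sb)       # second measurement

xhat1 = ya                                  # prediction from the first measurement
K = (1/sb**2)/(1/sa**2 + 1/sb**2)           # gain

xhat2 = xhat1 + K*(yb - xhat1)              # final estimate = prediction + K * (prediction error)
print(xhat1, xhat2)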
$$
\begin{align*}
x_{t+1} &= A x_t + B u_t\\
y_t &= C x_t
\end{align*}
$$
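A small sketch of this state-space model; the matrices A, B, C and the input below are hypothetical, chosen only to show the recursion:
A = np.array([[1.0, 1.0],
              [0.0, 1.0]])                  # hypothetical state transition matrix
B = np.array([[0.0],
              [1.0]])                       # hypothetical input matrix
C = np.array([[1.0, 0.0]])                  # hypothetical output matrix

x_t = np.array([[0.0],
                [1.0]])                     # initial state
for t in range(5):
    u_t = np.array([[0.1]])                 # constant input for illustration
    y_t = C @ x_t                           # measurement y_t = C x_t
    print(t, x_t.ravel(), y_t.ravel())
    x_t = A @ x_t + B @ u_t                 # state update x_{t+1} = A x_t + B u_t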
Summary
%%html
<center><iframe src="https://www.youtube.com/embed/rf3DKqWajWY?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/5yUjYCkm2jI?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
Start with prior beliefs, which can be thought of as a summary of opinions.
Given new observations, we update our prior and produce a new opinion (the posterior).
Iterate
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')