Statistics for Machine Learning
Table of Contents
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
## random number generation (1D)
m = 1000  # number of samples drawn from each distribution

# U(0,1): uniform samples, stored as an (m, 1) column
x1 = np.random.rand(m, 1)

# U(a,b): rescale U(0,1) by the interval width (b - a), then shift by a
a, b = 1, 5
x2 = (b - a) * np.random.rand(m, 1) + a

# N(0,1^2): standard normal (Gaussian) samples, stored as an (m, 1) column
# (np.random.normal(0, 1, m) is the flat, length-m alternative)
x3 = np.random.randn(m, 1)

# N(5,2^2): scale the standard normal by sigma = 2, then shift by mu = 5
x4 = 2 * np.random.randn(m, 1) + 5

# random integers drawn from 1..5 (the upper bound 6 is exclusive),
# stored as a (1, m) row
x5 = np.random.randint(1, 6, size=(1, m))
$ \Rightarrow$ rough sense of density of data
# statistics
# numerically understand statistics
m = 100
x = np.random.rand(m, 1)

# Sample mean by hand. x has a single column, so summing every entry is
# equivalent to the column-wise form 1/m*np.sum(x, axis=0).
xbar = 1/m*np.sum(x)
mean_np = np.mean(x)

# Unbiased sample variance by hand (divides by m - 1).
varbar = (1/(m - 1))*np.sum((x - xbar)**2)
# np.var divides by m by default; ddof=1 makes it divide by m - 1 so that
# it is directly comparable with varbar.
var_np = np.var(x, ddof=1)

# Each hand-computed value is followed by its NumPy counterpart.
print(xbar)
print(mean_np)
print(varbar)
print(var_np)
# various sample size m
m = np.arange(10, 2000, 20)

# For every sample size, draw a fresh sample from N(10, 30^2) and record
# its sample mean.
means = []
for i in m:
    x = np.random.normal(10, 30, i)
    means.append(np.mean(x))

# Sample mean against sample size; the dashed line marks the mean (10)
# of the distribution the samples are drawn from.
plt.figure(figsize = (10,6))
plt.plot(m, means, 'bo', markersize = 4)
plt.axhline(10, c = 'k', linestyle='dashed')
plt.xlabel('# of samples (= sample size)', fontsize = 15)
plt.ylabel('sample mean', fontsize = 15)
plt.ylim([0, 20])
plt.show()
$$ \bar{x} =\frac{x_1+x_2+...+x_m}{m}$$
$$ \bar{x} \rightarrow N\left(\mu_x,\left(\frac{\sigma}{\sqrt{m}}\right)^2 \right) $$
N = 100                        # number of repeated experiments
m = np.array([10, 40, 160])    # sample of size m
S1, S2, S3 = [], [], []        # sample means (sample averages), one list per size

# In every experiment, draw one U(0,1) sample of each size and store its mean.
for _ in range(N):
    for size, bucket in zip(m, (S1, S2, S3)):
        bucket.append(np.mean(np.random.rand(size, 1)))

# One histogram of the collected sample means per sample size, side by side
# on a shared x-range of [0, 1].
plt.figure(figsize=(10, 6))
for panel, (size, bucket) in enumerate(zip(m, (S1, S2, S3)), start=1):
    plt.subplot(1, 3, panel)
    plt.hist(bucket, 21)
    plt.xlim([0, 1])
    plt.title('m = ' + str(size))
    plt.yticks([])
plt.show()
$$
x^{(i)} = \begin{bmatrix}x_1^{(i)} \\ x_2^{(i)}\\ \vdots \end{bmatrix}, \quad X = \begin{bmatrix} -& (x^{(1)})^T & -\\ - & (x^{(2)})^T & -\\ & \vdots & \\ - & (x^{(m)})^T & -\end{bmatrix}$$
$$
\begin{align*}
\text{sample mean} \; \bar x &= \frac{x^{(1)} + x^{(2)} + \cdots + x^{(m)}}{m} = \frac{1}{m} \sum\limits_{i=1}^{m}x^{(i)} \\
\text{sample variance} \; S^2 &= \frac{1}{m-1} \sum\limits_{i=1}^{m}(x^{(i)} - \bar x)^2 \\
(\text{Note: } &\text{population variance} \; \sigma^2 = \frac{1}{N}\sum\limits_{i=1}^{N}(x^{(i)} - \mu)^2)
\end{align*}
$$
$$
\begin{align*} \text{Sample Variance} : S_x &= \frac{1}{m-1} \sum\limits_{i=1}^{m}\left(x^{(i)}-\bar x\right)^2 \\
\text{Sample Covariance} : S_{xy} &= \frac{1}{m-1} \sum\limits_{i=1}^{m}\left(x^{(i)}-\bar x\right)\left(y^{(i)}-\bar y \right)\\
\text{Sample Covariance matrix} : S &=
\begin{bmatrix}
S_x & S_{xy} \\
S_{yx} & S_y
\end{bmatrix}\\
\text{sample correlation coefficient} :
r &= \frac{S_{xy}}{ \sqrt {S_x\cdot S_y} }
\end{align*}$$
$+1 \to$ close to a straight line with positive slope
$-1 \to$ close to a straight line with negative slope
Indicates how close the data are to a straight line, but
gives no information about the steepness of the slope
$$
\Sigma = \begin{bmatrix}
E[(X_1-\mu_1)(X_1-\mu_1)]& E[(X_1-\mu_1)(X_2-\mu_2)] & \cdots &E[(X_1-\mu_1)(X_n-\mu_n)]\\
E[(X_2-\mu_2)(X_1-\mu_1)]& E[(X_2-\mu_2)(X_2-\mu_2)] & \cdots &E[(X_2-\mu_2)(X_n-\mu_n)]\\
\vdots & \vdots & \ddots & \vdots\\
E[(X_n-\mu_n)(X_1-\mu_1)]& E[(X_n-\mu_n)(X_2-\mu_2)] & \cdots &E[(X_n-\mu_n)(X_n-\mu_n)]\\
\end{bmatrix}$$
# correlation coefficient
# Two independent U(0,1) samples, plus sorted copies that force a
# monotone relationship between the plotted pairs.
m = 300
x = np.random.rand(m)
y = np.random.rand(m)

xo = np.sort(x)       # x in ascending order
yo = np.sort(y)       # y in ascending order
yor = -np.sort(-y)    # y in descending order

# Overlay the three pairings in one scatter plot.
plt.figure(figsize=(8, 8))
pairings = (
    (x, y, 'ko', 'random'),
    (xo, yo, 'ro', 'sorted'),
    (xo, yor, 'bo', 'reversely ordered'),
)
for xs, ys, marker, name in pairings:
    plt.plot(xs, ys, marker, label=name)
plt.xticks([])
plt.yticks([])
plt.xlabel('x', fontsize=20)
plt.ylabel('y', fontsize=20)
plt.axis('equal')
plt.legend(fontsize=12)
plt.show()

# 2x2 correlation matrix of each pairing; the off-diagonal entry is r.
print(np.corrcoef(x, y), '\n')
print(np.corrcoef(xo, yo), '\n')
print(np.corrcoef(xo, yor))
# correlation coefficient
# Same experiment with normally distributed data: x is a standard normal
# sample scaled by 2, y is a standard normal sample.
m = 300
x = 2*np.random.randn(m)
y = np.random.randn(m)

xo = np.sort(x)       # x in ascending order
yo = np.sort(y)       # y in ascending order
yor = -np.sort(-y)    # y in descending order

# Overlay the three pairings in one scatter plot.
plt.figure(figsize=(8, 8))
pairings = (
    (x, y, 'ko', 'random'),
    (xo, yo, 'ro', 'sorted'),
    (xo, yor, 'bo', 'reversely ordered'),
)
for xs, ys, marker, name in pairings:
    plt.plot(xs, ys, marker, label=name)
plt.xticks([])
plt.yticks([])
plt.xlabel('x', fontsize=20)
plt.ylabel('y', fontsize=20)
plt.axis('equal')
plt.legend(fontsize=12)
plt.show()

# 2x2 correlation matrix of each pairing; the off-diagonal entry is r.
print(np.corrcoef(x, y), '\n')
print(np.corrcoef(xo, yo), '\n')
print(np.corrcoef(xo, yor))
import seaborn as sns
import pandas as pd
# Collect the four series as labelled columns of a DataFrame and hand it
# to seaborn's pairplot.
columns = ('col. 1', 'col. 2', 'col. 3', 'col. 4')
d = dict(zip(columns, (x, xo, yo, yor)))
df = pd.DataFrame(d)
sns.pairplot(df)
plt.show()
%%javascript
// Fetch and run a remote script through $.getScript. Judging by its file
// name (ipython_notebook_toc.js) it presumably builds the notebook's table
// of contents -- TODO confirm.
// NOTE(review): this executes third-party code fetched over the network.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')