Table of Contents
Correlation of Two Random Variables
Correlation Coefficient Plot
Covariance Matrix
How?
idea: highly correlated data contains redundant features
Now consider a change of axes
Each example $x$ has 2 features $\{u_1,u_2\}$
Consider ignoring the feature $u_2$ for each example
Each 2-dimensional example $x$ now becomes 1-dimensional $x = \{u_1\}$
Are we losing much information by throwing away $u_2$ ?
No. Most of the data spread is along $u_1$ (very little variance along $u_2$)
Find unit vector $u$ such that maximizes variance of projections
Note: $m \approx m-1$ for big data
Choose top $k$ (orthonormal) eigenvectors, $U = [u_1, u_2, \cdots, u_k]$
Project $x_i$ onto span $\{ u_1, u_2, \cdots, u_k\}$
$\qquad \qquad \qquad \qquad x^{(i)} \rightarrow$ projection onto unit vector $u \implies u^Tx^{(i)} = $ distance from the origin along $u$
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# data generation
m = 5000
mu = np.array([0, 0])
sigma = np.array([[3, 1.5],
[1.5, 1]])
X = np.random.multivariate_normal(mu, sigma, m)
X = np.asmatrix(X)
fig = plt.figure(figsize = (6, 4))
plt.plot(X[:,0], X[:,1], 'k.', alpha = 0.3)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.show()
S = 1/(m-1)*X.T*X
D, U = np.linalg.eig(S)
# idx = np.argsort(-D)
# D = D[idx]
# U = U[:,idx]
print(D, '\n')
print(U)
h = U[1,0]/U[0,0]
xp = np.arange(-6, 6, 0.1)
yp = h*xp
plt.figure(figsize = (6, 4))
plt.plot(X[:,0], X[:,1], 'k.', alpha = 0.3)
plt.plot(xp, yp, 'r', linewidth = 3)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.show()
Z = X*U[:,0]
plt.figure(figsize = (6, 4))
plt.hist(Z, 51)
plt.xlim([-8, 8])
plt.show()
Z = X*U[:,1]
plt.figure(figsize = (6, 4))
plt.hist(Z, 51)
plt.xlim([-8, 8])
plt.show()
from sklearn.decomposition import PCA
X = np.array(X)
pca = PCA(n_components = 1)
pca.fit(X)
u = pca.transform(X)
plt.figure(figsize = (6, 4))
plt.hist(u, 51)
plt.xlim([-8, 8])
plt.show()
%%html
<center><iframe
width="560" height="315" src="https://www.youtube.com/embed/Pkit-64g0eU" frameborder="0" allowfullscreen>
</iframe></center>
%%html
<center><iframe
width="560" height="315" src="https://www.youtube.com/embed/x4lvjVjUUqg" frameborder="0" allowfullscreen>
</iframe></center>
%%html
<center><iframe
width="560" height="315" src="https://www.youtube.com/embed/2t62WkNIqxY" frameborder="0" allowfullscreen>
</iframe></center>
Download files
from google.colab import drive
drive.mount('/content/drive')
from six.moves import cPickle
X = cPickle.load(open('/content/drive/MyDrive/ML_Colab/ML_data/pca_spring.pkl','rb'))
X = np.asmatrix(X.T)
print(X.shape)
m = X.shape[0]
plt.figure(figsize = (6, 4))
plt.subplot(1,3,1)
plt.plot(X[:, 0], -X[:, 1], 'r')
plt.axis('equal')
plt.title('Camera 1')
plt.subplot(1,3,2)
plt.plot(X[:, 2], -X[:, 3], 'b')
plt.axis('equal')
plt.title('Camera 2')
plt.subplot(1,3,3)
plt.plot(X[:, 4], -X[:, 5], 'k')
plt.axis('equal')
plt.title('Camera 3')
plt.show()
X = X - np.mean(X, axis = 0)
S = 1/(m-1)*X.T*X
D, U = np.linalg.eig(S)
idx = np.argsort(-D)
D = D[idx]
U = U[:,idx]
print(D, '\n')
print(U)
plt.figure(figsize = (6, 4))
plt.stem(np.sqrt(D))
plt.grid(alpha = 0.3)
plt.show()
# relative magnitutes of the principal components
Z = X*U
xp = np.arange(0, m)/24 # 24 frame rate
plt.figure(figsize = (6, 4))
plt.plot(xp, Z)
plt.yticks([])
plt.show()
## projected onto the first principal component
# 6 dim -> 1 dim (dim reduction)
# relative magnitute of the first principal component
Z = X*U[:,0]
plt.figure(figsize = (6, 4))
plt.plot(Z)
plt.yticks([])
plt.show()
Geometry of Linear Maps
Matrix $A$ (or linear transformation) = rotate + stretch/compress
an extremely important fact:
Singular Values and Singular Vectors
the numbers $\sigma_1, \cdots, \sigma_n$ are called the singular values of $A$ by convention, $\sigma_i > 0$
the vectors $u_1, \cdots, u_n$ are unit vectors along the principal semiaxes of $AS$
the vectors $v_1, \cdots, v_n$ are the preimages of the principal semiaxes, defined so that
Thin Singular Value Decomposition
We have $A \in \mathbb{R}^{m \times n}$, skinny and full rank (i.e., $r = n$), and
let
in matrix form, the above equation is $AV = \hat U \hat \Sigma$ and since $V$ is orthogonal
called the thin (or reduced) SVD of $A$
This is the (full) singular value decomposition of $A$
Every matrix $A$ has a singular value decomposition.
If $A$ is not full rank, then some of the diagonal entries of $\Sigma$ will be zero
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
[U, S, VT] = np.linalg.svd(A, full_matrices = True)
print(U.shape, S.shape, VT.shape, '\n')
print(U, '\n')
print(S, '\n')
print(VT)
$$ VS V^T \quad \left(\text{where }S = \frac{1}{m}X^TX = \left(\frac{X}{\sqrt{m}}\right)^T \frac{X}{\sqrt{m}} = A^TA \right) $$
A = np.array([[4, 4],
[-3, 3]])
A = np.asmatrix(A)
[D, V] = np.linalg.eig(A.T*A)
print(V, '\n')
print(D)
[D, U] = np.linalg.eig(A*A.T)
print(U, '\n')
print(D)
%%html
<center><iframe src="https://www.youtube.com/embed/Nx0lRBaXoz4?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
Learn how to reduce the amount of storage needed for an image using the singular value decomposition.
Approximation of $A$
Downlaod the image file
from google.colab import drive
drive.mount('/content/drive')
import cv2
img = cv2.imread('/content/drive/MyDrive/ML_Colab/ML_data/kaist_me.jpg', 0)
print(img.shape)
plt.figure(figsize = (6, 4))
plt.imshow(img, 'gray')
plt.axis('off')
plt.show()
A = img
[U, S, VT] = np.linalg.svd(A, full_matrices = False)
U = np.asmatrix(U)
S = np.asmatrix(np.diag(S))
VT = np.asmatrix(VT)
print(U.shape, S.shape, VT.shape, '\n')
sig_val = np.diag(S)
plt.figure(figsize = (6, 4))
plt.stem(sig_val)
plt.title('Singular Values')
plt.show()
Approximated image when k = 20
k = 20
low_U = U[:, :k]
low_S = S[:k, :k]
low_VT = VT[:k, :]
A_hat = low_U*low_S*low_VT
plt.figure(figsize = (8, 4))
plt.subplot(1,2,1), plt.imshow(A, 'gray'), plt.axis('off'), plt.title('Original image')
plt.subplot(1,2,2), plt.imshow(A_hat, 'gray'), plt.axis('off'), plt.title('Approximated image w/ rank = 20')
plt.show()
Approximated images with k varied
K = [1, 2, 3, 4, 10, 20, 30, 100]
plt.figure(figsize = (10, 4))
for i in range(len(K)):
low_U = U[:, :K[i]]
low_S = S[:K[i], :K[i]]
low_VT = VT[:K[i], :]
A_hat = low_U*low_S*low_VT
plt.subplot(2,4,i+1), plt.imshow(A_hat, 'gray'), plt.axis('off')
plt.title('rank {}'.format(K[i]))
plt.show()
The data provided in this problem is a set of sequential pictures (or frames) taken from a video of people walking. The total number of pictures is 20. In this problem, you are asked to get rid of people in the pictures and only extract background from pictures.
(a) Plot 'capimage_0.jpg' image in grayscale.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
data_path = '/content/drive/MyDrive/ML_Colab/ML_data'
img = cv2.imread(data_path + '/capimage_0.jpg', 0)
plt.figure(figsize = (6, 4))
plt.imshow(img, 'gray')
plt.axis('off')
plt.show()
print(img.shape)
row = img.shape[0]
col = img.shape[1]
(b) Load the image into grayscale and reshape each picture as (row $\times$ col, 1), then horizontally stack them to form matrix $A$ with a shape of (row $\times$ col, 20).
for i in range(20):
img = cv2.imread(data_path + '/capimage_' + str(i) + '.jpg', 0)
flat_img = img.reshape(-1, 1)
if i == 0:
A = flat_img
else:
A = np.hstack((A, flat_img))
A.shape
(c) Compute SVD of matrix $A$ and plot the sigular values.
U, S, VT = np.linalg.svd(A, full_matrices = False)
U = np.asmatrix(U)
S = np.asmatrix(np.diag(S))
VT = np.asmatrix(VT)
print(U.shape, S.shape, VT.shape, '\n')
plt.figure(figsize = (6, 4))
plt.stem(np.diag(S))
plt.title('Singular Values')
plt.show()
(d) Apply low rank approximation to matrix $A$ with $k=1$, and then plot $\hat A[:,0]$ with a reshape of (row, col).
A_hat = U[:,0] * S[0,0] * VT[0,:]
plt.figure(figsize = (6, 8))
plt.subplot(2,1,1)
plt.imshow(A[:, 0].reshape(row, col), 'gray')
plt.axis('off')
plt.subplot(2,1,2)
plt.imshow(A_hat[:, 0].reshape(row, col), 'gray')
plt.axis('off')
plt.show()
Suppose we have data that is a function of both space and time
How can use this data to decompose this system into its building blocks?
To start to answer these questions, we can start with a simple separation of variables:
Arrangement of Data
Suppose we collect data at spatial locations $x_1, x_2, \cdots, x_n$
Suppose we collect data at times $t_1, t_2, \cdots, t_m$
We can start by assembling the data $y(x,t)$ into an $n \times m$ matrix
$\quad\;\;\;$(typically after first subtracting a mean or equilibrium state)
POD Algorithm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# data generated from the function
x = np.linspace(0, 5, 100)
Nx = np.size(x)
dt = 0.01
t = np.arange(0, 5, dt)
k = 2
omega = 4*np.pi
Y = np.zeros([np.size(x), np.size(t)], dtype = 'd')
for i in range(0, np.size(x)):
for j in range(0, np.size(t)):
Y[i, j] = np.sin(omega*t[j] - k*x[i]) + 0.1*np.random.randn()
# Let's see the signal in time
from matplotlib.animation import FuncAnimation
import time
from IPython import display
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
def animate(i):
ax1.clear()
ax1.plot(x, Y[:,i], linewidth = 2)
ax1.set_xlabel('x')
ax1.set_xlim(0, 5)
ax1.set_ylim(-2, 2)
ax1.grid()
ani = FuncAnimation(fig, animate, frames = int(np.size(t)/1), interval = 10)
video = ani.to_html5_video()
html = display.HTML(video)
display.display(html)
plt.close()
U, S, VT = np.linalg.svd(Y, full_matrices = False)
print(U.shape)
print(S.shape)
print(VT.shape)
plt.semilogy(S, '-o')
plt.xlim(0, 5)
plt.ylabel('Singular Value', fontsize = 15)
plt.xlabel('Index', fontsize = 15)
plt.show()
plt.plot(x, U[:,0])
plt.title('POD Mode 1', fontsize = 12)
plt.xlabel('x', fontsize = 15)
plt.ylim([-2, 2])
plt.grid()
plt.show()
plt.plot(x, U[:,1])
plt.title('POD Mode 2', fontsize = 12)
plt.xlabel('x', fontsize = 15)
plt.ylim([-2, 2])
plt.grid()
plt.show()
plt.plot(t, S[0]*VT[0,:], label = 'a1(t)', linewidth = 2)
plt.plot(t, S[1]*VT[1,:], label = 'a2(t)', linewidth = 2)
plt.xlim(0, 5)
plt.xlabel('time', fontsize = 15)
plt.legend()
plt.title('Mode Coefficients', fontsize = 12)
plt.grid()
plt.show()
U = np.asmatrix(U)
diagS = np.asmatrix(np.diag(S))
VT = np.asmatrix(VT)
print(U.shape, diagS.shape, VT.shape, '\n')
k = 2
low_U = U[:, :k]
low_S = diagS[:k, :k]
low_VT = VT[:k, :]
low_Y = low_U*low_S*low_VT
print(low_U.shape, low_S.shape, low_VT.shape, '\n')
# Let's see the signal in time
from matplotlib.animation import FuncAnimation
import time
from IPython import display
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
def animate(i):
ax1.clear()
ax1.plot(x, low_Y[:,i], linewidth = 2)
ax1.set_xlim(0, 5)
ax1.set_ylim(-2, 2)
ax1.grid()
ani = FuncAnimation(fig, animate, frames = int(np.size(t)/1), interval = 10)
video = ani.to_html5_video()
html = display.HTML(video)
display.display(html)
plt.close()
Data Desciription and its Visualization
ALL.mat - 151 snapshots of u and v velocity as well as vorticity (UALL, VALL, VORTALL).
Datat set for fluid flow past at cylinder at RE = 100
Steven L. Brunton
May 12, 2016
from google.colab import drive
drive.mount('/content/drive')
from scipy import io
flows = io.loadmat('/content/drive/MyDrive/tutorials/CAE및응용역학/24년_춘계학술대회/CYLINDER_ALL.mat')
flows_mat_u = flows['UALL']
flows_mat_v = flows['VALL']
flows_mat_vort = flows['VORTALL']
flows_mat_u.shape
nx = 449
ny = 199
flows_mat_u[:,0].reshape(nx, ny),
plt.subplot(1,3,1)
plt.imshow(flows_mat_u[:,150].reshape(nx, ny), 'gray')
plt.title('u velocity')
plt.axis('off')
plt.subplot(1,3,2)
plt.imshow(flows_mat_v[:,150].reshape(nx, ny), 'gray')
plt.title('v velocity')
plt.axis('off')
plt.subplot(1,3,3)
plt.imshow(flows_mat_vort[:,150].reshape(nx, ny), 'gray')
plt.title('vorticity')
plt.axis('off')
plt.show()
# Let's see the flow in time
from matplotlib.animation import FuncAnimation
import time
from IPython import display
fig = plt.figure()
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2)
ax3 = fig.add_subplot(1, 3, 3)
def animate(i):
ax1.clear()
ax1.imshow(flows_mat_u[:,i].reshape(nx, ny), 'gray')
ax1.set_title('u velocity')
ax1.axis('off')
ax2.clear()
ax2.imshow(flows_mat_v[:,i].reshape(nx, ny), 'gray')
ax2.set_title('v velocity')
ax2.axis('off')
ax3.clear()
ax3.imshow(flows_mat_vort[:,i].reshape(nx, ny), 'gray')
ax3.set_title('vorticity')
ax3.axis('off')
ani = FuncAnimation(fig, animate, frames = int(150/1), interval = 10)
video = ani.to_html5_video()
html = display.HTML(video)
display.display(html)
plt.close()
Typically after first subtracting a mean or equilibrium state
flows_mat_vort_mean = flows_mat_vort.mean(axis = 1)
flows_mat_vort_centered = flows_mat_vort - flows_mat_vort_mean[:, None]
flows_mat_vort_centered_normalized = (flows_mat_vort_centered - flows_mat_vort_centered.min()) / (flows_mat_vort_centered.max() - flows_mat_vort_centered.min())
plt.imshow(flows_mat_vort_mean.reshape(nx, ny), 'gray')
plt.title('vorticity')
plt.axis('off')
plt.show()
flows_mat = flows_mat_vort_centered_normalized
U, S, VT = np.linalg.svd(flows_mat, full_matrices = False)
print(U.shape)
print(S.shape)
print(VT.shape)
plt.semilogy(S, '-o')
plt.ylabel('Singular Value', fontsize = 15)
plt.xlabel('Index', fontsize = 15)
plt.show()
plt.subplot(1,4,1)
plt.imshow(U[:,0].reshape(nx, ny), 'gray')
plt.title('POD Mode 1')
plt.axis('off')
plt.subplot(1,4,2)
plt.imshow(U[:,1].reshape(nx, ny), 'gray')
plt.title('POD Mode 2')
plt.axis('off')
plt.subplot(1,4,3)
plt.imshow(U[:,4].reshape(nx, ny), 'gray')
plt.title('POD Mode 5')
plt.axis('off')
plt.subplot(1,4,4)
plt.imshow(U[:,12].reshape(nx, ny), 'gray')
plt.title('POD Mode 13')
plt.axis('off')
plt.show()
U = np.asmatrix(U)
diagS = np.asmatrix(np.diag(S))
VT = np.asmatrix(VT)
print(U.shape, diagS.shape, VT.shape, '\n')
k = 5
low_U = U[:, :k]
low_S = diagS[:k, :k]
low_VT = VT[:k, :]
print(low_U.shape, low_S.shape, low_VT.shape, '\n')
low_flows_mat = low_U*low_S*low_VT
# Let's see the flow in time
from matplotlib.animation import FuncAnimation
import time
from IPython import display
fig = plt.figure()
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
def animate(i):
ax1.clear()
ax1.imshow(flows_mat[:,i].reshape(nx, ny), 'gray')
ax1.axis('off')
ax2.clear()
ax2.imshow(low_flows_mat[:,i].reshape(nx, ny), 'gray')
ax2.axis('off')
ani = FuncAnimation(fig, animate, frames = int(150/1), interval = 10)
video = ani.to_html5_video()
html = display.HTML(video)
display.display(html)
plt.close()