## Demonstrate calculating PCA from the covariance matrix for a very
## small data set: say time series of four stocks.  Eli Tziperman,
## APM120, 201709
import numpy as np
from numpy import linalg
import scipy as scipy
import matplotlib.pyplot as plt
import matplotlib

## make data:
N = 100000   # number of time points
M = 4        # number of data points (stocks) at a given time
Np = 200     # portion of data to plot

# Create data: two fixed 4-component spatial patterns (V1, V2), each
# modulated in time by a cosine of a different period, so the data are
# exactly rank 2 and the first two PCs should recover them.
V1 = np.array([0, 1.6, 0.0, -1])
V2 = np.array([0.5, 0, 1.5, 0])
V1.shape = (4, 1)   # column vectors so V@row-of-times gives a 4xN field
V2.shape = (4, 1)
t = np.arange(0, N)
tp = np.arange(0, Np)
t.shape = (1, N)    # row vectors of time indices
tp.shape = (1, Np)
F = 3*V1@np.cos(t/3) + V2@np.cos(t/5)   # F is 4xN: rows=stocks, cols=times

## display a small sample of the data:
print('sample data:')
np.set_printoptions(linewidth=132, precision=5)
print(F[:, 0:8])
wait = input("Enter to continue...")

## plot data:
matplotlib.rcParams.update({'font.size': 18})  # default font size
fig1 = plt.figure(1)
plt.clf()
hl1, = plt.plot(tp.T, F[0, tp].T, 'r-', linewidth=2)
hl2, = plt.plot(tp.T, F[1, tp].T, 'g-', linewidth=2)
hl3, = plt.plot(tp.T, F[2, tp].T, 'b-', linewidth=2)
hl4, = plt.plot(tp.T, F[3, tp].T, 'c-', linewidth=2)
plt.legend([hl1, hl2, hl3, hl4], ['F1', 'F2', 'F3', 'F4'])
plt.xlabel('day')
plt.ylabel('price')
plt.title('Stock prices')
plt.pause(0.05)
print('time series of stock prices plotted.')
wait = input("Enter to continue...")

## calculate covariance matrix (data have zero mean by construction, so
## no mean removal is needed here):
# NOTE: the original printed "C=F*F''/N=" intending MATLAB transpose F',
# but adjacent Python string literals concatenate, so the apostrophe was
# silently lost; print the actual Python formula instead.
print('Covariance matrix: C=F@F.T/N=')
print('         red               green              blue               cyan')
C = F@F.T/N
mycolor = ['red  ', 'green', 'blue ', 'cyan ']
for i in range(0, 4):
    print(repr(mycolor[i]), repr(C[i, :]))
wait = input("Enter to continue...")

## Calculate PCs: eigenvectors (columns of U) and eigenvalues (D) of C:
D, U = np.linalg.eig(C)
## sort eigenvalues and vectors by size of abs(eigenvalue) from large to small:
idx = np.abs(D).argsort()
idx = idx[::-1]   # argsort is ascending; reverse for descending order
D = D[idx]
U = U[:, idx]
print('\n\nPCs (columns of U):')
print(U)
wait = input("Enter to continue...")
print('eigen values (D):')
print(D)
wait = input("Enter to continue...")

## project the data onto the PCs to get the time expansion coefficients:
print('dimensional time series of PC coefficients (columns of V):')
print('T=U.T@F;')
T = U.T@F
print(T[:, 0:7])
## plot time expansion coefficients:
fig2 = plt.figure(2)
plt.clf()
hl1, = plt.plot(tp.T, T[0, tp].T, 'm-', linewidth=2)
hl2, = plt.plot(tp.T, T[1, tp].T, 'c-', linewidth=2)
hl3, = plt.plot(tp.T, T[2, tp].T, 'b-', linewidth=2)
hl4, = plt.plot(tp.T, T[3, tp].T, 'k--', linewidth=2)
plt.legend([hl1, hl2, hl3, hl4], ['v1', 'v2', 'v3', 'v4'])
plt.xlabel('time')
plt.ylabel('$v_i(t)$')
plt.title('expansion coefficients $v_{ni}$:')
plt.pause(0.05)
wait = input("Enter to continue...")

## demonstrate reconstruction of data using some of the PCs,
## i.e. dimension reduction:
print('reconstruct data using only two of the PCs (dimension reduction):')
# BUGFIX: the original echoed 'U[:,0:1]@T[0:1,:]' (only one PC), which
# contradicted both the announcement above and the actual computation
# below; the printed formula now matches the code.
print('F_reconstructed=U[:,0:2]@T[0:2,:];')
F_reconstructed = U[:, 0:2]@T[0:2, :]
print("F=")
print(F[:, 0:5])
print("F_reconstructed=")
print(F_reconstructed[:, 0:5])
# Max absolute error; should be ~machine precision since F is rank 2.
print('reconstruction error=np.max(np.abs(F-F_reconstructed)):')
F_reconstruction_error = np.max(np.abs(F - F_reconstructed))
print(F_reconstruction_error)

## variance explained by each PC:
print('Explained variance is given by eigenvalues of covariance matrix,\n '
      + 'variance_explained_by_each_mode=D/sum(D)=')
variance_explained_by_each_mode = D/sum(D)
print(variance_explained_by_each_mode)