AdSense

Friday, July 17, 2020

Understanding Principal Component Analysis in Python

Understanding Principal Component Analysis in Python



0_MacOS_Python_setup.txt
# Install on Terminal of MacOS

# 1. NumPy
#pip3 install -U numpy

# 2. pandas
#pip3 install -U pandas

# 3. matplotlib
#pip3 install -U matplotlib

# 4. scikit-learn (sklearn)
#pip3 install -U scikit-learn

# 5. seaborn
#pip3 install -U seaborn


1_MacOS_Terminal.txt
########## Run Terminal on MacOS and execute
### TO UPDATE
cd "YOUR_WORKING_DIRECTORY"


python3 pca01.py X.csv 2 x1 x2



Input data files



X.csv
x1,x2
-0.6253016177147778,-0.1700636571125488
0.9606950333275389,0.5909005970088841
-0.5985433850075956,-0.40259339311989306
-2.2280593766522214,-0.5325767401368825
-0.4614300598767516,-0.4988672444678538
-0.9589290278490424,-0.2693310237319169
-0.6730799091546266,-0.33830854748013184
1.3050186115481948,0.5913578455364443
0.37454559743575305,-0.09854420488874384
-1.8262862664844461,-0.40617025383032
0.6682622844811562,0.33687739585814436
-0.582646676021939,-0.17736921748075757
-0.4181289762061614,-0.37381138862526
0.17220937064575279,0.2646688363445262
0.3771166872946183,0.18844296908147612
-0.6793962296056062,-0.1316019778374699
1.0314895989546813,0.42555001754472105
0.336041798802642,0.03909827210839272
0.7057459850229555,0.4887306489211889
0.8395115474671283,0.15212587178745451
1.4988289811446753,0.47138080860235554
0.28835663844440446,0.03313347136138862
-0.5029350109723469,-0.3686654262572918
1.4792106688843745,0.7404457237454994
-0.4443824292899965,-0.16501936382926874
-0.5334642282766763,-0.06022219108193748
-0.6162294222666116,-0.2117839215105133
0.0746598965247852,-0.06143210770485435
-0.11363701103744236,0.07328776784924641
-0.020071729783539105,0.060974458601556945
0.18958296683876208,0.19976936885949087
0.9384661030448095,0.5417311316419882
-0.3666979887816997,-0.03649713755342033
-0.893528485543383,-0.37281401282437254
-1.3175957070283486,-0.35758107302395725
-0.38180278184117733,-0.20125246497579233
-0.513790215738037,-0.5059740860360368
0.9037379337854844,0.24951826048596157
-0.438875079467635,-0.15064812956892226
-0.5941709069988491,-0.09412241094079501
-0.3129439309072686,-0.0006573129658781017
-0.3002290892889903,-0.42205027177225224
-0.5992393195183092,-0.2603981800756424
0.22037003644161027,0.20391441508487246
0.5251648442025796,0.18702636260776917
0.5566250109115648,-0.1337450936293308
1.3542815721236108,0.5473974215855587
-0.6831598059463396,-0.3946931871585983
-0.12332510881871668,-0.10426653884571879
-0.313175458181276,-0.06983066191967202
-2.4883827672198118,-0.844571248983918
1.2499658587687152,0.585738114758799
0.17676443083245547,0.11073020815351371
-0.5078085381859656,-0.3157463002557443
1.8273850163544514,0.6202325024679164
0.810156740130441,0.17717119383530391
0.42849439812666407,0.12989240692362633
-0.0681124709347002,-0.18341414945223794
-0.23808571181520388,0.03202830530282251
-0.5765113112191823,-0.46111955845296854
0.786579790722323,0.24042095069142833
-0.3940283640280845,-0.11323899454137867
0.3002810160826966,0.04058301029479762
0.8343544924408379,0.36341169543257545
0.677025394441299,0.08621805880954383
0.8228587852634555,0.07952628678979308
1.1106022591442504,0.08384472094572175
-1.1105631980164716,-0.22185161445683957
-3.255811716715632e-05,0.25258628730620064
0.9013460042196363,0.466550032273377
0.6133252822589438,0.22942997155559533
1.0028032134638973,0.2676369700820755
-0.5003006472680256,-0.2651946778474655
-1.4683907966664256,-0.2625916324829058
-1.1655921035052261,-0.43565877729570385
1.232905714340008,0.3726449404855601
-0.08713741656430898,-0.07682053624162678
1.3261173087851847,0.42330557530293655
-0.655945864481047,-0.2364227993382929
-0.3988689687977965,-0.13229617799556867
0.14653723286275153,0.028871450029168452
0.834743613889784,0.2786312693918669
0.1264552008350165,0.0183894434782219
0.20170755625559836,0.06384350191987659
-0.2677757195594144,0.004920115066358239
0.2850850619265961,0.05361280309889841
-0.7534185494319212,-0.3375313194877926
0.5293134165710455,0.024596083148549396
0.36557771876219164,-0.05627178109385569
0.03628223499348943,-0.01715871426325706
0.19813999803683843,0.14880645440940787
-0.7566737721004273,-0.20586422817915984
0.11573326248000207,-0.02550109224389107
-0.18194453723128004,-0.08989333726537009
0.15719830623491635,0.12612466052713817
0.5834703702968259,0.23726671623004414
-0.9467702130017053,-0.2889270900141682
0.7131403526818803,0.177219645253482
1.3015718732510495,0.6245024454537768
-0.5490637095425713,-0.4446386052870516
-0.42972825662244346,-0.25094185555443876
-0.3867663078355488,-0.26615940072768385
-0.6576432577750558,-0.08450556251223318
1.2396141475684104,0.4906595122345453
0.31844659565746924,0.00412165000038993
-0.898118948974779,-0.21013758118618273
0.3368813191969155,0.18796100443519898
-0.41845291136338925,-0.18133671667206272
0.23660115895408176,0.33948703902729255
0.35131972759867486,0.0923282159145785
1.3532394247496584,0.4199700271922287
-0.8348195176596596,-0.2000442134064468
2.329386000927877,0.9163669216952257
0.04671744343525102,0.2491486988635611
0.45537323798218987,0.197896475733475
-0.7106849190944476,-0.01566073872219537
-0.05500951648619287,-0.2192433817766162
-0.45455451107980727,-0.26230796883093094
-0.10821361083201804,-0.04111319037942166
-0.8975524067664928,-0.24112928429837818
0.7348024907303494,0.08560776090645643
0.22526650079886087,-0.24953117491168642
-0.32743823994506444,0.1875610244254125
1.2042254843133238,0.28920812059234957
0.17033197408830542,-0.21321241421310294
1.0000793029093251,0.3606265072324854
-0.6715165633594682,-0.0720747636351032
1.1922161563991507,0.34937990369445265
0.2401742050919358,0.13245126629075182
0.5565923888898513,0.339247335512669
-1.2311390888032698,-0.30157224742195027
0.15265992580652274,-0.03222433313903002
1.3761179554699428,0.43890501948806576
-0.705213114119547,-0.18692398635416027
-1.2503541532412021,-0.6158527467053451
-1.8765101084780167,-0.587438868639975
-1.4710568984778531,-0.7576869379635249
-1.5034644679660039,-0.6391752913522825
-0.3743174720024021,-0.12447326608637561
0.8785248258922201,0.3865290473020056
0.043796065302285726,-0.13360093707284143
0.5459828435062852,0.09789868532533491
0.14156912654701717,-0.03319345268900965
0.0635824494337492,0.0026003742177041914
-0.0887046212662992,-0.05084308715641993
0.13241180824589355,-0.052589664467018696
0.42914314386081237,0.1394515202742384
-0.5624501660995315,-0.3554597997348278
0.5982938287413969,0.30535887865961914
-0.3526694737890958,0.2778729045016903
0.29270813774315824,-0.05885337230364588
1.492163246956624,0.24374680132654183
-0.32113688351077574,-0.21198878371371824
-0.42874183870380655,-0.16243147412180026
0.055660218158465974,0.047228046018850105
-0.1866690795903888,-0.05752362868355279
-0.31597688972246374,-0.13566613265746788
-0.9522872924809886,-0.2034312762147448
-0.6172168705296928,-0.16860795605042092
0.26072412351108243,0.28382726780703793
-1.2960687227320984,-0.5875015083327422
0.13020443641657642,0.10653659974038616
0.03620269153100732,-0.0714259522021235
0.4514068288039194,0.22002174620909223
0.7823436171641021,0.15583634261331558
-1.915279395290569,-0.8412263811413127
1.3347052662111705,0.1770082747074838
-0.5572415535553438,0.09782177032582899
-0.25403015749338176,0.006535856664278692
0.03834264146846239,-0.141592253201906
1.6690068187623457,0.25826388380318754
-0.4012013017360435,-0.12490065133457608
1.339802976104429,0.5546998947511015
0.31216077538561204,0.17060543600074968
2.0951768340115513,0.6464873446375877
-0.6856530796067453,-0.23752537496307333
-0.9812321593591005,-0.5307957332980906
0.3815161508468817,0.21606760064890315
0.08353861810384086,0.257654663999833
0.15622704295661638,0.010726371074333238
-0.898591325618534,-0.4651697690633494
0.21667377802103935,-0.13526539099053692
0.3990912763057593,0.1868245239609231
-0.390120283917219,-0.0557531130260491
0.1511367907084867,-0.035001733813416036
-0.025994727909912866,-0.053019427125884186
-0.5556780881206816,-0.2823207194372078
-0.4144372026664577,-0.1611577687020534
-0.5436960940610972,-0.4311614567758672
1.427591447499168,0.5345541401448728
0.45186186734666195,-0.14350285930798962
0.807369404052474,0.1445241918910205
-0.46125120826088006,-0.3088740278796747
-0.10635944761748291,0.24002562874429395
-1.0981033386108714,-0.5662486871643857
0.8384085456738412,0.2783800067455952
0.3127083097370869,-0.010600123267777883
1.9558758522219668,0.6382241135386741
-1.1083035213676484,-0.39505365784405105
0.39864720557220024,0.02318655255760607




Python files

pca01.py
#################### Principal Component Analysis in Python ####################

#Run this script on Terminal of MacOS as follows:
#
#python3 pca01.py X.csv 2 x1 x2
#python3 pca01.py (X file that includes columns x1 and x2) (number of PCA components) (column x1) (column x2)

#Reference
#
#In Depth: Principal Component Analysis
#https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html


########## import
import sys
#
#print(sys.argv[0])
#pca01.py
#
# Command-line arguments:
#   <X csv file> <number of PCA components> <x1 column name> <x2 column name>
# Exit with a readable message instead of a bare IndexError/ValueError
# traceback when arguments are missing or the component count is not an integer.
# NOTE: the plotting and output steps further down read X_pca[:, 0] and
# X_pca[:, 1], so they need at least 2 components.
if len(sys.argv) < 5:
    sys.exit('Usage: python3 ' + sys.argv[0] + ' X.csv 2 x1 x2')

Xfilename = sys.argv[1]
try:
    n = int(sys.argv[2])
except ValueError:
    sys.exit('The number of PCA components must be an integer, got: ' + sys.argv[2])
x1name = sys.argv[3]
x2name = sys.argv[4]

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.decomposition import PCA
##########


########## raw data plotting

#rng = np.random.RandomState(1)
#
#X = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
#
#
#print(type(X))
#<class 'numpy.ndarray'>
#print(X.shape)
#
#colnames=['x1', 'x2']
#pd.DataFrame(X).to_csv('X.csv', header=False, index=False)
#
#Then add the header row "x1,x2" as the first line of X.csv (it was written without a header).

# Load only the two requested columns, in the order given on the command line.
# Every later step (PCA fit/transform, arrows, labels, CSV output) works on X as
# a whole, so X must hold exactly (x1name, x2name); otherwise extra or reordered
# columns in the file would make the PCA axes disagree with the plotted axes.
# usecols keeps the file's column order, hence the explicit re-selection.
# read_csv is expected to raise ValueError if a requested column is missing.
X = pd.read_csv(Xfilename, header=0, usecols=[x1name, x2name])[[x1name, x2name]]

# Fig_1: scatter plot of the raw data with equal axis scaling, so that
# distances look the same in both directions.
plt.scatter(X[x1name], X[x2name])
plt.xlabel(x1name)
plt.ylabel(x2name)
plt.axis('equal')
plt.savefig('Fig_1.png')
plt.show()
plt.close()
##########


########## PCA fitting

# Fit a PCA model with n components to the data (fit() returns the model itself).
pca = PCA(n_components=n).fit(X)

# Principal axes: one row per component, one column per input feature.
print(pca.components_)
# Example output for X.csv:
#
# [[-0.94446029 -0.32862557]
#  [-0.32862557  0.94446029]]
#
#        x1           x2
# pc1    -0.94446029 -0.32862557
# pc2    -0.32862557  0.94446029
#
# Namely, with x1 and x2 measured relative to pca.mean_
# (PCA.transform centers the data before projecting):
# pc1 (calculated) = ((-0.94446029) * x1) + ((-0.32862557) * x2)
# pc2 (calculated) = ((-0.32862557) * x1) + (( 0.94446029) * x2)

# Variance captured by each component.
print(pca.explained_variance_)
#[0.7625315 0.0184779]

# The same, as a fraction of the total variance.
print(pca.explained_variance_ratio_)
#[0.97634101 0.02365899]

# Running total of the fractions above.
print(pca.explained_variance_ratio_.cumsum())
#[0.97634101 1.        ]


########## vector drawing function

def draw_vector(v0, v1, ax=None):
    """Draw a black arrow from point v0 to point v1.

    Parameters
    ----------
    v0 : start point (x, y) of the arrow (its tail).
    v1 : end point (x, y) of the arrow (its head).
    ax : axes object to draw on; when omitted (or falsy) the current
         axes returned by plt.gca() are used.
    """
    if not ax:
        ax = plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'color': 'k',
        'shrinkA': 0,
        'shrinkB': 0,
    }
    # An annotation with empty text shows only its arrow, which runs
    # from xytext (v0) to xy (v1).
    ax.annotate('', xy=v1, xytext=v0, arrowprops=arrow_style)
##########


########## plot data x1 & x2 and principal components PC1 & PC2

# One row of two panels: ax[0] shows the raw data, ax[1] the principal components.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

##### PCA
# Express every row of X in principal-component coordinates.
X_pca = pca.transform(X)

#print(type(X_pca[:, 0]))
#<class 'numpy.ndarray'>
#
#print(max(X_pca[:, 0]))
#2.6580358349697173
#
#print(np.argmax(X_pca[:, 0]))
#50
#
#print(X_pca[:, 0][np.argmax(X_pca[:, 0])])
#2.6580358349697173
#
#print(max(X_pca[:, 1]))
#0.393203001154575
#
#print(X_pca[:, 1][np.argmax(X_pca[:, 1])])
#0.393203001154575


##### raw data

#X[:, 0]
#print(X[x1name][np.argmax(X_pca[:, 0])])
#-2.488382767219812
#
#print(X[x2name][np.argmax(X_pca[:, 0])])
#-0.844571248983918

#X[:, 1]
#print(X[x1name][np.argmax(X_pca[:, 1])])
#-0.3526694737890958
#
#print(X[x2name][np.argmax(X_pca[:, 1])])
#0.2778729045016903


##### validation: PCA data and raw data

###pc1
#
#print(((X[x1name][np.argmax(X_pca[:, 0])]) ** 2) + ((X[x2name][np.argmax(X_pca[:, 0])]) ** 2))
#6.905349390806785
#(-2.488382767219812) ** 2 + (-0.844571248983918) ** 2
#
#print(max(X_pca[:, 0]) ** 2)
#7.065154499983162
#(2.6580358349697173) ** 2

###pc2
#
#print(((X[x1name][np.argmax(X_pca[:, 1])]) ** 2) + ((X[x2name][np.argmax(X_pca[:, 1])]) ** 2))
#0.2015891087988832
#(-0.3526694737890958) ** 2 + (0.2778729045016903) ** 2
#
#print((max(X_pca[:, 1])) ** 2)
#0.15460860011696473
#(0.393203001154575) ** 2


##### plot raw data

#ax[0].scatter(X[:, 0], X[:, 1], alpha=0.2)
# Left panel: raw data, with each principal axis drawn as an arrow starting
# at the data mean.
ax[0].scatter(X[x1name], X[x2name], alpha=0.2)
for variance, direction in zip(pca.explained_variance_, pca.components_):
    # Arrow length: 3 standard deviations (sqrt of the explained variance)
    # along the component direction.
    arrow = direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, pca.mean_ + arrow, ax=ax[0])

ax[0].axis('equal')
ax[0].set(xlabel=x1name, ylabel=x2name, title='Raw Data')

# Label the raw points that have the largest PC1 score and the largest PC2
# score. The label is placed at the point's coordinates multiplied by the
# given factors so that it does not sit exactly on the marker.
for pc_col, x_factor, y_factor in ((0, 1.00, 1.15), (1, 1.10, 1.30)):
    # np.argmax returns a row *position*; compute it once and look the row up
    # positionally with .iloc rather than treating it as an index label.
    row = int(np.argmax(X_pca[:, pc_col]))
    px = X[x1name].iloc[row]
    py = X[x2name].iloc[row]
    ax[0].text(px * x_factor, py * y_factor, f'({px!s}, {py!s})')
#


##### plot principal components

# Right panel: the same data in principal-component coordinates.
pc1_scores = X_pca[:, 0]
pc2_scores = X_pca[:, 1]
pc1_min, pc1_max = min(pc1_scores), max(pc1_scores)
pc2_min, pc2_max = min(pc2_scores), max(pc2_scores)

ax[1].scatter(pc1_scores, pc2_scores, alpha=0.2)

# Arrows from the origin to the largest PC2 score (vertical) and the
# largest PC1 score (horizontal).
draw_vector([0, 0], [0, pc2_max], ax=ax[1])
draw_vector([0, 0], [pc1_max, 0], ax=ax[1])

ax[1].axis('equal')
# Axis limits follow the observed score range instead of fixed values.
ax[1].set(
    xlabel='PC1',
    ylabel='PC2',
    title='Principal Components',
    xlim=(pc1_min, pc1_max),
    ylim=(pc2_min, pc2_max),
)

# Annotate both arrows with the coordinates of their tips, placed halfway
# along each arrow.
ax[1].text(pc1_max / 2, 0.05, f'({pc1_max!s}, 0)')
ax[1].text(0.05, pc2_max / 2, f'(0, {pc2_max!s})')

plt.savefig('Fig_2.png')
plt.show()
plt.close()


########## output

##### output principal components

# Table of principal-component scores, one column per component.
Xpca = pd.DataFrame({'pc1': X_pca[:, 0], 'pc2': X_pca[:, 1]})
Xpca.to_csv('Xpca.csv', header=True, index=False)


##### output raw data and principal components

# Raw columns followed by the score columns, aligned row by row.
XXpca = pd.concat([X, Xpca], axis=1)
XXpca.to_csv('XXpca.csv', header=True, index=False)





Figures
Fig_1.png


Fig_2.png


No comments:

Post a Comment

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Implementation and Showing Biases and Weights

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Implementation and Showing Biases and Weights ...