# Install the required packages from the macOS Terminal
# 1. NumPy
#pip3 install -U numpy
# 2. pandas
#pip3 install -U pandas
# 3. matplotlib
#pip3 install -U matplotlib
# 4. scikit-learn (sklearn)
#pip3 install -U scikit-learn
# 5. seaborn
#pip3 install -U seaborn
1_MacOS_Terminal.txt
########## Open Terminal on macOS and run the following commands
### TO UPDATE: set your own working directory
cd "YOUR_WORKING_DIRECTORY"
python3 pca01.py X.csv 2 x1 x2
Input data files
X.csv
x1,x2 -0.6253016177147778,-0.1700636571125488 0.9606950333275389,0.5909005970088841 -0.5985433850075956,-0.40259339311989306 -2.2280593766522214,-0.5325767401368825 -0.4614300598767516,-0.4988672444678538 -0.9589290278490424,-0.2693310237319169 -0.6730799091546266,-0.33830854748013184 1.3050186115481948,0.5913578455364443 0.37454559743575305,-0.09854420488874384 -1.8262862664844461,-0.40617025383032 0.6682622844811562,0.33687739585814436 -0.582646676021939,-0.17736921748075757 -0.4181289762061614,-0.37381138862526 0.17220937064575279,0.2646688363445262 0.3771166872946183,0.18844296908147612 -0.6793962296056062,-0.1316019778374699 1.0314895989546813,0.42555001754472105 0.336041798802642,0.03909827210839272 0.7057459850229555,0.4887306489211889 0.8395115474671283,0.15212587178745451 1.4988289811446753,0.47138080860235554 0.28835663844440446,0.03313347136138862 -0.5029350109723469,-0.3686654262572918 1.4792106688843745,0.7404457237454994 -0.4443824292899965,-0.16501936382926874 -0.5334642282766763,-0.06022219108193748 -0.6162294222666116,-0.2117839215105133 0.0746598965247852,-0.06143210770485435 -0.11363701103744236,0.07328776784924641 -0.020071729783539105,0.060974458601556945 0.18958296683876208,0.19976936885949087 0.9384661030448095,0.5417311316419882 -0.3666979887816997,-0.03649713755342033 -0.893528485543383,-0.37281401282437254 -1.3175957070283486,-0.35758107302395725 -0.38180278184117733,-0.20125246497579233 -0.513790215738037,-0.5059740860360368 0.9037379337854844,0.24951826048596157 -0.438875079467635,-0.15064812956892226 -0.5941709069988491,-0.09412241094079501 -0.3129439309072686,-0.0006573129658781017 -0.3002290892889903,-0.42205027177225224 -0.5992393195183092,-0.2603981800756424 0.22037003644161027,0.20391441508487246 0.5251648442025796,0.18702636260776917 0.5566250109115648,-0.1337450936293308 1.3542815721236108,0.5473974215855587 -0.6831598059463396,-0.3946931871585983 -0.12332510881871668,-0.10426653884571879 -0.313175458181276,-0.06983066191967202 
-2.4883827672198118,-0.844571248983918 1.2499658587687152,0.585738114758799 0.17676443083245547,0.11073020815351371 -0.5078085381859656,-0.3157463002557443 1.8273850163544514,0.6202325024679164 0.810156740130441,0.17717119383530391 0.42849439812666407,0.12989240692362633 -0.0681124709347002,-0.18341414945223794 -0.23808571181520388,0.03202830530282251 -0.5765113112191823,-0.46111955845296854 0.786579790722323,0.24042095069142833 -0.3940283640280845,-0.11323899454137867 0.3002810160826966,0.04058301029479762 0.8343544924408379,0.36341169543257545 0.677025394441299,0.08621805880954383 0.8228587852634555,0.07952628678979308 1.1106022591442504,0.08384472094572175 -1.1105631980164716,-0.22185161445683957 -3.255811716715632e-05,0.25258628730620064 0.9013460042196363,0.466550032273377 0.6133252822589438,0.22942997155559533 1.0028032134638973,0.2676369700820755 -0.5003006472680256,-0.2651946778474655 -1.4683907966664256,-0.2625916324829058 -1.1655921035052261,-0.43565877729570385 1.232905714340008,0.3726449404855601 -0.08713741656430898,-0.07682053624162678 1.3261173087851847,0.42330557530293655 -0.655945864481047,-0.2364227993382929 -0.3988689687977965,-0.13229617799556867 0.14653723286275153,0.028871450029168452 0.834743613889784,0.2786312693918669 0.1264552008350165,0.0183894434782219 0.20170755625559836,0.06384350191987659 -0.2677757195594144,0.004920115066358239 0.2850850619265961,0.05361280309889841 -0.7534185494319212,-0.3375313194877926 0.5293134165710455,0.024596083148549396 0.36557771876219164,-0.05627178109385569 0.03628223499348943,-0.01715871426325706 0.19813999803683843,0.14880645440940787 -0.7566737721004273,-0.20586422817915984 0.11573326248000207,-0.02550109224389107 -0.18194453723128004,-0.08989333726537009 0.15719830623491635,0.12612466052713817 0.5834703702968259,0.23726671623004414 -0.9467702130017053,-0.2889270900141682 0.7131403526818803,0.177219645253482 1.3015718732510495,0.6245024454537768 -0.5490637095425713,-0.4446386052870516 
-0.42972825662244346,-0.25094185555443876 -0.3867663078355488,-0.26615940072768385 -0.6576432577750558,-0.08450556251223318 1.2396141475684104,0.4906595122345453 0.31844659565746924,0.00412165000038993 -0.898118948974779,-0.21013758118618273 0.3368813191969155,0.18796100443519898 -0.41845291136338925,-0.18133671667206272 0.23660115895408176,0.33948703902729255 0.35131972759867486,0.0923282159145785 1.3532394247496584,0.4199700271922287 -0.8348195176596596,-0.2000442134064468 2.329386000927877,0.9163669216952257 0.04671744343525102,0.2491486988635611 0.45537323798218987,0.197896475733475 -0.7106849190944476,-0.01566073872219537 -0.05500951648619287,-0.2192433817766162 -0.45455451107980727,-0.26230796883093094 -0.10821361083201804,-0.04111319037942166 -0.8975524067664928,-0.24112928429837818 0.7348024907303494,0.08560776090645643 0.22526650079886087,-0.24953117491168642 -0.32743823994506444,0.1875610244254125 1.2042254843133238,0.28920812059234957 0.17033197408830542,-0.21321241421310294 1.0000793029093251,0.3606265072324854 -0.6715165633594682,-0.0720747636351032 1.1922161563991507,0.34937990369445265 0.2401742050919358,0.13245126629075182 0.5565923888898513,0.339247335512669 -1.2311390888032698,-0.30157224742195027 0.15265992580652274,-0.03222433313903002 1.3761179554699428,0.43890501948806576 -0.705213114119547,-0.18692398635416027 -1.2503541532412021,-0.6158527467053451 -1.8765101084780167,-0.587438868639975 -1.4710568984778531,-0.7576869379635249 -1.5034644679660039,-0.6391752913522825 -0.3743174720024021,-0.12447326608637561 0.8785248258922201,0.3865290473020056 0.043796065302285726,-0.13360093707284143 0.5459828435062852,0.09789868532533491 0.14156912654701717,-0.03319345268900965 0.0635824494337492,0.0026003742177041914 -0.0887046212662992,-0.05084308715641993 0.13241180824589355,-0.052589664467018696 0.42914314386081237,0.1394515202742384 -0.5624501660995315,-0.3554597997348278 0.5982938287413969,0.30535887865961914 -0.3526694737890958,0.2778729045016903 
0.29270813774315824,-0.05885337230364588 1.492163246956624,0.24374680132654183 -0.32113688351077574,-0.21198878371371824 -0.42874183870380655,-0.16243147412180026 0.055660218158465974,0.047228046018850105 -0.1866690795903888,-0.05752362868355279 -0.31597688972246374,-0.13566613265746788 -0.9522872924809886,-0.2034312762147448 -0.6172168705296928,-0.16860795605042092 0.26072412351108243,0.28382726780703793 -1.2960687227320984,-0.5875015083327422 0.13020443641657642,0.10653659974038616 0.03620269153100732,-0.0714259522021235 0.4514068288039194,0.22002174620909223 0.7823436171641021,0.15583634261331558 -1.915279395290569,-0.8412263811413127 1.3347052662111705,0.1770082747074838 -0.5572415535553438,0.09782177032582899 -0.25403015749338176,0.006535856664278692 0.03834264146846239,-0.141592253201906 1.6690068187623457,0.25826388380318754 -0.4012013017360435,-0.12490065133457608 1.339802976104429,0.5546998947511015 0.31216077538561204,0.17060543600074968 2.0951768340115513,0.6464873446375877 -0.6856530796067453,-0.23752537496307333 -0.9812321593591005,-0.5307957332980906 0.3815161508468817,0.21606760064890315 0.08353861810384086,0.257654663999833 0.15622704295661638,0.010726371074333238 -0.898591325618534,-0.4651697690633494 0.21667377802103935,-0.13526539099053692 0.3990912763057593,0.1868245239609231 -0.390120283917219,-0.0557531130260491 0.1511367907084867,-0.035001733813416036 -0.025994727909912866,-0.053019427125884186 -0.5556780881206816,-0.2823207194372078 -0.4144372026664577,-0.1611577687020534 -0.5436960940610972,-0.4311614567758672 1.427591447499168,0.5345541401448728 0.45186186734666195,-0.14350285930798962 0.807369404052474,0.1445241918910205 -0.46125120826088006,-0.3088740278796747 -0.10635944761748291,0.24002562874429395 -1.0981033386108714,-0.5662486871643857 0.8384085456738412,0.2783800067455952 0.3127083097370869,-0.010600123267777883 1.9558758522219668,0.6382241135386741 -1.1083035213676484,-0.39505365784405105 0.39864720557220024,0.02318655255760607 |
Python files
pca01.py
"""Principal Component Analysis (PCA) of two columns of a CSV file.

Run this script from a terminal as follows:

    python3 pca01.py X.csv 2 x1 x2
    python3 pca01.py <X file> <number of PCA components> <column x1> <column x2>

The script
  1. plots the two selected columns against each other (Fig_1.png),
  2. fits a PCA on those two columns and prints the components, the
     explained variance, the explained variance ratio and its cumulative sum,
  3. plots the raw data with the principal axes next to the data expressed
     in principal-component coordinates (Fig_2.png),
  4. writes the principal-component scores to Xpca.csv and the input table
     together with the scores to XXpca.csv.

For the sample X.csv shipped with this script the author recorded:

    components_                [[-0.94446029 -0.32862557]
                                [-0.32862557  0.94446029]]
    explained_variance_        [0.7625315 0.0184779]
    explained_variance_ratio_  [0.97634101 0.02365899]
    cumulative ratio           [0.97634101 1.        ]

i.e. pc1 = (-0.94446029 * x1) + (-0.32862557 * x2)
     pc2 = (-0.32862557 * x1) + ( 0.94446029 * x2)
(scikit-learn centres the data on pca.mean_ before projecting).

Reference:
    In Depth: Principal Component Analysis
    https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
"""

import argparse
import sys

import numpy as np
import pandas as pd

# NOTE: matplotlib, seaborn and scikit-learn are imported inside the
# functions that use them, so that argument parsing (usage errors, --help)
# neither pays for nor depends on loading the plotting / ML stack.

# Multiplicative (x, y) offsets used to place the coordinate label next to
# the raw-data point with the largest PC1 score and the one with the largest
# PC2 score, in that order.  The values are the ones the original script used.
LABEL_OFFSETS = ((1.00, 1.15), (1.10, 1.30))


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings without the program name; ``None``
            makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``xfile`` (str), ``n_components`` (int, 1 or
        2), ``x1name`` (str) and ``x2name`` (str).

    Exits with status 2 and a usage message when the arguments are missing
    or invalid (argparse behaviour).
    """
    parser = argparse.ArgumentParser(
        description='PCA of two columns of a CSV file.',
    )
    parser.add_argument('xfile', help='CSV file with a header row')
    # Only two columns are analysed, so at most two components exist.
    parser.add_argument('n_components', type=int, choices=(1, 2),
                        help='number of PCA components')
    parser.add_argument('x1name', help='name of the first column')
    parser.add_argument('x2name', help='name of the second column')
    return parser.parse_args(argv)


def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Args:
        v0: (x, y) start of the arrow.
        v1: (x, y) tip of the arrow.
        ax: matplotlib Axes to draw on; the current Axes when ``None``.
    """
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt.gca()
    arrowprops = dict(arrowstyle='->', linewidth=2, color='k',
                      shrinkA=0, shrinkB=0)
    # annotate() with an empty label draws only the arrow (text pos -> xy).
    ax.annotate('', v1, v0, arrowprops=arrowprops)


def scores_to_frame(scores):
    """Return the PCA scores as a DataFrame with columns pc1, pc2, ...

    Args:
        scores: 2-D array-like, one row per sample and one column per
            principal component.
    """
    scores = np.asarray(scores)
    names = ['pc{}'.format(i) for i in range(1, scores.shape[1] + 1)]
    return pd.DataFrame(scores, columns=names)


def save_raw_scatter(xy, x1name, x2name):
    """Scatter-plot the two raw columns, save Fig_1.png and show the plot."""
    import matplotlib.pyplot as plt

    plt.scatter(xy[:, 0], xy[:, 1])
    plt.xlabel(x1name)
    plt.ylabel(x2name)
    plt.axis('equal')
    plt.savefig('Fig_1.png')
    plt.show()
    plt.close()


def fit_pca(xy, n_components):
    """Fit a PCA on ``xy``, print its summary and return the fitted model."""
    from sklearn.decomposition import PCA

    pca = PCA(n_components=n_components)
    pca.fit(xy)
    print(pca.components_)
    print(pca.explained_variance_)
    print(pca.explained_variance_ratio_)
    print(np.cumsum(pca.explained_variance_ratio_))
    return pca


def plot_raw_panel(ax, xy, scores, pca, x1name, x2name):
    """Plot the raw data with the principal axes drawn on top of it."""
    ax.scatter(xy[:, 0], xy[:, 1], alpha=0.2)
    # One arrow per component, starting at the data mean and three standard
    # deviations long (sqrt of the explained variance) along the component.
    for variance, direction in zip(pca.explained_variance_, pca.components_):
        arrow = direction * 3 * np.sqrt(variance)
        draw_vector(pca.mean_, pca.mean_ + arrow, ax=ax)
    ax.axis('equal')
    ax.set(xlabel=x1name, ylabel=x2name, title='Raw Data')
    # Label the raw point that has the largest score on each component.
    # zip() stops at the number of fitted components, so n_components=1
    # labels only the PC1 extreme.
    for component_scores, (x_scale, y_scale) in zip(scores.T, LABEL_OFFSETS):
        row = np.argmax(component_scores)  # positional row index
        x, y = xy[row, 0], xy[row, 1]
        ax.text(x * x_scale, y * y_scale,
                '(' + str(x) + ', ' + str(y) + ')')


def plot_component_panel(ax, scores):
    """Plot the data in principal-component coordinates.

    With a single fitted component the points are placed on the PC1 axis
    (PC2 = 0) and no PC2 arrow, limit or label is drawn.
    """
    pc1 = scores[:, 0]
    has_pc2 = scores.shape[1] > 1
    pc2 = scores[:, 1] if has_pc2 else np.zeros_like(pc1)

    ax.scatter(pc1, pc2, alpha=0.2)
    if has_pc2:
        draw_vector([0, 0], [0, pc2.max()], ax=ax)
    draw_vector([0, 0], [pc1.max(), 0], ax=ax)
    ax.axis('equal')

    # All properties go through one set() call, as in the original script.
    props = dict(xlabel='PC1', ylabel='PC2', title='Principal Components',
                 xlim=(pc1.min(), pc1.max()))
    if has_pc2:
        props['ylim'] = (pc2.min(), pc2.max())
    ax.set(**props)

    ax.text(pc1.max() / 2, 0.05, '(' + str(pc1.max()) + ', 0)')
    if has_pc2:
        ax.text(0.05, pc2.max() / 2, '(0, ' + str(pc2.max()) + ')')


def save_pca_figure(xy, scores, pca, x1name, x2name):
    """Draw both panels side by side, save Fig_2.png and show the figure."""
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
    plot_raw_panel(ax[0], xy, scores, pca, x1name, x2name)
    plot_component_panel(ax[1], scores)
    plt.savefig('Fig_2.png')
    plt.show()
    plt.close()


def write_outputs(data, scores):
    """Write Xpca.csv (scores) and XXpca.csv (input table + scores)."""
    score_frame = scores_to_frame(scores)
    score_frame.to_csv('Xpca.csv', header=True, index=False)
    combined = pd.concat([data, score_frame], axis=1)
    combined.to_csv('XXpca.csv', header=True, index=False)


def main(argv=None):
    """Run the whole analysis; returns the process exit status (0)."""
    args = parse_args(argv)

    import seaborn as sns
    sns.set()

    data = pd.read_csv(args.xfile, header=0)
    missing = [name for name in (args.x1name, args.x2name)
               if name not in data.columns]
    if missing:
        raise SystemExit('column(s) not found in {}: {}'.format(
            args.xfile, ', '.join(missing)))

    # Analyse only the two requested columns; any other column of the file
    # is carried through to XXpca.csv untouched.
    xy = data[[args.x1name, args.x2name]].to_numpy(dtype=float)

    save_raw_scatter(xy, args.x1name, args.x2name)
    pca = fit_pca(xy, args.n_components)
    scores = pca.transform(xy)
    save_pca_figure(xy, scores, pca, args.x1name, args.x2name)
    write_outputs(data, scores)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Figures
Fig_1.png
Fig_2.png
References
In Depth: Principal Component Analysis
https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html
No comments:
Post a Comment