# Install on Terminal of MacOS
pip3 install -U pandas
pip3 install -U numpy
pip3 install -U matplotlib
pip3 install -U scikit-learn
pip3 install -U seaborn
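To confirm the installs succeeded, a quick check like the one below can be run. This is just a convenience sketch; the file name check_versions.py is illustrative, not part of this post.

check_versions.py

# Print the installed version of each package used in this post
import pandas
import numpy
import matplotlib
import sklearn
import seaborn

for m in (pandas, numpy, matplotlib, sklearn, seaborn):
    print(m.__name__, m.__version__)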
1_MacOS_Terminal.txt
########## Run Terminal on MacOS and execute
cd "YOUR_WORKING_DIRECTORY"    # update this to your own working directory
python3 mlrprob.py Xy.csv Species XTEST.csv sepal_length petal_length
Data files
Xy.csv
sepal_length,sepal_width,petal_length,petal_width,Species
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
5.0,3.4,1.5,0.2,0
4.4,2.9,1.4,0.2,0
4.9,3.1,1.5,0.1,0
5.4,3.7,1.5,0.2,0
4.8,3.4,1.6,0.2,0
4.8,3.0,1.4,0.1,0
4.3,3.0,1.1,0.1,0
5.8,4.0,1.2,0.2,0
5.7,4.4,1.5,0.4,0
5.4,3.9,1.3,0.4,0
5.1,3.5,1.4,0.3,0
5.7,3.8,1.7,0.3,0
5.1,3.8,1.5,0.3,0
5.4,3.4,1.7,0.2,0
5.1,3.7,1.5,0.4,0
4.6,3.6,1.0,0.2,0
5.1,3.3,1.7,0.5,0
4.8,3.4,1.9,0.2,0
5.0,3.0,1.6,0.2,0
5.0,3.4,1.6,0.4,0
5.2,3.5,1.5,0.2,0
5.2,3.4,1.4,0.2,0
4.7,3.2,1.6,0.2,0
4.8,3.1,1.6,0.2,0
5.4,3.4,1.5,0.4,0
5.2,4.1,1.5,0.1,0
5.5,4.2,1.4,0.2,0
4.9,3.1,1.5,0.2,0
5.0,3.2,1.2,0.2,0
5.5,3.5,1.3,0.2,0
4.9,3.6,1.4,0.1,0
4.4,3.0,1.3,0.2,0
5.1,3.4,1.5,0.2,0
5.0,3.5,1.3,0.3,0
4.5,2.3,1.3,0.3,0
4.4,3.2,1.3,0.2,0
5.0,3.5,1.6,0.6,0
5.1,3.8,1.9,0.4,0
4.8,3.0,1.4,0.3,0
5.1,3.8,1.6,0.2,0
4.6,3.2,1.4,0.2,0
5.3,3.7,1.5,0.2,0
5.0,3.3,1.4,0.2,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
6.9,3.1,4.9,1.5,1
5.5,2.3,4.0,1.3,1
6.5,2.8,4.6,1.5,1
5.7,2.8,4.5,1.3,1
6.3,3.3,4.7,1.6,1
4.9,2.4,3.3,1.0,1
6.6,2.9,4.6,1.3,1
5.2,2.7,3.9,1.4,1
5.0,2.0,3.5,1.0,1
5.9,3.0,4.2,1.5,1
6.0,2.2,4.0,1.0,1
6.1,2.9,4.7,1.4,1
5.6,2.9,3.6,1.3,1
6.7,3.1,4.4,1.4,1
5.6,3.0,4.5,1.5,1
5.8,2.7,4.1,1.0,1
6.2,2.2,4.5,1.5,1
5.6,2.5,3.9,1.1,1
5.9,3.2,4.8,1.8,1
6.1,2.8,4.0,1.3,1
6.3,2.5,4.9,1.5,1
6.1,2.8,4.7,1.2,1
6.4,2.9,4.3,1.3,1
6.6,3.0,4.4,1.4,1
6.8,2.8,4.8,1.4,1
6.7,3.0,5.0,1.7,1
6.0,2.9,4.5,1.5,1
5.7,2.6,3.5,1.0,1
5.5,2.4,3.8,1.1,1
5.5,2.4,3.7,1.0,1
5.8,2.7,3.9,1.2,1
6.0,2.7,5.1,1.6,1
5.4,3.0,4.5,1.5,1
6.0,3.4,4.5,1.6,1
6.7,3.1,4.7,1.5,1
6.3,2.3,4.4,1.3,1
5.6,3.0,4.1,1.3,1
5.5,2.5,4.0,1.3,1
5.5,2.6,4.4,1.2,1
6.1,3.0,4.6,1.4,1
5.8,2.6,4.0,1.2,1
5.0,2.3,3.3,1.0,1
5.6,2.7,4.2,1.3,1
5.7,3.0,4.2,1.2,1
5.7,2.9,4.2,1.3,1
6.2,2.9,4.3,1.3,1
5.1,2.5,3.0,1.1,1
5.7,2.8,4.1,1.3,1
6.3,3.3,6.0,2.5,2
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
6.3,2.9,5.6,1.8,2
6.5,3.0,5.8,2.2,2
7.6,3.0,6.6,2.1,2
4.9,2.5,4.5,1.7,2
7.3,2.9,6.3,1.8,2
6.7,2.5,5.8,1.8,2
7.2,3.6,6.1,2.5,2
6.5,3.2,5.1,2.0,2
6.4,2.7,5.3,1.9,2
6.8,3.0,5.5,2.1,2
5.7,2.5,5.0,2.0,2
5.8,2.8,5.1,2.4,2
6.4,3.2,5.3,2.3,2
6.5,3.0,5.5,1.8,2
7.7,3.8,6.7,2.2,2
7.7,2.6,6.9,2.3,2
6.0,2.2,5.0,1.5,2
6.9,3.2,5.7,2.3,2
5.6,2.8,4.9,2.0,2
7.7,2.8,6.7,2.0,2
6.3,2.7,4.9,1.8,2
6.7,3.3,5.7,2.1,2
7.2,3.2,6.0,1.8,2
6.2,2.8,4.8,1.8,2
6.1,3.0,4.9,1.8,2
6.4,2.8,5.6,2.1,2
7.2,3.0,5.8,1.6,2
7.4,2.8,6.1,1.9,2
7.9,3.8,6.4,2.0,2
6.4,2.8,5.6,2.2,2
6.3,2.8,5.1,1.5,2
6.1,2.6,5.6,1.4,2
7.7,3.0,6.1,2.3,2
6.3,3.4,5.6,2.4,2
6.4,3.1,5.5,1.8,2
6.0,3.0,4.8,1.8,2
6.9,3.1,5.4,2.1,2
6.7,3.1,5.6,2.4,2
6.9,3.1,5.1,2.3,2
5.8,2.7,5.1,1.9,2
6.8,3.2,5.9,2.3,2
6.7,3.3,5.7,2.5,2
6.7,3.0,5.2,2.3,2
6.3,2.5,5.0,1.9,2
6.5,3.0,5.2,2.0,2
6.2,3.4,5.4,2.3,2
5.9,3.0,5.1,1.8,2
XTEST.csv
sepal_length,sepal_width,petal_length,petal_width
5.0,2.5,3.0,1.5
7.0,3.0,4.0,1.0
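For reference, Xy.csv above is scikit-learn's bundled iris dataset with a numeric Species column (0=setosa, 1=versicolor, 2=virginica), matching the commented-out "Generate Training Data" block in mlrprob.py below. A minimal sketch to regenerate it follows; the file name make_Xy.py is illustrative, not part of this post.

make_Xy.py

# Rebuild Xy.csv from scikit-learn's bundled iris data
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
Xy = pd.DataFrame(iris.data,
                  columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
Xy['Species'] = iris.target    # 0(=setosa), 1(=versicolor), 2(=virginica)
Xy.to_csv('Xy.csv', index=False)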
Python files
mlrprob.py
#################### Multinomial Logistic Regression in Python (Probabilities in Each Class for New Observations) ####################
#
#Run this script on Terminal of MacOS as follows:
#python3 mlrprob.py Xy.csv Species XTEST.csv sepal_length petal_length
#python3 mlrprob.py (Training Data File that has X and y) (y column name of Training Data File for labelling) (Test Data X without y) (x1 name) (x2 name)
#
#Reference
#https://chrisalbon.com/machine_learning/naive_bayes/multinomial_logistic_regression/
#https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html


########## import
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys


########## arguments
TrainFname = str(sys.argv[1])
yname = str(sys.argv[2])
TestFname = str(sys.argv[3])
x1name = str(sys.argv[4])
x2name = str(sys.argv[5])


#################### Multinomial Logistic Regression (Probabilities in Each Class for New Observations) ####################

########## Generate Training Data (without a header)
'''
iris = datasets.load_iris()

with open('iris.DESCR.txt', 'w') as f:
    print(iris.DESCR, file=f)
'''
#print(type(iris))
#<class 'sklearn.utils.Bunch'>

#X = iris.data
#y = iris.target
#
#print(type(X))
#<class 'numpy.ndarray'>
#
#print(type(y))
#<class 'numpy.ndarray'>
#
#pd.DataFrame(data=X).to_csv("X.csv", header=False, index=False)
#pd.DataFrame(data=y).to_csv("y.csv", header=False, index=False)
#
#X = pd.read_csv('X.csv', header=None).values
#y = pd.read_csv('y.csv', header=None).values.ravel()
#
#print(type(X))
#<class 'numpy.ndarray'>
#
#print(type(y))
#<class 'numpy.ndarray'>

#print(pd.DataFrame(X).head())    # a header is added with values 0, 1, 2, and 3
'''
     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2
'''
#print(pd.DataFrame(X).tail())    # a header is added with values 0, 1, 2, and 3
'''
       0    1    2    3
145  6.7  3.0  5.2  2.3
146  6.3  2.5  5.0  1.9
147  6.5  3.0  5.2  2.0
148  6.2  3.4  5.4  2.3
149  5.9  3.0  5.1  1.8
'''
#
#print(pd.DataFrame(y).head())    # a header is added with value 0
'''
   0
0  0
1  0
2  0
3  0
4  0
'''
#print(pd.DataFrame(y).tail())    # a header is added with value 0
'''
     0
145  2
146  2
147  2
148  2
149  2
'''
#
#print(iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
#print(iris.target_names)
#['setosa' 'versicolor' 'virginica']
#0(=setosa), 1(=versicolor), 2(=virginica)
#print(iris.data.shape)
#(150, 4)


########## Load Training Data (with a header)
Xy = pd.read_csv(TrainFname, header=0)

y = Xy[yname]
#X = Xy[Xy.columns[Xy.columns != yname]]
X = Xy.drop(yname, axis=1)

#print(X.describe())
'''
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
'''

#print(type(X))
#<class 'pandas.core.frame.DataFrame'>
#print(type(y))
#<class 'pandas.core.series.Series'>

#print(X.head())
'''
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
'''
#print(y.head())
'''
0    0
1    0
2    0
3    0
4    0
'''


###### Two Specified X columns
#If you comment this out, then all Xs will be considered.
X = Xy.loc[:, [x1name, x2name]]


###### Data Conversion From pd.DataFrame to np.ndarray
Xnp = X.values
ynp = y.values.ravel()

#print(type(Xnp))
#<class 'numpy.ndarray'>
#
#print(type(ynp))
#<class 'numpy.ndarray'>

#print(pd.DataFrame(Xnp).head())
'''
     0    1
0  5.1  1.4
1  4.9  1.4
2  4.7  1.3
3  4.6  1.5
4  5.0  1.4
'''
#
#print(pd.DataFrame(ynp).head())
'''
   0
0  0
1  0
2  0
3  0
4  0
'''


########## Standardize Features X
scaler = StandardScaler()    # Standardize Features X
#scaler = MinMaxScaler()    # Normalize Features X (requires: from sklearn.preprocessing import MinMaxScaler)
#
#print(type(scaler))
#<class 'sklearn.preprocessing._data.StandardScaler'>

X_std = scaler.fit_transform(Xnp)


########## Create Multinomial Logistic Regression
#Note: recent scikit-learn versions use the multinomial scheme by default for
#multiclass problems and deprecate the multi_class parameter; drop the
#argument if your version rejects it.
clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')


########## Train Multinomial Logistic Regression (by using all features X)
# Train model
model = clf.fit(X_std, ynp)


########## Test Data without Answers: Previously Unseen Observations
#XTEST = pd.read_csv('XTEST.csv', header=0)
XTEST = pd.read_csv(TestFname, header=0)

#If you comment this out, then all Xs will be considered.
XTEST = XTEST.loc[:, [x1name, x2name]]

XTESTnp = XTEST.values
#print(type(XTESTnp))
#<class 'numpy.ndarray'>

#print(XTESTnp)
'''
[[5. 3.]
 [7. 4.]]
'''

print(pd.DataFrame(XTESTnp).head())
'''
     0    1
0  5.0  3.0
1  7.0  4.0
'''

# The model was trained on standardized features, so the test data must be
# transformed with the scaler that was fitted on the training data.
XTEST_std = scaler.transform(XTESTnp)


########## Predict Observations' Class
print(model.predict(XTEST_std))


########## Probabilities in Each Class for Test Data
print(model.predict_proba(XTEST_std))
'''
[[5.93479270e-03 6.37847758e-01 3.56217450e-01]
 [7.50267624e-17 7.48974219e-07 9.99999251e-01]]
'''
#i.e., roughly 0.6% / 63.8% / 35.6% for the first observation,
#and ~0% / ~0% / ~100% for the second.
#0(=setosa), 1(=versicolor), 2(=virginica)

with open('model.predict_proba.txt', 'w') as f:
    print(model.predict_proba(XTEST_std), file=f)

#print(iris.target_names)
#['setosa' 'versicolor' 'virginica']
#0(=setosa), 1(=versicolor), 2(=virginica)

yclass = pd.DataFrame(ynp).drop_duplicates()
yclass = yclass.reset_index(drop=True)
#
#print(type(yclass))
#<class 'pandas.core.frame.DataFrame'>
#
#print(yclass)
'''
   0
0  0
1  1
2  2
'''


########## Pair Plot
sns.pairplot(Xy, hue=yname, height=2)
plt.savefig('Figure_1_Pair_Plot.png')
plt.show()


#################### Multinomial Logistic Regression (Two Independent X) ####################

########## Data For Logistic Regression
#print(Xy)
#print(Xy.columns)
'''
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'Species'],
      dtype='object')
'''
#print(Xy[x1name])
#print(Xy[x2name])
#print(Xy.loc[:, [x1name, x2name]])

X = Xy.loc[:, [x1name, x2name]]
#print(type(X))
#<class 'pandas.core.frame.DataFrame'>

X = X.values
#print(type(X))
#<class 'numpy.ndarray'>

y = y.values.ravel()
#print(type(y))
#<class 'numpy.ndarray'>


########## A Function for Decision Boundary Drawing
# graph common settings
h = .02    # step size in the mesh
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def decision_boundary(clf, X, y, ax, title, x1name, x2name):
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot also the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # labels
    ax.set_title(title)
    ax.set_xlabel(x1name)
    ax.set_ylabel(x2name)

    # Test Data
    ax.scatter(XTESTnp[:, 0], XTESTnp[:, 1])


########## Logistic Regression
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

#C is a parameter for regularization. A larger C means weaker regularization,
#which could cause over-fitting.
for ax, C in zip(axes, [0.01, 1, 100]):
    title = "C=%s" % C
    clf = LogisticRegression(C=C)
    decision_boundary(clf, X, y, ax, title, x1name, x2name)

plt.savefig('Figure_2_Logistic_Regression.png')
plt.show()
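Since the model above is trained on standardized features, every new observation has to pass through the same fitted StandardScaler before predict_proba is called. One way to make that hard to get wrong is scikit-learn's Pipeline, which chains the scaler and the classifier so that fit and predict always apply the same transformation. Below is a minimal sketch under the same file layout as above; the file name, variable names, and class-name column labels are illustrative, not from mlrprob.py.

pipeline_sketch.py

# Chain StandardScaler and LogisticRegression so the test data is
# automatically transformed with the training-set statistics.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd

Xy = pd.read_csv('Xy.csv', header=0)
X = Xy.loc[:, ['sepal_length', 'petal_length']].values
y = Xy['Species'].values

pipe = make_pipeline(StandardScaler(),
                     LogisticRegression(random_state=0, solver='newton-cg'))
pipe.fit(X, y)

XTEST = pd.read_csv('XTEST.csv', header=0).loc[:, ['sepal_length', 'petal_length']].values

# Probabilities as a labeled table; column order follows pipe.classes_ ([0, 1, 2])
proba = pd.DataFrame(pipe.predict_proba(XTEST),
                     columns=['setosa(0)', 'versicolor(1)', 'virginica(2)'])
print(proba)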
Figures
Figure_1_Pair_Plot.png (pair plot of the four features in Xy.csv, colored by Species)
Figure_2_Logistic_Regression.png (decision boundaries of the two-feature logistic regression for C=0.01, 1, and 100, with the two test points overlaid)