# Install on Terminal of MacOS
pip3 install -U pandas
pip3 install -U numpy
pip3 install -U matplotlib
pip3 install -U scikit-learn
pip3 install -U seaborn
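To confirm the installs succeeded, a quick check like the one below can be run. This is just a convenience sketch; the file name check_versions.py is illustrative, not part of this post.

check_versions.py

# Print the installed version of each package used in this post
import pandas
import numpy
import matplotlib
import sklearn
import seaborn

for m in (pandas, numpy, matplotlib, sklearn, seaborn):
    print(m.__name__, m.__version__)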
1_MacOS_Terminal.txt
########## Run Terminal on MacOS and execute
cd "YOUR_WORKING_DIRECTORY"    # update this to your own working directory
python3 mlrprob.py Xy.csv Species XTEST.csv sepal_length petal_length
Data files
Xy.csv
sepal_length,sepal_width,petal_length,petal_width,Species
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
4.7,3.2,1.3,0.2,0
4.6,3.1,1.5,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
5.0,3.4,1.5,0.2,0
4.4,2.9,1.4,0.2,0
4.9,3.1,1.5,0.1,0
5.4,3.7,1.5,0.2,0
4.8,3.4,1.6,0.2,0
4.8,3.0,1.4,0.1,0
4.3,3.0,1.1,0.1,0
5.8,4.0,1.2,0.2,0
5.7,4.4,1.5,0.4,0
5.4,3.9,1.3,0.4,0
5.1,3.5,1.4,0.3,0
5.7,3.8,1.7,0.3,0
5.1,3.8,1.5,0.3,0
5.4,3.4,1.7,0.2,0
5.1,3.7,1.5,0.4,0
4.6,3.6,1.0,0.2,0
5.1,3.3,1.7,0.5,0
4.8,3.4,1.9,0.2,0
5.0,3.0,1.6,0.2,0
5.0,3.4,1.6,0.4,0
5.2,3.5,1.5,0.2,0
5.2,3.4,1.4,0.2,0
4.7,3.2,1.6,0.2,0
4.8,3.1,1.6,0.2,0
5.4,3.4,1.5,0.4,0
5.2,4.1,1.5,0.1,0
5.5,4.2,1.4,0.2,0
4.9,3.1,1.5,0.2,0
5.0,3.2,1.2,0.2,0
5.5,3.5,1.3,0.2,0
4.9,3.6,1.4,0.1,0
4.4,3.0,1.3,0.2,0
5.1,3.4,1.5,0.2,0
5.0,3.5,1.3,0.3,0
4.5,2.3,1.3,0.3,0
4.4,3.2,1.3,0.2,0
5.0,3.5,1.6,0.6,0
5.1,3.8,1.9,0.4,0
4.8,3.0,1.4,0.3,0
5.1,3.8,1.6,0.2,0
4.6,3.2,1.4,0.2,0
5.3,3.7,1.5,0.2,0
5.0,3.3,1.4,0.2,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
6.9,3.1,4.9,1.5,1
5.5,2.3,4.0,1.3,1
6.5,2.8,4.6,1.5,1
5.7,2.8,4.5,1.3,1
6.3,3.3,4.7,1.6,1
4.9,2.4,3.3,1.0,1
6.6,2.9,4.6,1.3,1
5.2,2.7,3.9,1.4,1
5.0,2.0,3.5,1.0,1
5.9,3.0,4.2,1.5,1
6.0,2.2,4.0,1.0,1
6.1,2.9,4.7,1.4,1
5.6,2.9,3.6,1.3,1
6.7,3.1,4.4,1.4,1
5.6,3.0,4.5,1.5,1
5.8,2.7,4.1,1.0,1
6.2,2.2,4.5,1.5,1
5.6,2.5,3.9,1.1,1
5.9,3.2,4.8,1.8,1
6.1,2.8,4.0,1.3,1
6.3,2.5,4.9,1.5,1
6.1,2.8,4.7,1.2,1
6.4,2.9,4.3,1.3,1
6.6,3.0,4.4,1.4,1
6.8,2.8,4.8,1.4,1
6.7,3.0,5.0,1.7,1
6.0,2.9,4.5,1.5,1
5.7,2.6,3.5,1.0,1
5.5,2.4,3.8,1.1,1
5.5,2.4,3.7,1.0,1
5.8,2.7,3.9,1.2,1
6.0,2.7,5.1,1.6,1
5.4,3.0,4.5,1.5,1
6.0,3.4,4.5,1.6,1
6.7,3.1,4.7,1.5,1
6.3,2.3,4.4,1.3,1
5.6,3.0,4.1,1.3,1
5.5,2.5,4.0,1.3,1
5.5,2.6,4.4,1.2,1
6.1,3.0,4.6,1.4,1
5.8,2.6,4.0,1.2,1
5.0,2.3,3.3,1.0,1
5.6,2.7,4.2,1.3,1
5.7,3.0,4.2,1.2,1
5.7,2.9,4.2,1.3,1
6.2,2.9,4.3,1.3,1
5.1,2.5,3.0,1.1,1
5.7,2.8,4.1,1.3,1
6.3,3.3,6.0,2.5,2
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
6.3,2.9,5.6,1.8,2
6.5,3.0,5.8,2.2,2
7.6,3.0,6.6,2.1,2
4.9,2.5,4.5,1.7,2
7.3,2.9,6.3,1.8,2
6.7,2.5,5.8,1.8,2
7.2,3.6,6.1,2.5,2
6.5,3.2,5.1,2.0,2
6.4,2.7,5.3,1.9,2
6.8,3.0,5.5,2.1,2
5.7,2.5,5.0,2.0,2
5.8,2.8,5.1,2.4,2
6.4,3.2,5.3,2.3,2
6.5,3.0,5.5,1.8,2
7.7,3.8,6.7,2.2,2
7.7,2.6,6.9,2.3,2
6.0,2.2,5.0,1.5,2
6.9,3.2,5.7,2.3,2
5.6,2.8,4.9,2.0,2
7.7,2.8,6.7,2.0,2
6.3,2.7,4.9,1.8,2
6.7,3.3,5.7,2.1,2
7.2,3.2,6.0,1.8,2
6.2,2.8,4.8,1.8,2
6.1,3.0,4.9,1.8,2
6.4,2.8,5.6,2.1,2
7.2,3.0,5.8,1.6,2
7.4,2.8,6.1,1.9,2
7.9,3.8,6.4,2.0,2
6.4,2.8,5.6,2.2,2
6.3,2.8,5.1,1.5,2
6.1,2.6,5.6,1.4,2
7.7,3.0,6.1,2.3,2
6.3,3.4,5.6,2.4,2
6.4,3.1,5.5,1.8,2
6.0,3.0,4.8,1.8,2
6.9,3.1,5.4,2.1,2
6.7,3.1,5.6,2.4,2
6.9,3.1,5.1,2.3,2
5.8,2.7,5.1,1.9,2
6.8,3.2,5.9,2.3,2
6.7,3.3,5.7,2.5,2
6.7,3.0,5.2,2.3,2
6.3,2.5,5.0,1.9,2
6.5,3.0,5.2,2.0,2
6.2,3.4,5.4,2.3,2
5.9,3.0,5.1,1.8,2
XTEST.csv
sepal_length,sepal_width,petal_length,petal_width
5.0,2.5,3.0,1.5
7.0,3.0,4.0,1.0
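For reference, Xy.csv above is scikit-learn's bundled iris dataset with a numeric Species column (0=setosa, 1=versicolor, 2=virginica), matching the commented-out "Generate Training Data" block in mlrprob.py below. A minimal sketch to regenerate it follows; the file name make_Xy.py is illustrative, not part of this post.

make_Xy.py

# Rebuild Xy.csv from scikit-learn's bundled iris data
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
Xy = pd.DataFrame(iris.data,
                  columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
Xy['Species'] = iris.target    # 0(=setosa), 1(=versicolor), 2(=virginica)
Xy.to_csv('Xy.csv', index=False)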
Python files
mlrprob.py
#################### Multinomial Logistic Regression in Python (Probabilities in Each Class for New Observations) ####################
#
#Run this script on Terminal of MacOS as follows:
#python3 mlrprob.py Xy.csv Species XTEST.csv sepal_length petal_length
#python3 mlrprob.py (Training Data File that has X and y) (y column name of Training Data File for labelling) (Test Data X without y) (x1 name) (x2 name)
#
#Reference
#https://chrisalbon.com/machine_learning/naive_bayes/multinomial_logistic_regression/
#https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html


########## import
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys


########## arguments
TrainFname = str(sys.argv[1])
yname = str(sys.argv[2])
TestFname = str(sys.argv[3])
x1name = str(sys.argv[4])
x2name = str(sys.argv[5])


#################### Multinomial Logistic Regression (Probabilities in Each Class for New Observations) ####################

########## Generate Training Data (without a header)
'''
iris = datasets.load_iris()

with open('iris.DESCR.txt', 'w') as f:
    print(iris.DESCR, file=f)
'''
#print(type(iris))
#<class 'sklearn.utils.Bunch'>

#X = iris.data
#y = iris.target
#
#print(type(X))
#<class 'numpy.ndarray'>
#
#print(type(y))
#<class 'numpy.ndarray'>
#
#pd.DataFrame(data=X).to_csv("X.csv", header=False, index=False)
#pd.DataFrame(data=y).to_csv("y.csv", header=False, index=False)
#
#X = pd.read_csv('X.csv', header=None).values
#y = pd.read_csv('y.csv', header=None).values.ravel()
#
#print(type(X))
#<class 'numpy.ndarray'>
#
#print(type(y))
#<class 'numpy.ndarray'>

#print(pd.DataFrame(X).head())    # a header is added with values 0, 1, 2, and 3
'''
     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2
'''
#print(pd.DataFrame(X).tail())    # a header is added with values 0, 1, 2, and 3
'''
       0    1    2    3
145  6.7  3.0  5.2  2.3
146  6.3  2.5  5.0  1.9
147  6.5  3.0  5.2  2.0
148  6.2  3.4  5.4  2.3
149  5.9  3.0  5.1  1.8
'''
#
#print(pd.DataFrame(y).head())    # a header is added with value 0
'''
   0
0  0
1  0
2  0
3  0
4  0
'''
#print(pd.DataFrame(y).tail())    # a header is added with value 0
'''
     0
145  2
146  2
147  2
148  2
149  2
'''
#
#print(iris.feature_names)
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
#print(iris.target_names)
#['setosa' 'versicolor' 'virginica']
#0(=setosa), 1(=versicolor), 2(=virginica)
#print(iris.data.shape)
#(150, 4)


########## Load Training Data (with a header)
Xy = pd.read_csv(TrainFname, header=0)

y = Xy[yname]
#X = Xy[Xy.columns[Xy.columns != yname]]
X = Xy.drop(yname, axis=1)

#print(X.describe())
'''
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
'''

#print(type(X))
#<class 'pandas.core.frame.DataFrame'>
#print(type(y))
#<class 'pandas.core.series.Series'>

#print(X.head())
'''
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
'''
#print(y.head())
'''
0    0
1    0
2    0
3    0
4    0
'''


###### Two Specified X columns
#If you comment this out, then all Xs will be considered.
X = Xy.loc[:, [x1name, x2name]]


###### Data Conversion From pd.DataFrame to np.ndarray
Xnp = X.values
ynp = y.values.ravel()

#print(type(Xnp))
#<class 'numpy.ndarray'>
#
#print(type(ynp))
#<class 'numpy.ndarray'>

#print(pd.DataFrame(Xnp).head())
'''
     0    1
0  5.1  1.4
1  4.9  1.4
2  4.7  1.3
3  4.6  1.5
4  5.0  1.4
'''
#
#print(pd.DataFrame(ynp).head())
'''
   0
0  0
1  0
2  0
3  0
4  0
'''


########## Standardize Features X
scaler = StandardScaler()    # Standardize Features X
#scaler = MinMaxScaler()    # Normalize Features X (requires: from sklearn.preprocessing import MinMaxScaler)
#
#print(type(scaler))
#<class 'sklearn.preprocessing._data.StandardScaler'>

X_std = scaler.fit_transform(Xnp)


########## Create Multinomial Logistic Regression
#Note: recent scikit-learn versions use the multinomial scheme by default for
#multiclass problems and deprecate the multi_class parameter; drop the
#argument if your version rejects it.
clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')


########## Train Multinomial Logistic Regression (by using all features X)
# Train model
model = clf.fit(X_std, ynp)


########## Test Data without Answers: Previously Unseen Observations
#XTEST = pd.read_csv('XTEST.csv', header=0)
XTEST = pd.read_csv(TestFname, header=0)

#If you comment this out, then all Xs will be considered.
XTEST = XTEST.loc[:, [x1name, x2name]]

XTESTnp = XTEST.values
#print(type(XTESTnp))
#<class 'numpy.ndarray'>

#print(XTESTnp)
'''
[[5. 3.]
 [7. 4.]]
'''

print(pd.DataFrame(XTESTnp).head())
'''
     0    1
0  5.0  3.0
1  7.0  4.0
'''

# The model was trained on standardized features, so the test data must be
# transformed with the scaler that was fitted on the training data.
XTEST_std = scaler.transform(XTESTnp)


########## Predict Observations' Class
print(model.predict(XTEST_std))


########## Probabilities in Each Class for Test Data
print(model.predict_proba(XTEST_std))
'''
[[5.93479270e-03 6.37847758e-01 3.56217450e-01]
 [7.50267624e-17 7.48974219e-07 9.99999251e-01]]
'''
#i.e., roughly 0.6% / 63.8% / 35.6% for the first observation,
#and ~0% / ~0% / ~100% for the second.
#0(=setosa), 1(=versicolor), 2(=virginica)

with open('model.predict_proba.txt', 'w') as f:
    print(model.predict_proba(XTEST_std), file=f)

#print(iris.target_names)
#['setosa' 'versicolor' 'virginica']
#0(=setosa), 1(=versicolor), 2(=virginica)

yclass = pd.DataFrame(ynp).drop_duplicates()
yclass = yclass.reset_index(drop=True)
#
#print(type(yclass))
#<class 'pandas.core.frame.DataFrame'>
#
#print(yclass)
'''
   0
0  0
1  1
2  2
'''


########## Pair Plot
sns.pairplot(Xy, hue=yname, height=2)
plt.savefig('Figure_1_Pair_Plot.png')
plt.show()


#################### Multinomial Logistic Regression (Two Independent X) ####################

########## Data For Logistic Regression
#print(Xy)
#print(Xy.columns)
'''
Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'Species'],
      dtype='object')
'''
#print(Xy[x1name])
#print(Xy[x2name])
#print(Xy.loc[:, [x1name, x2name]])

X = Xy.loc[:, [x1name, x2name]]
#print(type(X))
#<class 'pandas.core.frame.DataFrame'>

X = X.values
#print(type(X))
#<class 'numpy.ndarray'>

y = y.values.ravel()
#print(type(y))
#<class 'numpy.ndarray'>


########## A Function for Decision Boundary Drawing
# graph common settings
h = .02    # step size in the mesh
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

def decision_boundary(clf, X, y, ax, title, x1name, x2name):
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # Plot also the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

    # labels
    ax.set_title(title)
    ax.set_xlabel(x1name)
    ax.set_ylabel(x2name)

    # Test Data
    ax.scatter(XTESTnp[:, 0], XTESTnp[:, 1])


########## Logistic Regression
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

#C is a parameter for regularization. A larger C means weaker regularization,
#which could cause over-fitting.
for ax, C in zip(axes, [0.01, 1, 100]):
    title = "C=%s" % C
    clf = LogisticRegression(C=C)
    decision_boundary(clf, X, y, ax, title, x1name, x2name)

plt.savefig('Figure_2_Logistic_Regression.png')
plt.show()
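Since the model above is trained on standardized features, every new observation has to pass through the same fitted StandardScaler before predict_proba is called. One way to make that hard to get wrong is scikit-learn's Pipeline, which chains the scaler and the classifier so that fit and predict always apply the same transformation. Below is a minimal sketch under the same file layout as above; the file name, variable names, and class-name column labels are illustrative, not from mlrprob.py.

pipeline_sketch.py

# Chain StandardScaler and LogisticRegression so the test data is
# automatically transformed with the training-set statistics.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd

Xy = pd.read_csv('Xy.csv', header=0)
X = Xy.loc[:, ['sepal_length', 'petal_length']].values
y = Xy['Species'].values

pipe = make_pipeline(StandardScaler(),
                     LogisticRegression(random_state=0, solver='newton-cg'))
pipe.fit(X, y)

XTEST = pd.read_csv('XTEST.csv', header=0).loc[:, ['sepal_length', 'petal_length']].values

# Probabilities as a labeled table; column order follows pipe.classes_ ([0, 1, 2])
proba = pd.DataFrame(pipe.predict_proba(XTEST),
                     columns=['setosa(0)', 'versicolor(1)', 'virginica(2)'])
print(proba)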
Figures
Figure_1_Pair_Plot.png (pair plot of the four features in Xy.csv, colored by Species)
Figure_2_Logistic_Regression.png (decision boundaries of the two-feature logistic regression for C=0.01, 1, and 100, with the two test points overlaid)