
Saturday, July 13, 2019

Python: Linear Regression and Regularized Logistic Regression


0_runme.txt

##### Open Terminal on Mac OS

# Run the following commands on Terminal.

#Current working directory
pwd
#For instance, your MacOS environment goes like this:
#/Users/xxx/


#Change your working directory
#cd /Users/xxx/Downloads


which python3
#For instance, my MacOS environment goes like this:
#/Library/Frameworks/Python.framework/Versions/3.7/bin/python3


which pip3
#For instance, my MacOS environment goes like this:
#/Library/Frameworks/Python.framework/Versions/3.7/bin/pip3


#Run after connecting to the Internet
pip3 install matplotlib
pip3 install sklearn
pip3 install dtreeviz
pip3 install IPython

pip3 install pandas
pip3 install scipy



python3 -V
#For instance, my MacOS environment goes like this:
#Python 3.7.4


#Starting Python 3
#python3
#You do not need this here, since you're running the .py scripts from Terminal rather than interactively in the Python shell (IDLE).



#Download files:
#ex1data1.txt
#ex1data2.txt
#ex2data1.txt
#ex2data2.txt
#
#From:
#https://github.com/LilianYe/Andrew-Ng-Machine-Learning-Programming-solutions-/find/master


#Running Python py scripts
python3 1a_liner_regression_w_one_variable.py
python3 1b_liner_regression_w_multiple_variables.py

python3 2.1_logistic_regression_or_classification.py
python3 2.2_regularized_logistic_regression.py



1a_liner_regression_w_one_variable.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 1)
########## Linear Regression with One Variable
#
# Reference:
#https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-1-6b8dd1c73d80
#
#Here we will implement linear regression with one variable to predict profits for a food truck.
#
#ex1data1.txt
#contains the dataset for our linear regression exercise.
#column data
#1      population of a city
#2      profit of a food truck in that city


# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



###Reading and Plotting the data
data = pd.read_csv('ex1data1.txt', header = None) #read from dataset
X = data.iloc[:,0] # read first column
y = data.iloc[:,1] # read second column
m = len(y) # number of training example
data.head() # view first few rows of the data

plt.scatter(X, y)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.savefig('fig_1a.1.png')    # Save an image file
plt.show()


###Adding the intercept term
X = X[:,np.newaxis]    #(*1)
y = y[:,np.newaxis]    #(*1)
theta = np.zeros([2,1])    #set one initial parameter theta to 0
iterations = 1500
alpha = 0.01    # set another initial parameter, the learning rate alpha, to 0.01
ones = np.ones((m,1))
X = np.hstack((ones, X)) # adding the intercept term

#(*1)
#Note on np.newaxis:
# When you read the data into X and y, you will observe that they are rank-1 arrays.
#A rank-1 array has a shape of (m, ), whereas a rank-2 array has a shape of (m, 1).
#When operating on arrays it's good to convert rank-1 arrays to rank-2 arrays, because rank-1 arrays often give unexpected results.
#To convert a rank-1 array to a rank-2 array, use someArray[:,np.newaxis].
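
# A minimal sketch (not part of the original exercise) illustrating the two shapes:
a = np.array([1.0, 2.0, 3.0])      # rank-1 array: a.shape == (3,)
b = a[:, np.newaxis]               # rank-2 array: b.shape == (3, 1)
print(a.shape, b.shape)            # prints: (3,) (3, 1)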


###Computing the cost
def computeCost(X, y, theta):
    temp = np.dot(X, theta) - y
    return np.sum(np.power(temp, 2)) / (2*m)
J = computeCost(X, y, theta)
print(J)
#You should expect to see a cost of 32.07. More precisely, 32.072733877455676.


###Finding the optimal parameters using Gradient Descent
def gradientDescent(X, y, theta, alpha, iterations):
    for _ in range(iterations):
        temp = np.dot(X, theta) - y
        temp = np.dot(X.T, temp)
        theta = theta - (alpha/m) * temp
    return theta
theta = gradientDescent(X, y, theta, alpha, iterations)
print(theta)
#Expected theta values [-3.6303, 1.1664]
#Technically,
#[[-3.63029144]
# [ 1.16636235]]
#
# So, for instance, the first row of actual data goes like this.
# Population (10,000)    Profit ($10,000)
# 6.1101 17.592
#
# An estimated profit for this population is
# -3.6303 * 1 + 1.1664 * 6.1101 = 3.49652064
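#
# A quick numeric check of that estimate (a minimal sketch, not part of the original exercise),
# using the full-precision theta computed above:
print(np.dot(np.array([1, 6.1101]), theta))    # about 3.4963, i.e. a predicted profit of roughly $35,000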

#We now have the optimized value of theta. Use this value in the above cost function.

J = computeCost(X, y, theta)
print(J)
#It should give you a value of 4.483 (4.483388256587726) which is much better than 32.07



### Plot showing the best fit line
plt.scatter(X[:,1], y)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.plot(X[:,1], np.dot(X, theta))
plt.savefig('fig_1a.2.png')    # Save an image file
plt.show()


1b_liner_regression_w_multiple_variables.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 1)
########## Linear Regression with multiple variables
#(also called Multivariate Linear Regression)


# ex1data2.txt
# a training set of housing prices in Portland, Oregon
# column data
# 1      size of the house (in square feet)
# 2      number of bedrooms
# 3      price of the house


### import
import numpy as np
import pandas as pd


### data loading
data = pd.read_csv('ex1data2.txt', sep = ',', header = None)
X = data.iloc[:,0:2] # read first two columns into X
y = data.iloc[:,2] # read the third column into y
m = len(y) # no. of training samples
data.head()


### Feature Normalization
X = (X - np.mean(X))/np.std(X)


###Adding the intercept term and initializing parameters
ones = np.ones((m,1))
X = np.hstack((ones, X))
alpha = 0.01
num_iters = 400
theta = np.zeros((3,1))
y = y[:,np.newaxis]


###Computing the cost
def computeCostMulti(X, y, theta):
    temp = np.dot(X, theta) - y
    return np.sum(np.power(temp, 2)) / (2*m)
J = computeCostMulti(X, y, theta)
print(J)
#You should expect to see a cost of 65591548106.45744.


###Finding the optimal parameters using Gradient Descent
def gradientDescentMulti(X, y, theta, alpha, iterations):
    m = len(y)
    for _ in range(iterations):
        temp = np.dot(X, theta) - y
        temp = np.dot(X.T, temp)
        theta = theta - (alpha/m) * temp
    return theta
theta = gradientDescentMulti(X, y, theta, alpha, num_iters)
print(theta)
# your optimal parameters will be [[334302.06399328],[ 99411.44947359], [3267.01285407]]
# For instance, if you look at the sample data in the first row,
# 2104 3 399900
# first column's average: 2000.68085106383
# first column's SD: 794.70235353389
# second column's average: 3.17021276595745
# second column's SD: 0.7609818867801
#
# So, the first column (2104-2000.68085106383)/794.70235353389 = 0.1300 (SD)
# Then the second column (3-3.17021276595745)/0.7609818867801 = -0.2237 (SD)

# 334302.06399328 * 1 + 99411.44947359 * 0.1300 + 3267.01285407 * (-0.2237)
# = 346494.721649391 (estimated data) ~ 399,900 (actual data)
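
# A quick numeric check of that estimate (a minimal sketch, not part of the original exercise),
# plugging in the rounded normalized values above for the first training example (2104 sq ft, 3 bedrooms):
print(np.dot(np.array([1, 0.1300, -0.2237]), theta))    # about 346,495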


#We now have the optimized value of theta. Use this value in the above cost function.
J = computeCostMulti(X, y, theta)
print(J)
#This should give you a value of 2105448288.6292474 which is much better than 65591548106.45744



2.1_logistic_regression_or_classification.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 2.1)
########## Logistic Regression or Classification (part 2.1)
#
# Reference:
# https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-2-1-1a666f049ad6
#
#


##### Logistic Regression


### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt    # more on this later



### data
#ex2data1.txt
#column data
#1      Exam 1 score
#2      Exam 2 score
#3      0 (fail) or 1 (pass)

data = pd.read_csv('ex2data1.txt', header = None)
X = data.iloc[:,:-1]
y = data.iloc[:,2]
data.head()


### plotting

mask = y == 1
adm = plt.scatter(X[mask][0].values, X[mask][1].values)
not_adm = plt.scatter(X[~mask][0].values, X[~mask][1].values)
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend((adm, not_adm), ('Admitted', 'Not admitted'))
plt.savefig('fig_2.1.1.png')    # Save an image file
plt.show()


### Implementation

## Sigmoid Function

def sigmoid(x):
  return 1/(1+np.exp(-x))

## Cost Function
def costFunction(theta, X, y):
    J = (-1/m) * np.sum(np.multiply(y, np.log(sigmoid(X @ theta)))
        + np.multiply((1-y), np.log(1 - sigmoid(X @ theta))))
    return J


## Gradient Function
def gradient(theta, X, y):
    return ((1/m) * X.T @ (sigmoid(X @ theta) - y))
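
# For reference, the cost and gradient implemented above are (with h = sigmoid(X @ theta)):
#   J(theta) = -(1/m) * sum( y*log(h) + (1-y)*log(1-h) )
#   gradient =  (1/m) * X.T @ (h - y)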

(m, n) = X.shape
X = np.hstack((np.ones((m,1)), X))
y = y[:, np.newaxis]
theta = np.zeros((n+1,1)) # intializing theta with all zeros
J = costFunction(theta, X, y)
print(J)
# This should give us a value of 0.693 for J.
# More precisely, 0.6931471805599453 (= ln 2: with theta = 0, every prediction is sigmoid(0) = 0.5, so the cost is -log(0.5) = ln 2).


### Learning parameters using fmin_tnc
temp = opt.fmin_tnc(func = costFunction,
                    x0 = theta.flatten(),fprime = gradient,
                    args = (X, y.flatten()))
#the output of above function is a tuple whose first element #contains the optimized values of theta
theta_optimized = temp[0]
print(theta_optimized)
# The above code should give [-25.16131862, 0.20623159, 0.20147149].

J = costFunction(theta_optimized[:,np.newaxis], X, y)
print(J)
#You should see a value of 0.203 (0.20349770158947486).
# Compare this with the cost 0.693 obtained using initial theta.


### Plotting Decision Boundary (Optional)
# The boundary is the line where theta0 + theta1*x1 + theta2*x2 = 0, i.e. x2 = -(theta0 + theta1*x1)/theta2.
plot_x = [np.min(X[:,1]) - 2, np.max(X[:,1]) + 2]
plot_y = -1/theta_optimized[2]*(theta_optimized[0]
          + np.dot(theta_optimized[1],plot_x))
mask = y.flatten() == 1
adm = plt.scatter(X[mask][:,1], X[mask][:,2])
not_adm = plt.scatter(X[~mask][:,1], X[~mask][:,2])
decision_boun = plt.plot(plot_x, plot_y)
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend((adm, not_adm), ('Admitted', 'Not admitted'))
plt.savefig('fig_2.1.2.png')    # Save an image file
plt.show()


def accuracy(X, y, theta, cutoff):
    pred = sigmoid(np.dot(X, theta)) >= cutoff
    acc = np.mean(pred == y)
    print(acc * 100)
accuracy(X, y.flatten(), theta_optimized, 0.5)

# This should give us an accuracy score of 89% . Hmm… not bad.


2.2_regularized_logistic_regression.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 2.2)
########## Regularized Logistic Regression (part 2.2)
#
# Reference:
# https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-2-2-dceff1a12a12
#
#


##### Regularized logistic regression


### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt    # more on this later


### data
#ex2data2.txt
#column data
#1      test 1 result
#2      test 2 result
#3      0 (rejected) or 1 (accepted)

data = pd.read_csv('ex2data2.txt', header = None)
X = data.iloc[:,:-1]
y = data.iloc[:,2]
data.head()


### plotting
mask = y == 1
passed = plt.scatter(X[mask][0].values, X[mask][1].values)
failed = plt.scatter(X[~mask][0].values, X[~mask][1].values)
plt.xlabel('Microchip Test1')
plt.ylabel('Microchip Test2')
plt.legend((passed, failed), ('Passed', 'Failed'))
plt.savefig('fig_2.2.1.png')    # Save an image file
plt.show()


### Feature mapping
def mapFeature(X1, X2):
    degree = 6
    out = np.ones(X1.shape[0])[:,np.newaxis]
    for i in range(1, degree+1):
        for j in range(i+1):
            out = np.hstack((out, np.multiply(np.power(X1, i-j), np.power(X2, j))[:,np.newaxis]))
    return out
X = mapFeature(X.iloc[:,0], X.iloc[:,1])
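
# After the mapping, each example has 28 columns: the bias term plus every term
# X1^(i-j) * X2^j with 1 <= i <= 6, 0 <= j <= i. A quick check (a minimal sketch,
# not part of the original exercise):
print(X.shape)    # expected: (118, 28) for ex2data2.txt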


### Implementation

## Sigmoid Function
def sigmoid(x):
  return 1/(1+np.exp(-x))


## Cost Function
def lrCostFunction(theta_t, X_t, y_t, lambda_t):
    m = len(y_t)
    J = (-1/m) * (y_t.T @ np.log(sigmoid(X_t @ theta_t)) + (1 - y_t.T) @ np.log(1 - sigmoid(X_t @ theta_t)))
    reg = (lambda_t/(2*m)) * (theta_t[1:].T @ theta_t[1:])
    J = J + reg
    return J

## Gradient Function
def lrGradientDescent(theta, X, y, lambda_t):
    m = len(y)
    grad = (1/m) * X.T @ (sigmoid(X @ theta) - y)
    grad[1:] = grad[1:] + (lambda_t / m) * theta[1:]
    return grad
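
# For reference, the regularized cost and gradient implemented above are (with h = sigmoid(X @ theta)):
#   J(theta) = -(1/m) * [ y.T @ log(h) + (1-y).T @ log(1-h) ] + (lambda/(2m)) * sum_{j>=1} theta_j^2
#   grad_j   =  (1/m) * X[:,j].T @ (h - y) + (lambda/m) * theta_j    for j >= 1
#   grad_0   =  (1/m) * X[:,0].T @ (h - y)                           (the intercept theta_0 is not regularized)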

(m, n) = X.shape
y = y[:, np.newaxis]
theta = np.zeros((n,1))
lmbda = 1
J = lrCostFunction(theta, X, y, lmbda)
print(J)
#This gives us a value of 0.69314718 (ln 2 again, since theta is initialized to all zeros).


## Learning parameters using fmin_tnc
output = opt.fmin_tnc(func = lrCostFunction, x0 = theta.flatten(), fprime = lrGradientDescent, \
                         args = (X, y.flatten(), lmbda))
theta = output[0]
print(theta) # theta contains the optimized values


## Accuracy of model
pred = sigmoid(np.dot(X, theta)) >= 0.5
print(np.mean(pred == y.flatten()) * 100)
# This gives our model accuracy as 83.05% (83.05084745762711).


## Plotting Decision Boundary (optional)
u = np.linspace(-1, 1.5, 50)
v = np.linspace(-1, 1.5, 50)
z = np.zeros((len(u), len(v)))
def mapFeatureForPlotting(X1, X2):
    degree = 6
    out = np.ones(1)
    for i in range(1, degree+1):
        for j in range(i+1):
            out = np.hstack((out, np.multiply(np.power(X1, i-j), np.power(X2, j))))
    return out
for i in range(len(u)):
    for j in range(len(v)):
        z[i,j] = np.dot(mapFeatureForPlotting(u[i], v[j]), theta)
mask = y.flatten() == 1
X = data.iloc[:,:-1]
passed = plt.scatter(X[mask][0], X[mask][1])
failed = plt.scatter(X[~mask][0], X[~mask][1])
plt.contour(u,v,z,0)
plt.xlabel('Microchip Test1')
plt.ylabel('Microchip Test2')
plt.legend((passed, failed), ('Passed', 'Failed'))
plt.savefig('fig_2.2.2.png')    # Save an image file
plt.show()



data

ex1data1.txt
6.1101,17.592
5.5277,9.1302
8.5186,13.662
7.0032,11.854
5.8598,6.8233
8.3829,11.886
7.4764,4.3483
8.5781,12
6.4862,6.5987
5.0546,3.8166
5.7107,3.2522
14.164,15.505
5.734,3.1551
8.4084,7.2258
5.6407,0.71618
5.3794,3.5129
6.3654,5.3048
5.1301,0.56077
6.4296,3.6518
7.0708,5.3893
6.1891,3.1386
20.27,21.767
5.4901,4.263
6.3261,5.1875
5.5649,3.0825
18.945,22.638
12.828,13.501
10.957,7.0467
13.176,14.692
22.203,24.147
5.2524,-1.22
6.5894,5.9966
9.2482,12.134
5.8918,1.8495
8.2111,6.5426
7.9334,4.5623
8.0959,4.1164
5.6063,3.3928
12.836,10.117
6.3534,5.4974
5.4069,0.55657
6.8825,3.9115
11.708,5.3854
5.7737,2.4406
7.8247,6.7318
7.0931,1.0463
5.0702,5.1337
5.8014,1.844
11.7,8.0043
5.5416,1.0179
7.5402,6.7504
5.3077,1.8396
7.4239,4.2885
7.6031,4.9981
6.3328,1.4233
6.3589,-1.4211
6.2742,2.4756
5.6397,4.6042
9.3102,3.9624
9.4536,5.4141
8.8254,5.1694
5.1793,-0.74279
21.279,17.929
14.908,12.054
18.959,17.054
7.2182,4.8852
8.2951,5.7442
10.236,7.7754
5.4994,1.0173
20.341,20.992
10.136,6.6799
7.3345,4.0259
6.0062,1.2784
7.2259,3.3411
5.0269,-2.6807
6.5479,0.29678
7.5386,3.8845
5.0365,5.7014
10.274,6.7526
5.1077,2.0576
5.7292,0.47953
5.1884,0.20421
6.3557,0.67861
9.7687,7.5435
6.5159,5.3436
8.5172,4.2415
9.1802,6.7981
6.002,0.92695
5.5204,0.152
5.0594,2.8214
5.7077,1.8451
7.6366,4.2959
5.8707,7.2029
5.3054,1.9869
8.2934,0.14454
13.394,9.0551
5.4369,0.61705

ex1data2.txt
2104,3,399900
1600,3,329900
2400,3,369000
1416,2,232000
3000,4,539900
1985,4,299900
1534,3,314900
1427,3,198999
1380,3,212000
1494,3,242500
1940,4,239999
2000,3,347000
1890,3,329999
4478,5,699900
1268,3,259900
2300,4,449900
1320,2,299900
1236,3,199900
2609,4,499998
3031,4,599000
1767,3,252900
1888,2,255000
1604,3,242900
1962,4,259900
3890,3,573900
1100,3,249900
1458,3,464500
2526,3,469000
2200,3,475000
2637,3,299900
1839,2,349900
1000,1,169900
2040,4,314900
3137,3,579900
1811,4,285900
1437,3,249900
1239,3,229900
2132,4,345000
4215,4,549000
2162,4,287000
1664,2,368500
2238,3,329900
2567,4,314000
1200,3,299000
852,2,179900
1852,4,299900
1203,3,239500
ex2data1.txt
34.62365962451697,78.0246928153624,0
30.28671076822607,43.89499752400101,0
35.84740876993872,72.90219802708364,0
60.18259938620976,86.30855209546826,1
79.0327360507101,75.3443764369103,1
45.08327747668339,56.3163717815305,0
61.10666453684766,96.51142588489624,1
75.02474556738889,46.55401354116538,1
76.09878670226257,87.42056971926803,1
84.43281996120035,43.53339331072109,1
95.86155507093572,38.22527805795094,0
75.01365838958247,30.60326323428011,0
82.30705337399482,76.48196330235604,1
69.36458875970939,97.71869196188608,1
39.53833914367223,76.03681085115882,0
53.9710521485623,89.20735013750205,1
69.07014406283025,52.74046973016765,1
67.94685547711617,46.67857410673128,0
70.66150955499435,92.92713789364831,1
76.97878372747498,47.57596364975532,1
67.37202754570876,42.83843832029179,0
89.67677575072079,65.79936592745237,1
50.534788289883,48.85581152764205,0
34.21206097786789,44.20952859866288,0
77.9240914545704,68.9723599933059,1
62.27101367004632,69.95445795447587,1
80.1901807509566,44.82162893218353,1
93.114388797442,38.80067033713209,0
61.83020602312595,50.25610789244621,0
38.78580379679423,64.99568095539578,0
61.379289447425,72.80788731317097,1
85.40451939411645,57.05198397627122,1
52.10797973193984,63.12762376881715,0
52.04540476831827,69.43286012045222,1
40.23689373545111,71.16774802184875,0
54.63510555424817,52.21388588061123,0
33.91550010906887,98.86943574220611,0
64.17698887494485,80.90806058670817,1
74.78925295941542,41.57341522824434,0
34.1836400264419,75.2377203360134,0
83.90239366249155,56.30804621605327,1
51.54772026906181,46.85629026349976,0
94.44336776917852,65.56892160559052,1
82.36875375713919,40.61825515970618,0
51.04775177128865,45.82270145776001,0
62.22267576120188,52.06099194836679,0
77.19303492601364,70.45820000180959,1
97.77159928000232,86.7278223300282,1
62.07306379667647,96.76882412413983,1
91.56497449807442,88.69629254546599,1
79.94481794066932,74.16311935043758,1
99.2725269292572,60.99903099844988,1
90.54671411399852,43.39060180650027,1
34.52451385320009,60.39634245837173,0
50.2864961189907,49.80453881323059,0
49.58667721632031,59.80895099453265,0
97.64563396007767,68.86157272420604,1
32.57720016809309,95.59854761387875,0
74.24869136721598,69.82457122657193,1
71.79646205863379,78.45356224515052,1
75.3956114656803,85.75993667331619,1
35.28611281526193,47.02051394723416,0
56.25381749711624,39.26147251058019,0
30.05882244669796,49.59297386723685,0
44.66826172480893,66.45008614558913,0
66.56089447242954,41.09209807936973,0
40.45755098375164,97.53518548909936,1
49.07256321908844,51.88321182073966,0
80.27957401466998,92.11606081344084,1
66.74671856944039,60.99139402740988,1
32.72283304060323,43.30717306430063,0
64.0393204150601,78.03168802018232,1
72.34649422579923,96.22759296761404,1
60.45788573918959,73.09499809758037,1
58.84095621726802,75.85844831279042,1
99.82785779692128,72.36925193383885,1
47.26426910848174,88.47586499559782,1
50.45815980285988,75.80985952982456,1
60.45555629271532,42.50840943572217,0
82.22666157785568,42.71987853716458,0
88.9138964166533,69.80378889835472,1
94.83450672430196,45.69430680250754,1
67.31925746917527,66.58935317747915,1
57.23870631569862,59.51428198012956,1
80.36675600171273,90.96014789746954,1
68.46852178591112,85.59430710452014,1
42.0754545384731,78.84478600148043,0
75.47770200533905,90.42453899753964,1
78.63542434898018,96.64742716885644,1
52.34800398794107,60.76950525602592,0
94.09433112516793,77.15910509073893,1
90.44855097096364,87.50879176484702,1
55.48216114069585,35.57070347228866,0
74.49269241843041,84.84513684930135,1
89.84580670720979,45.35828361091658,1
83.48916274498238,48.38028579728175,1
42.2617008099817,87.10385094025457,1
99.31500880510394,68.77540947206617,1
55.34001756003703,64.9319380069486,1
74.77589300092767,89.52981289513276,1
ex2data2.txt
0.051267,0.69956,1
-0.092742,0.68494,1
-0.21371,0.69225,1
-0.375,0.50219,1
-0.51325,0.46564,1
-0.52477,0.2098,1
-0.39804,0.034357,1
-0.30588,-0.19225,1
0.016705,-0.40424,1
0.13191,-0.51389,1
0.38537,-0.56506,1
0.52938,-0.5212,1
0.63882,-0.24342,1
0.73675,-0.18494,1
0.54666,0.48757,1
0.322,0.5826,1
0.16647,0.53874,1
-0.046659,0.81652,1
-0.17339,0.69956,1
-0.47869,0.63377,1
-0.60541,0.59722,1
-0.62846,0.33406,1
-0.59389,0.005117,1
-0.42108,-0.27266,1
-0.11578,-0.39693,1
0.20104,-0.60161,1
0.46601,-0.53582,1
0.67339,-0.53582,1
-0.13882,0.54605,1
-0.29435,0.77997,1
-0.26555,0.96272,1
-0.16187,0.8019,1
-0.17339,0.64839,1
-0.28283,0.47295,1
-0.36348,0.31213,1
-0.30012,0.027047,1
-0.23675,-0.21418,1
-0.06394,-0.18494,1
0.062788,-0.16301,1
0.22984,-0.41155,1
0.2932,-0.2288,1
0.48329,-0.18494,1
0.64459,-0.14108,1
0.46025,0.012427,1
0.6273,0.15863,1
0.57546,0.26827,1
0.72523,0.44371,1
0.22408,0.52412,1
0.44297,0.67032,1
0.322,0.69225,1
0.13767,0.57529,1
-0.0063364,0.39985,1
-0.092742,0.55336,1
-0.20795,0.35599,1
-0.20795,0.17325,1
-0.43836,0.21711,1
-0.21947,-0.016813,1
-0.13882,-0.27266,1
0.18376,0.93348,0
0.22408,0.77997,0
0.29896,0.61915,0
0.50634,0.75804,0
0.61578,0.7288,0
0.60426,0.59722,0
0.76555,0.50219,0
0.92684,0.3633,0
0.82316,0.27558,0
0.96141,0.085526,0
0.93836,0.012427,0
0.86348,-0.082602,0
0.89804,-0.20687,0
0.85196,-0.36769,0
0.82892,-0.5212,0
0.79435,-0.55775,0
0.59274,-0.7405,0
0.51786,-0.5943,0
0.46601,-0.41886,0
0.35081,-0.57968,0
0.28744,-0.76974,0
0.085829,-0.75512,0
0.14919,-0.57968,0
-0.13306,-0.4481,0
-0.40956,-0.41155,0
-0.39228,-0.25804,0
-0.74366,-0.25804,0
-0.69758,0.041667,0
-0.75518,0.2902,0
-0.69758,0.68494,0
-0.4038,0.70687,0
-0.38076,0.91886,0
-0.50749,0.90424,0
-0.54781,0.70687,0
0.10311,0.77997,0
0.057028,0.91886,0
-0.10426,0.99196,0
-0.081221,1.1089,0
0.28744,1.087,0
0.39689,0.82383,0
0.63882,0.88962,0
0.82316,0.66301,0
0.67339,0.64108,0
1.0709,0.10015,0
-0.046659,-0.57968,0
-0.23675,-0.63816,0
-0.15035,-0.36769,0
-0.49021,-0.3019,0
-0.46717,-0.13377,0
-0.28859,-0.060673,0
-0.61118,-0.067982,0
-0.66302,-0.21418,0
-0.59965,-0.41886,0
-0.72638,-0.082602,0
-0.83007,0.31213,0
-0.72062,0.53874,0
-0.59389,0.49488,0
-0.48445,0.99927,0
-0.0063364,0.99927,0
0.63265,-0.030612,0


After running the Python scripts above, you'll get these png files in your working directory:






Saturday, July 6, 2019

R: Logistic Regression (OLD)

data.csv
number,age,blood_pressure,lung_capacity,sex,illness,weight
1,22,110,4300,M,1,79
2,23,128,4500,M,1,65
3,24,104,3900,F,0,53
4,25,112,3000,F,0,45
5,27,108,4800,M,0,80
6,28,126,3800,F,0,50
7,28,126,3800,F,1,43
8,29,104,4000,F,1,55
9,30,125,3600,F,1,47
10,31,120,3400,F,1,49
11,32,116,3600,M,1,64
12,32,124,3900,M,0,61
13,33,106,3100,F,0,48
14,33,134,2900,F,0,41
15,34,128,4100,M,1,70
16,36,128,3420,M,1,55
17,37,116,3800,M,1,70
18,37,132,4150,M,1,90
19,38,134,2700,F,0,39
20,39,116,4550,M,1,86
21,40,120,2900,F,1,50
22,42,130,3950,F,1,65
23,46,126,3100,M,0,58
24,49,140,3000,F,0,45
25,50,156,3400,M,1,60
26,53,124,3400,M,1,71
27,56,118,3470,M,1,62
28,58,144,2800,M,0,51
29,64,142,2500,F,1,40
30,65,144,2350,F,0,42


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console


##### Background
#
# We use machine learning models to learn from training data.
# A trained machine learning model is expected to predict reliably even on new data (i.e., data different from the training data above).
# If the machine learning model over-fits the training data (including noise and outliers),
# the model's prediction accuracy on new data can suffer.
# This is because the model learns noise, outliers, and the genuinely meaningful data points alike, and regards the entire training data as meaningful.
# In other words, the model is overly optimized so that it can also explain the noise and outliers.
#
# The main reasons for over-fitting are (1) too few data points, (2) too many explanatory variables, and (3) excessively large parameters (coefficients).
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
#     L1 (Lasso): the penalty term is the sum of the absolute parameter values of the model.
#                 By driving some weights to exactly 0, it drops unnecessary features.
#                 "Dimension compression that deletes unnecessary explanatory variables"
#     L2 (Ridge): the penalty term is the sum of the squared parameter values of the model.
#                 This yields a smoother model.
#                 "More accurate prediction while avoiding over-fitting"
# Under both L1 regularization and L2 regularization,
# simpler models (smaller parameters, fewer effective dimensions) incur a smaller penalty.
# If the training data contain exceptional points such as noise and outliers,
# a model would have to increase its complexity (dimensions) to explain them,
# and the penalty term discourages exactly that increase.
# (L1 and L2 can also be combined as a linear sum; this is elastic net regularization.)
#
#
# Regression:
#     A certain objective variable Y is predicted by using weighted explanatory variables X {x0, x1, x2, ..., xn}
#     Predicted Y = hθ(X) = θ0 * x0 + θ1 * x1 + ... + θn * xn = θT X
#
# Logistic regression:
#     Generally, hθ(X) above is a continuous value without any upper and lower boundaries.
#     To make 0 ≤ hθ(X) ≤ 1,
#     Logistic Function (AKA Sigmoid Function) g(z) = 1/(1 + e^(−z))
#     When doing logistic regressions,
#     hθ(X) = 1/(1 + e^(−θT X))
#
#     hθ(x)≥0.5, then Y = 1
#     hθ(x)<0.5, then Y = 0
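
# In formulas (a rough sketch; lambda is the regularization strength, and the intercept θ0 is usually not penalized):
#     L1 (Lasso) penalty:  lambda * ( |θ1| + |θ2| + ... + |θn| )
#     L2 (Ridge) penalty:  lambda * ( θ1^2 + θ2^2 + ... + θn^2 )
#     Elastic net: a weighted linear combination of the two penalties above.
# The chosen penalty is added to the model's loss (e.g., the logistic-regression cost) before it is minimized.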


# Set your working directory on your R Console
##### The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/XXX/Downloads/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

#source('2_library.r')

source('3_logi_fun.r')

source('4_logistic_regression.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

#install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz')
#tgz file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html


2_library.r
########## library setting

#library('glmnet')


3_logi_fun.r
logi_fun <- function(data, file, disease){
  ans <- glm(data$Y ~ ., data = data, family = binomial) # family = binomial for logistic regression
  s.ans <- summary(ans)
  coe <- s.ans$coefficient
  RR <- exp(coe[,1])
  RRlow <- exp(coe[,1] - 1.96*coe[,2])
  RRup <- exp(coe[,1] + 1.96*coe[,2])
  N <- nrow(data)
  aic <- AIC(ans)
  result <- cbind(coe, RR, RRlow, RRup, aic, N)
  colnames(result)[6:7] <- c("RR95%CI.low", "RR95%CI.up")
  if(nrow(result) >= 2){
    result[2:nrow(result), 8:9] <- ""
  }
  write.table(disease, file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(matrix(c("", colnames(result)), nrow=1), file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(result, file, append=T, quote=F, sep=",", row.names=T, col.names=F)
  write.table("", file, append=T, quote=F, sep=",", row.names=F, col.names=F)
}


4_logistic_regression.r
df <- read.csv("data.csv",header=T,row.names=1)

dat <- df[,c(5,1,2,6)]
# 5th column (illness): explained (target) variable
# 1(age),2(blood_pressure),6(weight): explanatory variables

colnames(dat)[1] <- "Y"

logi_fun(dat,"results_logistic_reg.csv","illness")
# See this csv file.
# If the significance level is 0.05, then only "weight" has a Pr (p-value) below 0.05 (0.038665).
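# For example, for "weight" the RR column is exp(0.08019) ≈ 1.0835,
# i.e., each additional kilogram of body weight multiplies the estimated odds of illness by about 1.08
# (95% CI roughly 1.004 to 1.169, per the RR95%CI.low / RR95%CI.up columns).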





results_logistic_reg.csv
illness
,Estimate,Std. Error,z value,Pr(>|z|),RR,RR95%CI.low,RR95%CI.up,aic,N
(Intercept),-6.27037164366909,5.6269544356187,-1.11434555147231,0.265130972267466,0.00189152547909671,3.06938865463388e-08,116.566164818212,42.6982377386664,30
age,0.00171984691269398,0.0447696844915921,0.0384154351817462,0.969356454570465,1.00172132669761,0.91756786481243,1.09359280641977,,
blood_pressure,0.016973167573557,0.0445691192249947,0.380827978400756,0.703330897109487,1.01711803021436,0.932037428183773,1.10996517532894,,
weight,0.0801901371302698,0.0387817328646643,2.06772960378298,0.038665456531477,1.08349306035207,1.004186680477,1.16906271976586,,





R: Logistic Regression (without regularization) - simple sample

data1.csv
y,x1,x2
0,3.6,60.1
1,4.1,52.0
0,3.7,62.5
0,4.9,60.6
1,4.4,54.1
0,3.6,63.6
1,4.9,68.0
0,4.8,38.2
1,4.1,59.5
0,4.3,47.3

binary.csv
admit,gre,gpa,rank
0,380,3.61,3
1,660,3.67,3
1,800,4,1
1,640,3.19,4
0,520,2.93,4
1,760,3,2
1,560,2.98,1
0,400,3.08,2
1,540,3.39,3
0,700,3.92,2
0,800,4,4
0,440,3.22,1
1,760,4,1
0,700,3.08,2
1,700,4,1
0,480,3.44,3
0,780,3.87,4
0,360,2.56,3
0,800,3.75,2
1,540,3.81,1
0,500,3.17,3
1,660,3.63,2
0,600,2.82,4
0,680,3.19,4
1,760,3.35,2
1,800,3.66,1
1,620,3.61,1
1,520,3.74,4
1,780,3.22,2
0,520,3.29,1
0,540,3.78,4
0,760,3.35,3
0,600,3.4,3
1,800,4,3
0,360,3.14,1
0,400,3.05,2
0,580,3.25,1
0,520,2.9,3
1,500,3.13,2
1,520,2.68,3
0,560,2.42,2
1,580,3.32,2
1,600,3.15,2
0,500,3.31,3
0,700,2.94,2
1,460,3.45,3
1,580,3.46,2
0,500,2.97,4
0,440,2.48,4
0,400,3.35,3
0,640,3.86,3
0,440,3.13,4
0,740,3.37,4
1,680,3.27,2
0,660,3.34,3
1,740,4,3
0,560,3.19,3
0,380,2.94,3
0,400,3.65,2
0,600,2.82,4
1,620,3.18,2
0,560,3.32,4
0,640,3.67,3
1,680,3.85,3
0,580,4,3
0,600,3.59,2
0,740,3.62,4
0,620,3.3,1
0,580,3.69,1
0,800,3.73,1
0,640,4,3
0,300,2.92,4
0,480,3.39,4
0,580,4,2
0,720,3.45,4
0,720,4,3
0,560,3.36,3
1,800,4,3
0,540,3.12,1
1,620,4,1
0,700,2.9,4
0,620,3.07,2
0,500,2.71,2
0,380,2.91,4
1,500,3.6,3
0,520,2.98,2
0,600,3.32,2
0,600,3.48,2
0,700,3.28,1
1,660,4,2
0,700,3.83,2
1,720,3.64,1
0,800,3.9,2
0,580,2.93,2
1,660,3.44,2
0,660,3.33,2
0,640,3.52,4
0,480,3.57,2
0,700,2.88,2
0,400,3.31,3
0,340,3.15,3
0,580,3.57,3
0,380,3.33,4
0,540,3.94,3
1,660,3.95,2
1,740,2.97,2
1,700,3.56,1
0,480,3.13,2
0,400,2.93,3
0,480,3.45,2
0,680,3.08,4
0,420,3.41,4
0,360,3,3
0,600,3.22,1
0,720,3.84,3
0,620,3.99,3
1,440,3.45,2
0,700,3.72,2
1,800,3.7,1
0,340,2.92,3
1,520,3.74,2
1,480,2.67,2
0,520,2.85,3
0,500,2.98,3
0,720,3.88,3
0,540,3.38,4
1,600,3.54,1
0,740,3.74,4
0,540,3.19,2
0,460,3.15,4
1,620,3.17,2
0,640,2.79,2
0,580,3.4,2
0,500,3.08,3
0,560,2.95,2
0,500,3.57,3
0,560,3.33,4
0,700,4,3
0,620,3.4,2
1,600,3.58,1
0,640,3.93,2
1,700,3.52,4
0,620,3.94,4
0,580,3.4,3
0,580,3.4,4
0,380,3.43,3
0,480,3.4,2
0,560,2.71,3
1,480,2.91,1
0,740,3.31,1
1,800,3.74,1
0,400,3.38,2
1,640,3.94,2
0,580,3.46,3
0,620,3.69,3
1,580,2.86,4
0,560,2.52,2
1,480,3.58,1
0,660,3.49,2
0,700,3.82,3
0,600,3.13,2
0,640,3.5,2
1,700,3.56,2
0,520,2.73,2
0,580,3.3,2
0,700,4,1
0,440,3.24,4
0,720,3.77,3
0,500,4,3
0,600,3.62,3
0,400,3.51,3
0,540,2.81,3
0,680,3.48,3
1,800,3.43,2
0,500,3.53,4
1,620,3.37,2
0,520,2.62,2
1,620,3.23,3
0,620,3.33,3
0,300,3.01,3
0,620,3.78,3
0,500,3.88,4
0,700,4,2
1,540,3.84,2
0,500,2.79,4
0,800,3.6,2
0,560,3.61,3
0,580,2.88,2
0,560,3.07,2
0,500,3.35,2
1,640,2.94,2
0,800,3.54,3
0,640,3.76,3
0,380,3.59,4
1,600,3.47,2
0,560,3.59,2
0,660,3.07,3
1,400,3.23,4
0,600,3.63,3
0,580,3.77,4
0,800,3.31,3
1,580,3.2,2
1,700,4,1
0,420,3.92,4
1,600,3.89,1
1,780,3.8,3
0,740,3.54,1
1,640,3.63,1
0,540,3.16,3
0,580,3.5,2
0,740,3.34,4
0,580,3.02,2
0,460,2.87,2
0,640,3.38,3
1,600,3.56,2
1,660,2.91,3
0,340,2.9,1
1,460,3.64,1
0,460,2.98,1
1,560,3.59,2
0,540,3.28,3
0,680,3.99,3
1,480,3.02,1
0,800,3.47,3
0,800,2.9,2
1,720,3.5,3
0,620,3.58,2
0,540,3.02,4
0,480,3.43,2
1,720,3.42,2
0,580,3.29,4
0,600,3.28,3
0,380,3.38,2
0,420,2.67,3
1,800,3.53,1
0,620,3.05,2
1,660,3.49,2
0,480,4,2
0,500,2.86,4
0,700,3.45,3
0,440,2.76,2
1,520,3.81,1
1,680,2.96,3
0,620,3.22,2
0,540,3.04,1
0,800,3.91,3
0,680,3.34,2
0,440,3.17,2
0,680,3.64,3
0,640,3.73,3
0,660,3.31,4
0,620,3.21,4
1,520,4,2
1,540,3.55,4
1,740,3.52,4
0,640,3.35,3
1,520,3.3,2
1,620,3.95,3
0,520,3.51,2
0,640,3.81,2
0,680,3.11,2
0,440,3.15,2
1,520,3.19,3
1,620,3.95,3
1,520,3.9,3
0,380,3.34,3
0,560,3.24,4
1,600,3.64,3
1,680,3.46,2
0,500,2.81,3
1,640,3.95,2
0,540,3.33,3
1,680,3.67,2
0,660,3.32,1
0,520,3.12,2
1,600,2.98,2
0,460,3.77,3
1,580,3.58,1
1,680,3,4
1,660,3.14,2
0,660,3.94,2
0,360,3.27,3
0,660,3.45,4
0,520,3.1,4
1,440,3.39,2
0,600,3.31,4
1,800,3.22,1
1,660,3.7,4
0,800,3.15,4
0,420,2.26,4
1,620,3.45,2
0,800,2.78,2
0,680,3.7,2
0,800,3.97,1
0,480,2.55,1
0,520,3.25,3
0,560,3.16,1
0,460,3.07,2
0,540,3.5,2
0,720,3.4,3
0,640,3.3,2
1,660,3.6,3
1,400,3.15,2
1,680,3.98,2
0,220,2.83,3
0,580,3.46,4
1,540,3.17,1
0,580,3.51,2
0,540,3.13,2
0,440,2.98,3
0,560,4,3
0,660,3.67,2
0,660,3.77,3
1,520,3.65,4
0,540,3.46,4
1,300,2.84,2
1,340,3,2
1,780,3.63,4
1,480,3.71,4
0,540,3.28,1
0,460,3.14,3
0,460,3.58,2
0,500,3.01,4
0,420,2.69,2
0,520,2.7,3
0,680,3.9,1
0,680,3.31,2
1,560,3.48,2
0,580,3.34,2
0,500,2.93,4
0,740,4,3
0,660,3.59,3
0,420,2.96,1
0,560,3.43,3
1,460,3.64,3
1,620,3.71,1
0,520,3.15,3
0,620,3.09,4
0,540,3.2,1
1,660,3.47,3
0,500,3.23,4
1,560,2.65,3
0,500,3.95,4
0,580,3.06,2
0,520,3.35,3
0,500,3.03,3
0,600,3.35,2
0,580,3.8,2
0,400,3.36,2
0,620,2.85,2
1,780,4,2
0,620,3.43,3
1,580,3.12,3
0,700,3.52,2
1,540,3.78,2
1,760,2.81,1
0,700,3.27,2
0,720,3.31,1
1,560,3.69,3
0,720,3.94,3
1,520,4,1
1,540,3.49,1
0,680,3.14,2
0,460,3.44,2
1,560,3.36,1
0,480,2.78,3
0,460,2.93,3
0,620,3.63,3
0,580,4,1
0,800,3.89,2
1,540,3.77,2
1,680,3.76,3
1,680,2.42,1
1,620,3.37,1
0,560,3.78,2
0,560,3.49,4
0,620,3.63,2
1,800,4,2
0,640,3.12,3
0,540,2.7,2
0,700,3.65,2
1,540,3.49,2
0,540,3.51,2
0,660,4,1
1,480,2.62,2
0,420,3.02,1
1,740,3.86,2
0,580,3.36,2
0,640,3.17,2
0,640,3.51,2
1,800,3.05,2
1,660,3.88,2
1,600,3.38,3
1,620,3.75,2
1,460,3.99,3
0,620,4,2
0,560,3.04,3
0,460,2.63,2
0,700,3.65,2
0,600,3.89,3


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
#setwd('/Users/XXX/Downloads/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

#source('2_library.r')

source('3_logistics_regression.r')

source('4_logistic_regression.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
#


2_library.r
#


3_logistics_regression.r
# data loading
data1 <- read.csv("data1.csv", header=TRUE)

# Logistic Regression
mylogit = glm(y ~ x1 + x2, data=data1,  family=binomial(link="logit"))

# Summary
summary(mylogit)
#=>
# Call:
# glm(formula = y ~ x1 + x2, family = binomial(link ="logit"), data = data1)
# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -1.4754  -0.8584  -0.8007   1.1905   1.5719

# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)
# (Intercept) -9.44589    9.12237  -1.035    0.300
# x1           1.27158    1.49423   0.851    0.395
# x2           0.06424    0.08739   0.735    0.462

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 13.460  on 9  degrees of freedom
# Residual deviance: 12.345  on 7  degrees of freedom
# AIC: 18.345

# Number of Fisher Scoring iterations: 4


# See fitted numbers
fitted(mylogit)
#=>
#         1         2         3         4         5         6         7         8         9        10
# 0.2675020 0.2907201 0.3260736 0.6632579 0.4072105 0.3137831 0.7600997 0.2914609 0.3988916 0.2810008


# Fitted (predicted) values are calculated from the following equation:
# logit(π) = −9.44589 + 1.27158*x1 + 0.06424*x2
# However, as you can see from the Pr(>|z|) column in the summary above, none of these coefficients is statistically significant.
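#
# As a quick check with the rounded coefficients above, take the first row of data1.csv (x1 = 3.6, x2 = 60.1):
#   logit(π) = −9.44589 + 1.27158*3.6 + 0.06424*60.1 ≈ −1.0074
#   π = 1/(1 + e^(1.0074)) ≈ 0.2675, which matches the first fitted value (0.2675020) above.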


4_logistic_regression.r
# Data loading
#mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata <- read.csv("binary.csv")
mydata$rank <- factor(mydata$rank)

# Logistic Regression
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")

# Summary
summary(mylogit)
#=>
# Call:
# glm(formula = admit ~ gre + gpa + rank, family = "binomial", data = mydata)

# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -1.6268  -0.8662  -0.6388   1.1490   2.0790

# Coefficients:
#              Estimate Std. Error z value Pr(>|z|)  
# (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
# gre          0.002264   0.001094   2.070 0.038465 *
# gpa          0.804038   0.331819   2.423 0.015388 *
# rank2       -0.675443   0.316490  -2.134 0.032829 *
# rank3       -1.340204   0.345306  -3.881 0.000104 ***
# rank4       -1.551464   0.417832  -3.713 0.000205 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 499.98  on 399  degrees of freedom
# Residual deviance: 458.52  on 394  degrees of freedom
# AIC: 470.52

# Number of Fisher Scoring iterations: 4

newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response")
newdata2 <- with(mydata, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
    4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(mylogit, newdata = newdata2, type = "link",
    se = TRUE))
newdata3 <- within(newdata3, {
    PredictedProb <- plogis(fit)
    LL <- plogis(fit - (1.96 * se.fit))
    UL <- plogis(fit + (1.96 * se.fit))
})
# Using graphic library
library("ggplot2")
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
    ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
    size = 1)
# Save as an image file
ggsave("image.png")

#Reference
#https://stats.idre.ucla.edu/r/dae/logit-regression/




R: Logistic Regression


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/yoshi/Dropbox/Google Drive/Coding/R/logistic_regression/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_data.r')

source('4_logistic_regression.r')

#Reference
#https://www.datacamp.com/community/tutorials/logistic-regression-R



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

install.packages("ISLR")
#install.packages('ISLR_1.2.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/ISLR/index.html

install.packages("Amelia")
#install.packages('Amelia_1.7.5.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/Amelia/index.html

install.packages("mlbench")
#install.packages('mlbench_2.1-1.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/mlbench/

install.packages("corrplot")
#install.packages('corrplot_0.84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/corrplot/

install.packages("caret")
#install.packages('caret_6.0-84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/caret/


2_library.r
########## library setting

library('ISLR')

library(Amelia)
library(mlbench)

library(corrplot)

library("caret")


3_data.r
#For this tutorial, you're going to work with the Smarket dataset within RStudio.
# The dataset shows daily percentage returns for the S&P 500 stock index between 2001 and 2005.


names(Smarket)
#[1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"      "Volume"    "Today"     "Direction"

head(Smarket)
#  Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
#1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
#2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
#3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
#4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
#5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
#6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up


summary(Smarket)
#      Year           Lag1                Lag2                Lag3                Lag4                Lag5              Volume           Today           Direction
# Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000   Down:602
# 1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648
# Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500   Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500            
# Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138            
# 3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750            
# Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000            


# Response variable: Direction, which shows whether the market went up or down relative to the previous day.



# Visualizing Data
#
# histogram

##### Start: saving as a png file
png("fig_3_data_1.png", width = 1600, height = 1600)

par(mfrow=c(1,8))
for(i in 1:8) {
    hist(Smarket[,i], main=names(Smarket)[i])
}

dev.off()
##### End: saving as a png file

# It's extremely hard to see, but most of the variables show a Gaussian or double Gaussian distribution.



##### Start: saving as a png file
png("fig_3_data_2.png", width = 1600, height = 1600)

par(mfrow=c(1,8))
for(i in 1:8) {
    boxplot(Smarket[,i], main=names(Smarket)[i])
}

dev.off()
##### End: saving as a png file



##### Start: saving as a png file
#png("fig_3_data_3.png", width = 1600, height = 1600)
png("fig_3_data_3.png")

missmap(Smarket, col=c("blue", "red"), legend=FALSE)

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_4.png", width = 1600, height = 1600)
png("fig_3_data_4.png")

correlations <- cor(Smarket[,1:8])
corrplot(correlations, method="circle")

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_5.png", width = 1600, height = 1600)
png("fig_3_data_5.png")

pairs(Smarket, col=Smarket$Direction)

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_6.png", width = 1600, height = 1600)
png("fig_3_data_6.png")

x <- Smarket[,1:8]
y <- Smarket[,9]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales)

dev.off()
##### End: saving as a png file



4_logistic_regression.r
########## Logistic Regression


##### Building Logistic Regression Model

glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, family = binomial)

summary(glm.fit)

# You look at the first 5 probabilities and they are very close to 50%:
glm.probs <- predict(glm.fit,type = "response")
glm.probs[1:5]

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")


attach(Smarket)
table(glm.pred,Direction)

mean(glm.pred == Direction)



##### Creating Training and Test Samples

# Make training and test set
train = Year<2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket,
               family = binomial,
               subset = train)

glm.probs <- predict(glm.fit,
                    newdata = Smarket[!train,],
                    type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")


Direction.2005 = Smarket$Direction[!train]
table(glm.pred, Direction.2005)

##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44

mean(glm.pred == Direction.2005)

## [1] 0.4801587
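
## That accuracy follows directly from the confusion matrix above:
## (77 + 44) correct predictions out of 77 + 97 + 34 + 44 = 252 test days, i.e. 121/252 ≈ 0.4802.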



##### Solving Overfitting

# Fit a smaller model
glm.fit = glm(Direction ~ Lag1 + Lag2 + Lag3, data = Smarket, family = binomial, subset = train)
glm.probs = predict(glm.fit, newdata = Smarket[!train,], type = "response")
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.2005)

##         Direction.2005
## glm.pred Down  Up
##     Down   39  31
##     Up     72 110

mean(glm.pred == Direction.2005)

## [1] 0.5912698



summary(glm.fit)

##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3, family = binomial,
##     data = Smarket, subset = train)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max
## -1.338  -1.189   1.072   1.163   1.335
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.032230   0.063377   0.509    0.611
## Lag1        -0.055523   0.051709  -1.074    0.283
## Lag2        -0.044300   0.051674  -0.857    0.391
## Lag3         0.008815   0.051495   0.171    0.864
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 994  degrees of freedom
## AIC: 1389.4
##
## Number of Fisher Scoring iterations: 3



R: Regularization and Logistic Regression


0_runme.txt

########## R: L1 Regularization for Logistic Regression

##### Run this script on your R Console


##### Background
#
# We use machine learning models to learn from training data.
# A trained machine learning model is expected to predict reliably even on new data (i.e., data different from the training data above).
# If the machine learning model over-fits the training data (including noise and outliers),
# the model's prediction accuracy on new data can suffer.
# This is because the model learns noise, outliers, and the genuinely meaningful data points alike, and regards the entire training data as meaningful.
# In other words, the model is overly optimized so that it can also explain the noise and outliers.
#
# The main reasons for over-fitting are (1) too few data points, (2) too many explanatory variables, and (3) excessively large parameters (coefficients).
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
#     L1 (Lasso): the penalty term is the sum of the absolute parameter values of the model.
#                 By driving some weights to exactly 0, it drops unnecessary features.
#                 "Dimension compression that deletes unnecessary explanatory variables"
#     L2 (Ridge): the penalty term is the sum of the squared parameter values of the model.
#                 This yields a smoother model.
#                 "More accurate prediction while avoiding over-fitting"
# Under both L1 regularization and L2 regularization,
# simpler models (smaller parameters, fewer effective dimensions) incur a smaller penalty.
# If the training data contain exceptional points such as noise and outliers,
# a model would have to increase its complexity (dimensions) to explain them,
# and the penalty term discourages exactly that increase.
# (L1 and L2 can also be combined as a linear sum; this is elastic net regularization.)
#
#
# Regression:
#     A certain objective variable Y is predicted by using weighted explanatory variables X {x0, x1, x2, ..., xn}
#     Predicted Y = hθ(X) = θ0 * x0 + θ1 * x1 + ... + θn * xn = θT X
#
# Logistic regression:
#     Generally, hθ(X) above is a continuous value without any upper and lower boundaries.
#     To make 0 ≤ hθ(X) ≤ 1,
#     Logistic Function (AKA Sigmoid Function) g(z) = 1/(1 + e^(−z))
#     When doing logistic regressions,
#     hθ(X) = 1/(1 + e^(−θT X))
#
#     hθ(x)≥0.5, then Y = 1
#     hθ(x)<0.5, then Y = 0


# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/yoshi/Dropbox/Google Drive/Coding/R/regularization_and_logistic_regression/')

# source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_quick_start.r')

source('4_logistic_regression_binominal.r')

source('5_logistic_regression_multinominal.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz')
#tgz file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html

# Reference
#http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html



# Download files
#
# QuickStartExample.RData
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData
#
# BinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
#
# MultinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData



2_library.r
########## library setting

library('glmnet')



3_quick_start.r
# Download QuickStartExample.RData from the following GitHub site.
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData

load("QuickStartExample.RData")

fit = glmnet(x, y)


##### Start: saving as a png file
png("fig_3_quick_start_1.png")

#draw a figure
plot(fit)
# Each curve corresponds to a variable.
# It shows the path of its coefficient against the ℓ1-norm of the whole coefficient vector as λ varies.
# (The tuning parameter λ controls the overall strength of the penalty.)

dev.off()
##### End: saving as a png file


#A summary of the glmnet path at each step is displayed if we just enter the object name or use the print function:
print(fit)


coef(fit,s=0.1)


nx = matrix(rnorm(10*20),10,20)
predict(fit,newx=nx,s=c(0.1,0.05))


cvfit = cv.glmnet(x, y)



##### Start: saving as a png file
png("fig_3_quick_start_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file


cvfit$lambda.min


coef(cvfit, s = "lambda.min")


predict(cvfit, newx = x[1:5,], s = "lambda.min")


4_logistic_regression_binominal.r
# Download from https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
load("BinomialExample.RData")

fit = glmnet(x, y, family = "binomial")


##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_1.png")

plot(fit, xvar = "dev", label = TRUE)

dev.off()
##### End: saving as a png file


predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))


cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")


##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file


cvfit$lambda.min


cvfit$lambda.1se


coef(cvfit, s = "lambda.min")


predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")





5_logistic_regression_multinominal.r
# Download from https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData
load("MultinomialExample.RData")


fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_1.png")

plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")

dev.off()
##### End: saving as a png file


cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped", parallel = TRUE)

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")





Figures





