
Saturday, July 13, 2019

Python: Linear Regression and Regularized Logistic Regression


0_runme.txt

##### Open Terminal on Mac OS

# Run the following commands on Terminal.

#Current working directory
pwd
#For instance, your MacOS environment goes like this:
#/Users/xxx/


#Change your working directory
#cd /Users/xxx/Downloads


which python3
#For instance, my MacOS environment goes like this:
#/Library/Frameworks/Python.framework/Versions/3.7/bin/python3


which pip3
#For instance, my MacOS environment goes like this:
#/Library/Frameworks/Python.framework/Versions/3.7/bin/pip3


#Run after connecting to the Internet
pip3 install matplotlib
pip3 install sklearn
pip3 install dtreeviz
pip3 install IPython

pip3 install pandas
pip3 install scipy



python3 -V
#For instance, my MacOS environment goes like this:
#Python 3.7.4


#Starting Python 3
#python3
#You do not need this here, since you're running the .py scripts from Terminal rather than interactively in the Python shell (IDLE).



#Download files:
#ex1data1.txt
#ex1data2.txt
#ex2data1.txt
#ex2data2.txt
#
#From:
#https://github.com/LilianYe/Andrew-Ng-Machine-Learning-Programming-solutions-/find/master


#Running Python py scripts
python3 1a_liner_regression_w_one_variable.py
python3 1b_liner_regression_w_multiple_variables.py

python3 2.1_logistic_regression_or_classification.py
python3 2.2_regularized_logistic_regression.py



1a_liner_regression_w_one_variable.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 1)
########## Linear Regression with One Variable
#
# Reference:
#https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-1-6b8dd1c73d80
#
#Here we will implement linear regression with one variable to predict profits for a food truck.
#
#ex1data1.txt
#contains the dataset for our linear regression exercise.
#column data
#1      population of a city
#2      profit of a food truck in that city


# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



###Reading and Plotting the data
data = pd.read_csv('ex1data1.txt', header = None) #read from dataset
X = data.iloc[:,0] # read first column
y = data.iloc[:,1] # read second column
m = len(y) # number of training example
data.head() # view first few rows of the data

plt.scatter(X, y)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.savefig('fig_1a.1.png')    # Save an image file
plt.show()


###Adding the intercept term
X = X[:,np.newaxis]    #(*1)
y = y[:,np.newaxis]    #(*1)
theta = np.zeros([2,1])    #set one initial parameter theta to 0
iterations = 1500
alpha = 0.01    # set another initial parameter, the learning rate alpha, to 0.01
ones = np.ones((m,1))
X = np.hstack((ones, X)) # adding the intercept term

#(*1)
#Note on np.newaxis:
# When you read the data into X and y, you will observe that they are rank-1 arrays.
#A rank-1 array has a shape of (m, ), whereas a rank-2 array has a shape of (m, 1).
#When operating on arrays it's good to convert rank-1 arrays to rank-2 arrays, because rank-1 arrays often give unexpected results.
#To convert a rank-1 array to a rank-2 array, use someArray[:,np.newaxis].
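
# A minimal sketch (not part of the original exercise) illustrating the two shapes:
a = np.array([1.0, 2.0, 3.0])      # rank-1 array: a.shape == (3,)
b = a[:, np.newaxis]               # rank-2 array: b.shape == (3, 1)
print(a.shape, b.shape)            # prints: (3,) (3, 1)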


###Computing the cost
def computeCost(X, y, theta):
    temp = np.dot(X, theta) - y
    return np.sum(np.power(temp, 2)) / (2*m)
J = computeCost(X, y, theta)
print(J)
#You should expect to see a cost of 32.07. More precisely, 32.072733877455676.


###Finding the optimal parameters using Gradient Descent
def gradientDescent(X, y, theta, alpha, iterations):
    for _ in range(iterations):
        temp = np.dot(X, theta) - y
        temp = np.dot(X.T, temp)
        theta = theta - (alpha/m) * temp
    return theta
theta = gradientDescent(X, y, theta, alpha, iterations)
print(theta)
#Expected theta values [-3.6303, 1.1664]
#Technically,
#[[-3.63029144]
# [ 1.16636235]]
#
# So, for instance, the first row of actual data goes like this.
# Population (10,000)    Profit ($10,000)
# 6.1101 17.592
#
# An estimated profit for this population is
# -3.6303 * 1 + 1.1664 * 6.1101 = 3.49652064
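#
# A quick numeric check of that estimate (a minimal sketch, not part of the original exercise),
# using the full-precision theta computed above:
print(np.dot(np.array([1, 6.1101]), theta))    # about 3.4963, i.e. a predicted profit of roughly $35,000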

#We now have the optimized value of theta. Use this value in the above cost function.

J = computeCost(X, y, theta)
print(J)
#It should give you a value of 4.483 (4.483388256587726) which is much better than 32.07



### Plot showing the best fit line
plt.scatter(X[:,1], y)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.plot(X[:,1], np.dot(X, theta))
plt.savefig('fig_1a.2.png')    # Save an image file
plt.show()


1b_liner_regression_w_multiple_variables.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 1)
########## Linear Regression with multiple variables
#(also called Multivariate Linear Regression)


# ex1data2.txt
# a training set of housing prices in Portland, Oregon
# column data
# 1      size of the house (in square feet)
# 2      number of bedrooms
# 3      price of the house


### import
import numpy as np
import pandas as pd


### data loading
data = pd.read_csv('ex1data2.txt', sep = ',', header = None)
X = data.iloc[:,0:2] # read first two columns into X
y = data.iloc[:,2] # read the third column into y
m = len(y) # no. of training samples
data.head()


### Feature Normalization
X = (X - np.mean(X))/np.std(X)


###Adding the intercept term and initializing parameters
ones = np.ones((m,1))
X = np.hstack((ones, X))
alpha = 0.01
num_iters = 400
theta = np.zeros((3,1))
y = y[:,np.newaxis]


###Computing the cost
def computeCostMulti(X, y, theta):
    temp = np.dot(X, theta) - y
    return np.sum(np.power(temp, 2)) / (2*m)
J = computeCostMulti(X, y, theta)
print(J)
#You should expect to see a cost of 65591548106.45744.


###Finding the optimal parameters using Gradient Descent
def gradientDescentMulti(X, y, theta, alpha, iterations):
    m = len(y)
    for _ in range(iterations):
        temp = np.dot(X, theta) - y
        temp = np.dot(X.T, temp)
        theta = theta - (alpha/m) * temp
    return theta
theta = gradientDescentMulti(X, y, theta, alpha, num_iters)
print(theta)
# your optimal parameters will be [[334302.06399328],[ 99411.44947359], [3267.01285407]]
# For instance, if you look at the sample data in the first row,
# 2104 3 399900
# first column's average: 2000.68085106383
# first column's SD: 794.70235353389
# second column's average: 3.17021276595745
# second column's SD: 0.7609818867801
#
# So, the first column (2104-2000.68085106383)/794.70235353389 = 0.1300 (SD)
# Then the second column (3-3.17021276595745)/0.7609818867801 = -0.2237 (SD)

# 334302.06399328 * 1 + 99411.44947359 * 0.1300 + 3267.01285407 * (-0.2237)
# = 346494.721649391 (estimated data) ~ 399,900 (actual data)
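
# A quick numeric check of that estimate (a minimal sketch, not part of the original exercise),
# plugging in the rounded normalized values above for the first training example (2104 sq ft, 3 bedrooms):
print(np.dot(np.array([1, 0.1300, -0.2237]), theta))    # about 346,495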


#We now have the optimized value of theta. Use this value in the above cost function.
J = computeCostMulti(X, y, theta)
print(J)
#This should give you a value of 2105448288.6292474 which is much better than 65591548106.45744



2.1_logistic_regression_or_classification.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 2.1)
########## Logistic Regression or Classification (part 2.1)
#
# Reference:
# https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-2-1-1a666f049ad6
#
#


##### Logistic Regression


### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt    # more on this later



### data
#ex2data1.txt
#column data
#1      Exam 1 score
#2      Exam 2 score
#3      0 (fail) or 1 (pass)

data = pd.read_csv('ex2data1.txt', header = None)
X = data.iloc[:,:-1]
y = data.iloc[:,2]
data.head()


### plotting

mask = y == 1
adm = plt.scatter(X[mask][0].values, X[mask][1].values)
not_adm = plt.scatter(X[~mask][0].values, X[~mask][1].values)
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend((adm, not_adm), ('Admitted', 'Not admitted'))
plt.savefig('fig_2.1.1.png')    # Save an image file
plt.show()


### Implementation

## Sigmoid Function

def sigmoid(x):
  return 1/(1+np.exp(-x))

## Cost Function
def costFunction(theta, X, y):
    J = (-1/m) * np.sum(np.multiply(y, np.log(sigmoid(X @ theta)))
        + np.multiply((1-y), np.log(1 - sigmoid(X @ theta))))
    return J


## Gradient Function
def gradient(theta, X, y):
    return ((1/m) * X.T @ (sigmoid(X @ theta) - y))
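
# For reference, the cost and gradient implemented above are (with h = sigmoid(X @ theta)):
#   J(theta) = -(1/m) * sum( y*log(h) + (1-y)*log(1-h) )
#   gradient =  (1/m) * X.T @ (h - y)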

(m, n) = X.shape
X = np.hstack((np.ones((m,1)), X))
y = y[:, np.newaxis]
theta = np.zeros((n+1,1)) # intializing theta with all zeros
J = costFunction(theta, X, y)
print(J)
# This should give us a value of 0.693 for J.
# More precisely, 0.6931471805599453 (= ln 2: with theta = 0, every prediction is sigmoid(0) = 0.5, so the cost is -log(0.5) = ln 2).


### Learning parameters using fmin_tnc
temp = opt.fmin_tnc(func = costFunction,
                    x0 = theta.flatten(),fprime = gradient,
                    args = (X, y.flatten()))
#the output of above function is a tuple whose first element #contains the optimized values of theta
theta_optimized = temp[0]
print(theta_optimized)
# The above code should give [-25.16131862, 0.20623159, 0.20147149].

J = costFunction(theta_optimized[:,np.newaxis], X, y)
print(J)
#You should see a value of 0.203 (0.20349770158947486).
# Compare this with the cost 0.693 obtained using initial theta.


### Plotting Decision Boundary (Optional)
# The boundary is the line where theta0 + theta1*x1 + theta2*x2 = 0, i.e. x2 = -(theta0 + theta1*x1)/theta2.
plot_x = [np.min(X[:,1]) - 2, np.max(X[:,1]) + 2]
plot_y = -1/theta_optimized[2]*(theta_optimized[0]
          + np.dot(theta_optimized[1],plot_x))
mask = y.flatten() == 1
adm = plt.scatter(X[mask][:,1], X[mask][:,2])
not_adm = plt.scatter(X[~mask][:,1], X[~mask][:,2])
decision_boun = plt.plot(plot_x, plot_y)
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend((adm, not_adm), ('Admitted', 'Not admitted'))
plt.savefig('fig_2.1.2.png')    # Save an image file
plt.show()


def accuracy(X, y, theta, cutoff):
    pred = sigmoid(np.dot(X, theta)) >= cutoff
    acc = np.mean(pred == y)
    print(acc * 100)
accuracy(X, y.flatten(), theta_optimized, 0.5)

# This should give us an accuracy score of 89% . Hmm… not bad.


2.2_regularized_logistic_regression.py
########## Python Implementation of Andrew Ng’s Machine Learning Course (Part 2.2)
########## Regularized Logistic Regression (part 2.2)
#
# Reference:
# https://medium.com/analytics-vidhya/python-implementation-of-andrew-ngs-machine-learning-course-part-2-2-dceff1a12a12
#
#


##### Regularized logistic regression


### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt    # more on this later


### data
#ex2data2.txt
#column data
#1      test 1 result
#2      test 2 result
#3      0 (rejected) or 1 (accepted)

data = pd.read_csv('ex2data2.txt', header = None)
X = data.iloc[:,:-1]
y = data.iloc[:,2]
data.head()


### plotting
mask = y == 1
passed = plt.scatter(X[mask][0].values, X[mask][1].values)
failed = plt.scatter(X[~mask][0].values, X[~mask][1].values)
plt.xlabel('Microchip Test1')
plt.ylabel('Microchip Test2')
plt.legend((passed, failed), ('Passed', 'Failed'))
plt.savefig('fig_2.2.1.png')    # Save an image file
plt.show()


### Feature mapping
def mapFeature(X1, X2):
    degree = 6
    out = np.ones(X1.shape[0])[:,np.newaxis]
    for i in range(1, degree+1):
        for j in range(i+1):
            out = np.hstack((out, np.multiply(np.power(X1, i-j), np.power(X2, j))[:,np.newaxis]))
    return out
X = mapFeature(X.iloc[:,0], X.iloc[:,1])
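
# After the mapping, each example has 28 columns: the bias term plus every term
# X1^(i-j) * X2^j with 1 <= i <= 6, 0 <= j <= i. A quick check (a minimal sketch,
# not part of the original exercise):
print(X.shape)    # expected: (118, 28) for ex2data2.txt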


### Implementation

## Sigmoid Function
def sigmoid(x):
  return 1/(1+np.exp(-x))


## Cost Function
def lrCostFunction(theta_t, X_t, y_t, lambda_t):
    m = len(y_t)
    J = (-1/m) * (y_t.T @ np.log(sigmoid(X_t @ theta_t)) + (1 - y_t.T) @ np.log(1 - sigmoid(X_t @ theta_t)))
    reg = (lambda_t/(2*m)) * (theta_t[1:].T @ theta_t[1:])
    J = J + reg
    return J

## Gradient Function
def lrGradientDescent(theta, X, y, lambda_t):
    m = len(y)
    grad = (1/m) * X.T @ (sigmoid(X @ theta) - y)
    grad[1:] = grad[1:] + (lambda_t / m) * theta[1:]
    return grad
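
# For reference, the regularized cost and gradient implemented above are (with h = sigmoid(X @ theta)):
#   J(theta) = -(1/m) * [ y.T @ log(h) + (1-y).T @ log(1-h) ] + (lambda/(2m)) * sum_{j>=1} theta_j^2
#   grad_j   =  (1/m) * X[:,j].T @ (h - y) + (lambda/m) * theta_j    for j >= 1
#   grad_0   =  (1/m) * X[:,0].T @ (h - y)                           (the intercept theta_0 is not regularized)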

(m, n) = X.shape
y = y[:, np.newaxis]
theta = np.zeros((n,1))
lmbda = 1
J = lrCostFunction(theta, X, y, lmbda)
print(J)
#This gives us a value of 0.69314718 (ln 2 again, since theta is initialized to all zeros).


## Learning parameters using fmin_tnc
output = opt.fmin_tnc(func = lrCostFunction, x0 = theta.flatten(), fprime = lrGradientDescent, \
                         args = (X, y.flatten(), lmbda))
theta = output[0]
print(theta) # theta contains the optimized values


## Accuracy of model
pred = sigmoid(np.dot(X, theta)) >= 0.5
print(np.mean(pred == y.flatten()) * 100)
# This gives our model accuracy as 83.05% (83.05084745762711).


## Plotting Decision Boundary (optional)
u = np.linspace(-1, 1.5, 50)
v = np.linspace(-1, 1.5, 50)
z = np.zeros((len(u), len(v)))
def mapFeatureForPlotting(X1, X2):
    degree = 6
    out = np.ones(1)
    for i in range(1, degree+1):
        for j in range(i+1):
            out = np.hstack((out, np.multiply(np.power(X1, i-j), np.power(X2, j))))
    return out
for i in range(len(u)):
    for j in range(len(v)):
        z[i,j] = np.dot(mapFeatureForPlotting(u[i], v[j]), theta)
mask = y.flatten() == 1
X = data.iloc[:,:-1]
passed = plt.scatter(X[mask][0], X[mask][1])
failed = plt.scatter(X[~mask][0], X[~mask][1])
plt.contour(u,v,z,0)
plt.xlabel('Microchip Test1')
plt.ylabel('Microchip Test2')
plt.legend((passed, failed), ('Passed', 'Failed'))
plt.savefig('fig_2.2.2.png')    # Save an image file
plt.show()



data

ex1data1.txt
6.1101,17.592
5.5277,9.1302
8.5186,13.662
7.0032,11.854
5.8598,6.8233
8.3829,11.886
7.4764,4.3483
8.5781,12
6.4862,6.5987
5.0546,3.8166
5.7107,3.2522
14.164,15.505
5.734,3.1551
8.4084,7.2258
5.6407,0.71618
5.3794,3.5129
6.3654,5.3048
5.1301,0.56077
6.4296,3.6518
7.0708,5.3893
6.1891,3.1386
20.27,21.767
5.4901,4.263
6.3261,5.1875
5.5649,3.0825
18.945,22.638
12.828,13.501
10.957,7.0467
13.176,14.692
22.203,24.147
5.2524,-1.22
6.5894,5.9966
9.2482,12.134
5.8918,1.8495
8.2111,6.5426
7.9334,4.5623
8.0959,4.1164
5.6063,3.3928
12.836,10.117
6.3534,5.4974
5.4069,0.55657
6.8825,3.9115
11.708,5.3854
5.7737,2.4406
7.8247,6.7318
7.0931,1.0463
5.0702,5.1337
5.8014,1.844
11.7,8.0043
5.5416,1.0179
7.5402,6.7504
5.3077,1.8396
7.4239,4.2885
7.6031,4.9981
6.3328,1.4233
6.3589,-1.4211
6.2742,2.4756
5.6397,4.6042
9.3102,3.9624
9.4536,5.4141
8.8254,5.1694
5.1793,-0.74279
21.279,17.929
14.908,12.054
18.959,17.054
7.2182,4.8852
8.2951,5.7442
10.236,7.7754
5.4994,1.0173
20.341,20.992
10.136,6.6799
7.3345,4.0259
6.0062,1.2784
7.2259,3.3411
5.0269,-2.6807
6.5479,0.29678
7.5386,3.8845
5.0365,5.7014
10.274,6.7526
5.1077,2.0576
5.7292,0.47953
5.1884,0.20421
6.3557,0.67861
9.7687,7.5435
6.5159,5.3436
8.5172,4.2415
9.1802,6.7981
6.002,0.92695
5.5204,0.152
5.0594,2.8214
5.7077,1.8451
7.6366,4.2959
5.8707,7.2029
5.3054,1.9869
8.2934,0.14454
13.394,9.0551
5.4369,0.61705

ex1data2.txt
2104,3,399900
1600,3,329900
2400,3,369000
1416,2,232000
3000,4,539900
1985,4,299900
1534,3,314900
1427,3,198999
1380,3,212000
1494,3,242500
1940,4,239999
2000,3,347000
1890,3,329999
4478,5,699900
1268,3,259900
2300,4,449900
1320,2,299900
1236,3,199900
2609,4,499998
3031,4,599000
1767,3,252900
1888,2,255000
1604,3,242900
1962,4,259900
3890,3,573900
1100,3,249900
1458,3,464500
2526,3,469000
2200,3,475000
2637,3,299900
1839,2,349900
1000,1,169900
2040,4,314900
3137,3,579900
1811,4,285900
1437,3,249900
1239,3,229900
2132,4,345000
4215,4,549000
2162,4,287000
1664,2,368500
2238,3,329900
2567,4,314000
1200,3,299000
852,2,179900
1852,4,299900
1203,3,239500
ex2data1.txt
34.62365962451697,78.0246928153624,0
30.28671076822607,43.89499752400101,0
35.84740876993872,72.90219802708364,0
60.18259938620976,86.30855209546826,1
79.0327360507101,75.3443764369103,1
45.08327747668339,56.3163717815305,0
61.10666453684766,96.51142588489624,1
75.02474556738889,46.55401354116538,1
76.09878670226257,87.42056971926803,1
84.43281996120035,43.53339331072109,1
95.86155507093572,38.22527805795094,0
75.01365838958247,30.60326323428011,0
82.30705337399482,76.48196330235604,1
69.36458875970939,97.71869196188608,1
39.53833914367223,76.03681085115882,0
53.9710521485623,89.20735013750205,1
69.07014406283025,52.74046973016765,1
67.94685547711617,46.67857410673128,0
70.66150955499435,92.92713789364831,1
76.97878372747498,47.57596364975532,1
67.37202754570876,42.83843832029179,0
89.67677575072079,65.79936592745237,1
50.534788289883,48.85581152764205,0
34.21206097786789,44.20952859866288,0
77.9240914545704,68.9723599933059,1
62.27101367004632,69.95445795447587,1
80.1901807509566,44.82162893218353,1
93.114388797442,38.80067033713209,0
61.83020602312595,50.25610789244621,0
38.78580379679423,64.99568095539578,0
61.379289447425,72.80788731317097,1
85.40451939411645,57.05198397627122,1
52.10797973193984,63.12762376881715,0
52.04540476831827,69.43286012045222,1
40.23689373545111,71.16774802184875,0
54.63510555424817,52.21388588061123,0
33.91550010906887,98.86943574220611,0
64.17698887494485,80.90806058670817,1
74.78925295941542,41.57341522824434,0
34.1836400264419,75.2377203360134,0
83.90239366249155,56.30804621605327,1
51.54772026906181,46.85629026349976,0
94.44336776917852,65.56892160559052,1
82.36875375713919,40.61825515970618,0
51.04775177128865,45.82270145776001,0
62.22267576120188,52.06099194836679,0
77.19303492601364,70.45820000180959,1
97.77159928000232,86.7278223300282,1
62.07306379667647,96.76882412413983,1
91.56497449807442,88.69629254546599,1
79.94481794066932,74.16311935043758,1
99.2725269292572,60.99903099844988,1
90.54671411399852,43.39060180650027,1
34.52451385320009,60.39634245837173,0
50.2864961189907,49.80453881323059,0
49.58667721632031,59.80895099453265,0
97.64563396007767,68.86157272420604,1
32.57720016809309,95.59854761387875,0
74.24869136721598,69.82457122657193,1
71.79646205863379,78.45356224515052,1
75.3956114656803,85.75993667331619,1
35.28611281526193,47.02051394723416,0
56.25381749711624,39.26147251058019,0
30.05882244669796,49.59297386723685,0
44.66826172480893,66.45008614558913,0
66.56089447242954,41.09209807936973,0
40.45755098375164,97.53518548909936,1
49.07256321908844,51.88321182073966,0
80.27957401466998,92.11606081344084,1
66.74671856944039,60.99139402740988,1
32.72283304060323,43.30717306430063,0
64.0393204150601,78.03168802018232,1
72.34649422579923,96.22759296761404,1
60.45788573918959,73.09499809758037,1
58.84095621726802,75.85844831279042,1
99.82785779692128,72.36925193383885,1
47.26426910848174,88.47586499559782,1
50.45815980285988,75.80985952982456,1
60.45555629271532,42.50840943572217,0
82.22666157785568,42.71987853716458,0
88.9138964166533,69.80378889835472,1
94.83450672430196,45.69430680250754,1
67.31925746917527,66.58935317747915,1
57.23870631569862,59.51428198012956,1
80.36675600171273,90.96014789746954,1
68.46852178591112,85.59430710452014,1
42.0754545384731,78.84478600148043,0
75.47770200533905,90.42453899753964,1
78.63542434898018,96.64742716885644,1
52.34800398794107,60.76950525602592,0
94.09433112516793,77.15910509073893,1
90.44855097096364,87.50879176484702,1
55.48216114069585,35.57070347228866,0
74.49269241843041,84.84513684930135,1
89.84580670720979,45.35828361091658,1
83.48916274498238,48.38028579728175,1
42.2617008099817,87.10385094025457,1
99.31500880510394,68.77540947206617,1
55.34001756003703,64.9319380069486,1
74.77589300092767,89.52981289513276,1
ex2data2.txt
0.051267,0.69956,1
-0.092742,0.68494,1
-0.21371,0.69225,1
-0.375,0.50219,1
-0.51325,0.46564,1
-0.52477,0.2098,1
-0.39804,0.034357,1
-0.30588,-0.19225,1
0.016705,-0.40424,1
0.13191,-0.51389,1
0.38537,-0.56506,1
0.52938,-0.5212,1
0.63882,-0.24342,1
0.73675,-0.18494,1
0.54666,0.48757,1
0.322,0.5826,1
0.16647,0.53874,1
-0.046659,0.81652,1
-0.17339,0.69956,1
-0.47869,0.63377,1
-0.60541,0.59722,1
-0.62846,0.33406,1
-0.59389,0.005117,1
-0.42108,-0.27266,1
-0.11578,-0.39693,1
0.20104,-0.60161,1
0.46601,-0.53582,1
0.67339,-0.53582,1
-0.13882,0.54605,1
-0.29435,0.77997,1
-0.26555,0.96272,1
-0.16187,0.8019,1
-0.17339,0.64839,1
-0.28283,0.47295,1
-0.36348,0.31213,1
-0.30012,0.027047,1
-0.23675,-0.21418,1
-0.06394,-0.18494,1
0.062788,-0.16301,1
0.22984,-0.41155,1
0.2932,-0.2288,1
0.48329,-0.18494,1
0.64459,-0.14108,1
0.46025,0.012427,1
0.6273,0.15863,1
0.57546,0.26827,1
0.72523,0.44371,1
0.22408,0.52412,1
0.44297,0.67032,1
0.322,0.69225,1
0.13767,0.57529,1
-0.0063364,0.39985,1
-0.092742,0.55336,1
-0.20795,0.35599,1
-0.20795,0.17325,1
-0.43836,0.21711,1
-0.21947,-0.016813,1
-0.13882,-0.27266,1
0.18376,0.93348,0
0.22408,0.77997,0
0.29896,0.61915,0
0.50634,0.75804,0
0.61578,0.7288,0
0.60426,0.59722,0
0.76555,0.50219,0
0.92684,0.3633,0
0.82316,0.27558,0
0.96141,0.085526,0
0.93836,0.012427,0
0.86348,-0.082602,0
0.89804,-0.20687,0
0.85196,-0.36769,0
0.82892,-0.5212,0
0.79435,-0.55775,0
0.59274,-0.7405,0
0.51786,-0.5943,0
0.46601,-0.41886,0
0.35081,-0.57968,0
0.28744,-0.76974,0
0.085829,-0.75512,0
0.14919,-0.57968,0
-0.13306,-0.4481,0
-0.40956,-0.41155,0
-0.39228,-0.25804,0
-0.74366,-0.25804,0
-0.69758,0.041667,0
-0.75518,0.2902,0
-0.69758,0.68494,0
-0.4038,0.70687,0
-0.38076,0.91886,0
-0.50749,0.90424,0
-0.54781,0.70687,0
0.10311,0.77997,0
0.057028,0.91886,0
-0.10426,0.99196,0
-0.081221,1.1089,0
0.28744,1.087,0
0.39689,0.82383,0
0.63882,0.88962,0
0.82316,0.66301,0
0.67339,0.64108,0
1.0709,0.10015,0
-0.046659,-0.57968,0
-0.23675,-0.63816,0
-0.15035,-0.36769,0
-0.49021,-0.3019,0
-0.46717,-0.13377,0
-0.28859,-0.060673,0
-0.61118,-0.067982,0
-0.66302,-0.21418,0
-0.59965,-0.41886,0
-0.72638,-0.082602,0
-0.83007,0.31213,0
-0.72062,0.53874,0
-0.59389,0.49488,0
-0.48445,0.99927,0
-0.0063364,0.99927,0
0.63265,-0.030612,0


After running the Python scripts above, you'll get these png files in your working directory:






Saturday, July 6, 2019

R: Logistic Regression (OLD)

data.csv
number,age,blood_pressure,lung_capacity,sex,illness,weight
1,22,110,4300,M,1,79
2,23,128,4500,M,1,65
3,24,104,3900,F,0,53
4,25,112,3000,F,0,45
5,27,108,4800,M,0,80
6,28,126,3800,F,0,50
7,28,126,3800,F,1,43
8,29,104,4000,F,1,55
9,30,125,3600,F,1,47
10,31,120,3400,F,1,49
11,32,116,3600,M,1,64
12,32,124,3900,M,0,61
13,33,106,3100,F,0,48
14,33,134,2900,F,0,41
15,34,128,4100,M,1,70
16,36,128,3420,M,1,55
17,37,116,3800,M,1,70
18,37,132,4150,M,1,90
19,38,134,2700,F,0,39
20,39,116,4550,M,1,86
21,40,120,2900,F,1,50
22,42,130,3950,F,1,65
23,46,126,3100,M,0,58
24,49,140,3000,F,0,45
25,50,156,3400,M,1,60
26,53,124,3400,M,1,71
27,56,118,3470,M,1,62
28,58,144,2800,M,0,51
29,64,142,2500,F,1,40
30,65,144,2350,F,0,42


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console


##### Background
#
# We use machine learning models to learn from training data.
# A trained machine learning model is expected to predict reliably even on new data (i.e., data different from the training data above).
# If the machine learning model over-fits the training data (including noise and outliers),
# the model's prediction accuracy on new data can suffer.
# This is because the model learns noise, outliers, and the genuinely meaningful data points alike, and regards the entire training data as meaningful.
# In other words, the model is overly optimized so that it can also explain the noise and outliers.
#
# The main reasons for over-fitting are (1) too few data points, (2) too many explanatory variables, and (3) excessively large parameters (coefficients).
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
#     L1 (Lasso): the penalty term is the sum of the absolute parameter values of the model.
#                 By driving some weights to exactly 0, it drops unnecessary features.
#                 "Dimension compression that deletes unnecessary explanatory variables"
#     L2 (Ridge): the penalty term is the sum of the squared parameter values of the model.
#                 This yields a smoother model.
#                 "More accurate prediction while avoiding over-fitting"
# Under both L1 regularization and L2 regularization,
# simpler models (smaller parameters, fewer effective dimensions) incur a smaller penalty.
# If the training data contain exceptional points such as noise and outliers,
# a model would have to increase its complexity (dimensions) to explain them,
# and the penalty term discourages exactly that increase.
# (L1 and L2 can also be combined as a linear sum; this is elastic net regularization.)
#
#
# Regression:
#     A certain objective variable Y is predicted by using weighted explanatory variables X {x0, x1, x2, ..., xn}
#     Predicted Y = hθ(X) = θ0 * x0 + θ1 * x1 + ... + θn * xn = θT X
#
# Logistic regression:
#     Generally, hθ(X) above is a continuous value without any upper and lower boundaries.
#     To make 0 ≤ hθ(X) ≤ 1,
#     Logistic Function (AKA Sigmoid Function) g(z) = 1/(1 + e^(−z))
#     When doing logistic regressions,
#     hθ(X) = 1/(1 + e^(−θT X))
#
#     hθ(x)≥0.5, then Y = 1
#     hθ(x)<0.5, then Y = 0
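
# In formulas (a rough sketch; lambda is the regularization strength, and the intercept θ0 is usually not penalized):
#     L1 (Lasso) penalty:  lambda * ( |θ1| + |θ2| + ... + |θn| )
#     L2 (Ridge) penalty:  lambda * ( θ1^2 + θ2^2 + ... + θn^2 )
#     Elastic net: a weighted linear combination of the two penalties above.
# The chosen penalty is added to the model's loss (e.g., the logistic-regression cost) before it is minimized.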


# Set your working directory on your R Console
##### The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/XXX/Downloads/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

#source('2_library.r')

source('3_logi_fun.r')

source('4_logistic_regression.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

#install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz')
#tgz file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html


2_library.r
########## library setting

#library('glmnet')


3_logi_fun.r
logi_fun <- function(data, file, disease){
  ans <- glm(data$Y ~ ., data = data, family = binomial) # family = binomial for logistic regression
  s.ans <- summary(ans)
  coe <- s.ans$coefficient
  RR <- exp(coe[,1])
  RRlow <- exp(coe[,1] - 1.96*coe[,2])
  RRup <- exp(coe[,1] + 1.96*coe[,2])
  N <- nrow(data)
  aic <- AIC(ans)
  result <- cbind(coe, RR, RRlow, RRup, aic, N)
  colnames(result)[6:7] <- c("RR95%CI.low", "RR95%CI.up")
  if(nrow(result) >= 2){
    result[2:nrow(result), 8:9] <- ""
  }
  write.table(disease, file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(matrix(c("", colnames(result)), nrow=1), file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(result, file, append=T, quote=F, sep=",", row.names=T, col.names=F)
  write.table("", file, append=T, quote=F, sep=",", row.names=F, col.names=F)
}


4_logistic_regression.r
df <- read.csv("data.csv",header=T,row.names=1)

dat <- df[,c(5,1,2,6)]
# 5th column (illness): explained (target) variable
# 1(age),2(blood_pressure),6(weight): explanatory variables

colnames(dat)[1] <- "Y"

logi_fun(dat,"results_logistic_reg.csv","illness")
# See this csv file.
# If the significance level is 0.05, then only "weight" has a Pr (p-value) below 0.05 (0.038665).
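# For example, for "weight" the RR column is exp(0.08019) ≈ 1.0835,
# i.e., each additional kilogram of body weight multiplies the estimated odds of illness by about 1.08
# (95% CI roughly 1.004 to 1.169, per the RR95%CI.low / RR95%CI.up columns).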





results_logistic_reg.csv
illness
,Estimate,Std. Error,z value,Pr(>|z|),RR,RR95%CI.low,RR95%CI.up,aic,N
(Intercept),-6.27037164366909,5.6269544356187,-1.11434555147231,0.265130972267466,0.00189152547909671,3.06938865463388e-08,116.566164818212,42.6982377386664,30
age,0.00171984691269398,0.0447696844915921,0.0384154351817462,0.969356454570465,1.00172132669761,0.91756786481243,1.09359280641977,,
blood_pressure,0.016973167573557,0.0445691192249947,0.380827978400756,0.703330897109487,1.01711803021436,0.932037428183773,1.10996517532894,,
weight,0.0801901371302698,0.0387817328646643,2.06772960378298,0.038665456531477,1.08349306035207,1.004186680477,1.16906271976586,,





R: Logistic Regression (without regularization) - simple sample

data1.csv
y,x1,x2
0,3.6,60.1
1,4.1,52.0
0,3.7,62.5
0,4.9,60.6
1,4.4,54.1
0,3.6,63.6
1,4.9,68.0
0,4.8,38.2
1,4.1,59.5
0,4.3,47.3

binary.csv
admit,gre,gpa,rank
0,380,3.61,3
1,660,3.67,3
1,800,4,1
1,640,3.19,4
0,520,2.93,4
1,760,3,2
1,560,2.98,1
0,400,3.08,2
1,540,3.39,3
0,700,3.92,2
0,800,4,4
0,440,3.22,1
1,760,4,1
0,700,3.08,2
1,700,4,1
0,480,3.44,3
0,780,3.87,4
0,360,2.56,3
0,800,3.75,2
1,540,3.81,1
0,500,3.17,3
1,660,3.63,2
0,600,2.82,4
0,680,3.19,4
1,760,3.35,2
1,800,3.66,1
1,620,3.61,1
1,520,3.74,4
1,780,3.22,2
0,520,3.29,1
0,540,3.78,4
0,760,3.35,3
0,600,3.4,3
1,800,4,3
0,360,3.14,1
0,400,3.05,2
0,580,3.25,1
0,520,2.9,3
1,500,3.13,2
1,520,2.68,3
0,560,2.42,2
1,580,3.32,2
1,600,3.15,2
0,500,3.31,3
0,700,2.94,2
1,460,3.45,3
1,580,3.46,2
0,500,2.97,4
0,440,2.48,4
0,400,3.35,3
0,640,3.86,3
0,440,3.13,4
0,740,3.37,4
1,680,3.27,2
0,660,3.34,3
1,740,4,3
0,560,3.19,3
0,380,2.94,3
0,400,3.65,2
0,600,2.82,4
1,620,3.18,2
0,560,3.32,4
0,640,3.67,3
1,680,3.85,3
0,580,4,3
0,600,3.59,2
0,740,3.62,4
0,620,3.3,1
0,580,3.69,1
0,800,3.73,1
0,640,4,3
0,300,2.92,4
0,480,3.39,4
0,580,4,2
0,720,3.45,4
0,720,4,3
0,560,3.36,3
1,800,4,3
0,540,3.12,1
1,620,4,1
0,700,2.9,4
0,620,3.07,2
0,500,2.71,2
0,380,2.91,4
1,500,3.6,3
0,520,2.98,2
0,600,3.32,2
0,600,3.48,2
0,700,3.28,1
1,660,4,2
0,700,3.83,2
1,720,3.64,1
0,800,3.9,2
0,580,2.93,2
1,660,3.44,2
0,660,3.33,2
0,640,3.52,4
0,480,3.57,2
0,700,2.88,2
0,400,3.31,3
0,340,3.15,3
0,580,3.57,3
0,380,3.33,4
0,540,3.94,3
1,660,3.95,2
1,740,2.97,2
1,700,3.56,1
0,480,3.13,2
0,400,2.93,3
0,480,3.45,2
0,680,3.08,4
0,420,3.41,4
0,360,3,3
0,600,3.22,1
0,720,3.84,3
0,620,3.99,3
1,440,3.45,2
0,700,3.72,2
1,800,3.7,1
0,340,2.92,3
1,520,3.74,2
1,480,2.67,2
0,520,2.85,3
0,500,2.98,3
0,720,3.88,3
0,540,3.38,4
1,600,3.54,1
0,740,3.74,4
0,540,3.19,2
0,460,3.15,4
1,620,3.17,2
0,640,2.79,2
0,580,3.4,2
0,500,3.08,3
0,560,2.95,2
0,500,3.57,3
0,560,3.33,4
0,700,4,3
0,620,3.4,2
1,600,3.58,1
0,640,3.93,2
1,700,3.52,4
0,620,3.94,4
0,580,3.4,3
0,580,3.4,4
0,380,3.43,3
0,480,3.4,2
0,560,2.71,3
1,480,2.91,1
0,740,3.31,1
1,800,3.74,1
0,400,3.38,2
1,640,3.94,2
0,580,3.46,3
0,620,3.69,3
1,580,2.86,4
0,560,2.52,2
1,480,3.58,1
0,660,3.49,2
0,700,3.82,3
0,600,3.13,2
0,640,3.5,2
1,700,3.56,2
0,520,2.73,2
0,580,3.3,2
0,700,4,1
0,440,3.24,4
0,720,3.77,3
0,500,4,3
0,600,3.62,3
0,400,3.51,3
0,540,2.81,3
0,680,3.48,3
1,800,3.43,2
0,500,3.53,4
1,620,3.37,2
0,520,2.62,2
1,620,3.23,3
0,620,3.33,3
0,300,3.01,3
0,620,3.78,3
0,500,3.88,4
0,700,4,2
1,540,3.84,2
0,500,2.79,4
0,800,3.6,2
0,560,3.61,3
0,580,2.88,2
0,560,3.07,2
0,500,3.35,2
1,640,2.94,2
0,800,3.54,3
0,640,3.76,3
0,380,3.59,4
1,600,3.47,2
0,560,3.59,2
0,660,3.07,3
1,400,3.23,4
0,600,3.63,3
0,580,3.77,4
0,800,3.31,3
1,580,3.2,2
1,700,4,1
0,420,3.92,4
1,600,3.89,1
1,780,3.8,3
0,740,3.54,1
1,640,3.63,1
0,540,3.16,3
0,580,3.5,2
0,740,3.34,4
0,580,3.02,2
0,460,2.87,2
0,640,3.38,3
1,600,3.56,2
1,660,2.91,3
0,340,2.9,1
1,460,3.64,1
0,460,2.98,1
1,560,3.59,2
0,540,3.28,3
0,680,3.99,3
1,480,3.02,1
0,800,3.47,3
0,800,2.9,2
1,720,3.5,3
0,620,3.58,2
0,540,3.02,4
0,480,3.43,2
1,720,3.42,2
0,580,3.29,4
0,600,3.28,3
0,380,3.38,2
0,420,2.67,3
1,800,3.53,1
0,620,3.05,2
1,660,3.49,2
0,480,4,2
0,500,2.86,4
0,700,3.45,3
0,440,2.76,2
1,520,3.81,1
1,680,2.96,3
0,620,3.22,2
0,540,3.04,1
0,800,3.91,3
0,680,3.34,2
0,440,3.17,2
0,680,3.64,3
0,640,3.73,3
0,660,3.31,4
0,620,3.21,4
1,520,4,2
1,540,3.55,4
1,740,3.52,4
0,640,3.35,3
1,520,3.3,2
1,620,3.95,3
0,520,3.51,2
0,640,3.81,2
0,680,3.11,2
0,440,3.15,2
1,520,3.19,3
1,620,3.95,3
1,520,3.9,3
0,380,3.34,3
0,560,3.24,4
1,600,3.64,3
1,680,3.46,2
0,500,2.81,3
1,640,3.95,2
0,540,3.33,3
1,680,3.67,2
0,660,3.32,1
0,520,3.12,2
1,600,2.98,2
0,460,3.77,3
1,580,3.58,1
1,680,3,4
1,660,3.14,2
0,660,3.94,2
0,360,3.27,3
0,660,3.45,4
0,520,3.1,4
1,440,3.39,2
0,600,3.31,4
1,800,3.22,1
1,660,3.7,4
0,800,3.15,4
0,420,2.26,4
1,620,3.45,2
0,800,2.78,2
0,680,3.7,2
0,800,3.97,1
0,480,2.55,1
0,520,3.25,3
0,560,3.16,1
0,460,3.07,2
0,540,3.5,2
0,720,3.4,3
0,640,3.3,2
1,660,3.6,3
1,400,3.15,2
1,680,3.98,2
0,220,2.83,3
0,580,3.46,4
1,540,3.17,1
0,580,3.51,2
0,540,3.13,2
0,440,2.98,3
0,560,4,3
0,660,3.67,2
0,660,3.77,3
1,520,3.65,4
0,540,3.46,4
1,300,2.84,2
1,340,3,2
1,780,3.63,4
1,480,3.71,4
0,540,3.28,1
0,460,3.14,3
0,460,3.58,2
0,500,3.01,4
0,420,2.69,2
0,520,2.7,3
0,680,3.9,1
0,680,3.31,2
1,560,3.48,2
0,580,3.34,2
0,500,2.93,4
0,740,4,3
0,660,3.59,3
0,420,2.96,1
0,560,3.43,3
1,460,3.64,3
1,620,3.71,1
0,520,3.15,3
0,620,3.09,4
0,540,3.2,1
1,660,3.47,3
0,500,3.23,4
1,560,2.65,3
0,500,3.95,4
0,580,3.06,2
0,520,3.35,3
0,500,3.03,3
0,600,3.35,2
0,580,3.8,2
0,400,3.36,2
0,620,2.85,2
1,780,4,2
0,620,3.43,3
1,580,3.12,3
0,700,3.52,2
1,540,3.78,2
1,760,2.81,1
0,700,3.27,2
0,720,3.31,1
1,560,3.69,3
0,720,3.94,3
1,520,4,1
1,540,3.49,1
0,680,3.14,2
0,460,3.44,2
1,560,3.36,1
0,480,2.78,3
0,460,2.93,3
0,620,3.63,3
0,580,4,1
0,800,3.89,2
1,540,3.77,2
1,680,3.76,3
1,680,2.42,1
1,620,3.37,1
0,560,3.78,2
0,560,3.49,4
0,620,3.63,2
1,800,4,2
0,640,3.12,3
0,540,2.7,2
0,700,3.65,2
1,540,3.49,2
0,540,3.51,2
0,660,4,1
1,480,2.62,2
0,420,3.02,1
1,740,3.86,2
0,580,3.36,2
0,640,3.17,2
0,640,3.51,2
1,800,3.05,2
1,660,3.88,2
1,600,3.38,3
1,620,3.75,2
1,460,3.99,3
0,620,4,2
0,560,3.04,3
0,460,2.63,2
0,700,3.65,2
0,600,3.89,3


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
#setwd('/Users/XXX/Downloads/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

#source('2_library.r')

source('3_logistics_regression.r')

source('4_logistic_regression.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
#


2_library.r
#


3_logistics_regression.r
# data loading
data1 <- read.csv("data1.csv", header=TRUE)

# Logistic Regression
mylogit = glm(y ~ x1 + x2, data=data1,  family=binomial(link="logit"))

# Summary
summary(mylogit)
#=>
# Call:
# glm(formula = y ~ x1 + x2, family = binomial(link ="logit"), data = data1)
# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -1.4754  -0.8584  -0.8007   1.1905   1.5719

# Coefficients:
#             Estimate Std. Error z value Pr(>|z|)
# (Intercept) -9.44589    9.12237  -1.035    0.300
# x1           1.27158    1.49423   0.851    0.395
# x2           0.06424    0.08739   0.735    0.462

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 13.460  on 9  degrees of freedom
# Residual deviance: 12.345  on 7  degrees of freedom
# AIC: 18.345

# Number of Fisher Scoring iterations: 4


# See fitted numbers
fitted(mylogit)
#=>
#         1         2         3         4         5         6         7         8         9        10
# 0.2675020 0.2907201 0.3260736 0.6632579 0.4072105 0.3137831 0.7600997 0.2914609 0.3988916 0.2810008


# Fitted (predicted) values are calculated from the following equation:
# logit(π) = −9.44589 + 1.27158*x1 + 0.06424*x2
# However, as you can see from the Pr(>|z|) column in the summary above, none of these coefficients is statistically significant.
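#
# As a quick check with the rounded coefficients above, take the first row of data1.csv (x1 = 3.6, x2 = 60.1):
#   logit(π) = −9.44589 + 1.27158*3.6 + 0.06424*60.1 ≈ −1.0074
#   π = 1/(1 + e^(1.0074)) ≈ 0.2675, which matches the first fitted value (0.2675020) above.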


4_logistic_regression.r
# Data loading
#mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata <- read.csv("binary.csv")
mydata$rank <- factor(mydata$rank)

# Logistic Regression
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")

# Summary
summary(mylogit)
#=>
# Call:
# glm(formula = admit ~ gre + gpa + rank, family = "binomial", data = mydata)

# Deviance Residuals:
#     Min       1Q   Median       3Q      Max
# -1.6268  -0.8662  -0.6388   1.1490   2.0790

# Coefficients:
#              Estimate Std. Error z value Pr(>|z|)  
# (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
# gre          0.002264   0.001094   2.070 0.038465 *
# gpa          0.804038   0.331819   2.423 0.015388 *
# rank2       -0.675443   0.316490  -2.134 0.032829 *
# rank3       -1.340204   0.345306  -3.881 0.000104 ***
# rank4       -1.551464   0.417832  -3.713 0.000205 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# (Dispersion parameter for binomial family taken to be 1)

#     Null deviance: 499.98  on 399  degrees of freedom
# Residual deviance: 458.52  on 394  degrees of freedom
# AIC: 470.52

# Number of Fisher Scoring iterations: 4

newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response")
newdata2 <- with(mydata, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
    4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(mylogit, newdata = newdata2, type = "link",
    se = TRUE))
newdata3 <- within(newdata3, {
    PredictedProb <- plogis(fit)
    LL <- plogis(fit - (1.96 * se.fit))
    UL <- plogis(fit + (1.96 * se.fit))
})
# Using graphic library
library("ggplot2")
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
    ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
    size = 1)
# Save as an image file
ggsave("image.png")

#Reference
#https://stats.idre.ucla.edu/r/dae/logit-regression/




R: Logistic Regression


0_runme.txt

########## R: Logistic Regression

##### Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/yoshi/Dropbox/Google Drive/Coding/R/logistic_regression/')

#source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_data.r')

source('4_logistic_regression.r')

#Reference
#https://www.datacamp.com/community/tutorials/logistic-regression-R



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

install.packages("ISLR")
#install.packages('ISLR_1.2.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/ISLR/index.html

install.packages("Amelia")
#install.packages('Amelia_1.7.5.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/Amelia/index.html

install.packages("mlbench")
#install.packages('mlbench_2.1-1.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/mlbench/

install.packages("corrplot")
#install.packages('corrplot_0.84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/corrplot/

install.packages("caret")
#install.packages('caret_6.0-84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/caret/


2_library.r
########## library setting

library('ISLR')

library(Amelia)
library(mlbench)

library(corrplot)

library("caret")


3_data.r
#For this tutorial, you're going to work with the Smarket dataset within RStudio.
# The dataset shows daily percentage returns for the S&P 500 stock index between 2001 and 2005.


names(Smarket)
#[1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"      "Volume"    "Today"     "Direction"

head(Smarket)
#  Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
#1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
#2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
#3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
#4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
#5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
#6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up


summary(Smarket)
#      Year           Lag1                Lag2                Lag3                Lag4                Lag5              Volume           Today           Direction
# Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000   Down:602
# 1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648
# Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500   Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500            
# Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138            
# 3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750            
# Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000            


# Response variable: Direction, which shows whether the market went up or down relative to the previous day.



# Visualizing Data
#
# histogram

##### Start: saving as a png file
png("fig_3_data_1.png", width = 1600, height = 1600)

par(mfrow=c(1,8))
for(i in 1:8) {
    hist(Smarket[,i], main=names(Smarket)[i])
}

dev.off()
##### End: saving as a png file

# It's extremely hard to see, but most of the variables show a Gaussian or double Gaussian distribution.



##### Start: saving as a png file
png("fig_3_data_2.png", width = 1600, height = 1600)

par(mfrow=c(1,8))
for(i in 1:8) {
    boxplot(Smarket[,i], main=names(Smarket)[i])
}

dev.off()
##### End: saving as a png file



##### Start: saving as a png file
#png("fig_3_data_3.png", width = 1600, height = 1600)
png("fig_3_data_3.png")

missmap(Smarket, col=c("blue", "red"), legend=FALSE)

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_4.png", width = 1600, height = 1600)
png("fig_3_data_4.png")

correlations <- cor(Smarket[,1:8])
corrplot(correlations, method="circle")

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_5.png", width = 1600, height = 1600)
png("fig_3_data_5.png")

pairs(Smarket, col=Smarket$Direction)

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
#png("fig_3_data_6.png", width = 1600, height = 1600)
png("fig_3_data_6.png")

x <- Smarket[,1:8]
y <- Smarket[,9]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales)

dev.off()
##### End: saving as a png file



4_logistic_regression.r
########## Logistic Regression


##### Building Logistic Regression Model

glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, family = binomial)

summary(glm.fit)

# You look at the first 5 probabilities and they are very close to 50%:
glm.probs <- predict(glm.fit,type = "response")
glm.probs[1:5]

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")


attach(Smarket)
table(glm.pred,Direction)

mean(glm.pred == Direction)



##### Creating Training and Test Samples

# Make training and test set
train = Year<2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket,
               family = binomial,
               subset = train)

glm.probs <- predict(glm.fit,
                    newdata = Smarket[!train,],
                    type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")


Direction.2005 = Smarket$Direction[!train]
table(glm.pred, Direction.2005)

##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44

mean(glm.pred == Direction.2005)

## [1] 0.4801587
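
## That accuracy follows directly from the confusion matrix above:
## (77 + 44) correct predictions out of 77 + 97 + 34 + 44 = 252 test days, i.e. 121/252 ≈ 0.4802.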



##### Solving Overfitting

# Fit a smaller model
glm.fit = glm(Direction ~ Lag1 + Lag2 + Lag3, data = Smarket, family = binomial, subset = train)
glm.probs = predict(glm.fit, newdata = Smarket[!train,], type = "response")
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.2005)

##         Direction.2005
## glm.pred Down  Up
##     Down   39  31
##     Up     72 110

mean(glm.pred == Direction.2005)

## [1] 0.5912698



summary(glm.fit)

##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3, family = binomial,
##     data = Smarket, subset = train)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max
## -1.338  -1.189   1.072   1.163   1.335
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.032230   0.063377   0.509    0.611
## Lag1        -0.055523   0.051709  -1.074    0.283
## Lag2        -0.044300   0.051674  -0.857    0.391
## Lag3         0.008815   0.051495   0.171    0.864
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 994  degrees of freedom
## AIC: 1389.4
##
## Number of Fisher Scoring iterations: 3



R: Regularization and Logistic Regression


0_runme.txt

########## R: L1 Regularization for Logistic Regression

##### Run this script on your R Console


##### Background
#
# We use machine learning models to learn from training data.
# A trained machine learning model is expected to predict reliably even on new data (i.e., data different from the training data above).
# If the machine learning model over-fits the training data (including noise and outliers),
# the model's prediction accuracy on new data can suffer.
# This is because the model learns noise, outliers, and the genuinely meaningful data points alike, and regards the entire training data as meaningful.
# In other words, the model is overly optimized so that it can also explain the noise and outliers.
#
# The main reasons for over-fitting are (1) too few data points, (2) too many explanatory variables, and (3) excessively large parameters (coefficients).
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
#     L1 (Lasso): the penalty term is the sum of the absolute parameter values of the model.
#                 By driving some weights to exactly 0, it drops unnecessary features.
#                 "Dimension compression that deletes unnecessary explanatory variables"
#     L2 (Ridge): the penalty term is the sum of the squared parameter values of the model.
#                 This yields a smoother model.
#                 "More accurate prediction while avoiding over-fitting"
# Under both L1 regularization and L2 regularization,
# simpler models (smaller parameters, fewer effective dimensions) incur a smaller penalty.
# If the training data contain exceptional points such as noise and outliers,
# a model would have to increase its complexity (dimensions) to explain them,
# and the penalty term discourages exactly that increase.
# (L1 and L2 can also be combined as a linear sum; this is elastic net regularization.)
#
#
# Regression:
#     A certain objective variable Y is predicted by using weighted explanatory variables X {x0, x1, x2, ..., xn}
#     Predicted Y = hθ(X) = θ0 * x0 + θ1 * x1 + ... + θn * xn = θT X
#
# Logistic regression:
#     Generally, hθ(X) above is a continuous value without any upper and lower boundaries.
#     To make 0 ≤ hθ(X) ≤ 1,
#     Logistic Function (AKA Sigmoid Function) g(z) = 1/(1 + e^(−z))
#     When doing logistic regressions,
#     hθ(X) = 1/(1 + e^(−θT X))
#
#     hθ(x)≥0.5, then Y = 1
#     hθ(x)<0.5, then Y = 0


# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
setwd('/Users/yoshi/Dropbox/Google Drive/Coding/R/regularization_and_logistic_regression/')

# source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_quick_start.r')

source('4_logistic_regression_binominal.r')

source('5_logistic_regression_multinominal.r')



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz')
#tgz file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html

# Reference
#http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html



# Download files
#
# QuickStartExample.RData
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData
#
# BinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
#
# MultinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData



2_library.r
########## library setting

library('glmnet')



3_quick_start.r
# Download QuickStartExample.RData from the following GitHub site.
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData

load("QuickStartExample.RData")

fit = glmnet(x, y)


##### Start: saving as a png file
png("fig_3_quick_start_1.png")

#draw a figure
plot(fit)
# Each curve corresponds to a variable.
# It shows the path of its coefficient against the ℓ1-norm of the whole coefficient vector as λ varies.
# (The tuning parameter λ controls the overall strength of the penalty.)

dev.off()
##### End: saving as a png file


#A summary of the glmnet path at each step is displayed if we just enter the object name or use the print function:
print(fit)


coef(fit,s=0.1)


nx = matrix(rnorm(10*20),10,20)
predict(fit,newx=nx,s=c(0.1,0.05))


cvfit = cv.glmnet(x, y)



##### Start: saving as a png file
png("fig_3_quick_start_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file


cvfit$lambda.min


coef(cvfit, s = "lambda.min")


predict(cvfit, newx = x[1:5,], s = "lambda.min")


4_logistic_regression_binominal.r
# Download from https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
load("BinomialExample.RData")

fit = glmnet(x, y, family = "binomial")


##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_1.png")

plot(fit, xvar = "dev", label = TRUE)

dev.off()
##### End: saving as a png file


predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))


cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")


##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file


cvfit$lambda.min


cvfit$lambda.1se


coef(cvfit, s = "lambda.min")


predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")





5_logistic_regression_multinominal.r
# Download from https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData
load("MultinomialExample.RData")


fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_1.png")

plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")

dev.off()
##### End: saving as a png file


cvfit=cv.glmnet(x, y, family="multinomial", type.multinomial = "grouped", parallel = TRUE)

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_2.png")

plot(cvfit)

dev.off()
##### End: saving as a png file

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")





Figures





