AdSense

Saturday, July 6, 2019

R: Logistic Regression


0_runme.txt

########## R: Logistic Regression

##### Driver script: run this from your R console.

# Point the working directory at the folder that holds the numbered .r files.
# The path below is only a placeholder - replace it with your own directory.
setwd("/Users/yoshi/Dropbox/Google Drive/Coding/R/logistic_regression/")

# One-time setup: uncomment on the very first run to install the packages.
#source("1_install_packages.r")

# Load the packages, explore the data, then fit the models - in that order.
source("2_library.r")
source("3_data.r")
source("4_logistic_regression.r")

# Reference
# https://www.datacamp.com/community/tutorials/logistic-regression-R



Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5



1_install_packages.r
########## install packages

# Packages are installed one at a time, in the order listed.
# To install from a locally downloaded binary instead, use the matching
# commented-out call below.
#
# ISLR
#install.packages('ISLR_1.2.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/ISLR/index.html
#
# Amelia
#install.packages('Amelia_1.7.5.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/Amelia/index.html
#
# mlbench
#install.packages('mlbench_2.1-1.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/mlbench/
#
# corrplot
#install.packages('corrplot_0.84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/corrplot/
#
# caret
#install.packages('caret_6.0-84.tgz')
#tgz file downloaded from
#https://cran.r-project.org/web/packages/caret/

for (pkg in c("ISLR", "Amelia", "mlbench", "corrplot", "caret")) {
    install.packages(pkg)
}


2_library.r
########## library setting

# Attach every package the later scripts rely on.
library(ISLR)      # Smarket data set
library(Amelia)    # missmap()
library(mlbench)   # NOTE(review): no mlbench function is called in the scripts shown here
library(corrplot)  # corrplot()
library(caret)     # featurePlot()


3_data.r
#For this tutorial, you're going to work with the Smarket dataset within RStudio.
# The dataset shows daily percentage returns for the S&P 500 stock index between 2001 and 2005.


# Each result is wrapped in print(): this file is run through source(), and
# source() does not auto-print the value of top-level expressions.

# Column names of the data set.
print(names(Smarket))
#[1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"      "Volume"    "Today"     "Direction"

# First six rows.
print(head(Smarket))
#  Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
#1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
#2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
#3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
#4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
#5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
#6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up


# Per-column summary statistics (expected output is listed below).
print(summary(Smarket))
#      Year           Lag1                Lag2                Lag3                Lag4                Lag5              Volume           Today           Direction
# Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000   Down:602
# 1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648
# Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500   Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500            
# Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138            
# 3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750            
# Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000            


# Response variable: Direction - shows whether the market went up or down relative to the previous day.



# Visualizing Data
#
# histogram

##### Start: saving as a png file
png("fig_3_data_1.png", width = 1600, height = 1600)

# One histogram per numeric column (columns 1-8; column 9, Direction, is a
# factor), drawn side by side in a single row.
par(mfrow = c(1, 8))
for (i in seq_len(8)) {
    col_name <- names(Smarket)[i]
    # Set xlab explicitly: hist() would otherwise label every x-axis with the
    # literal R expression passed as its first argument.
    hist(Smarket[, i], main = col_name, xlab = col_name)
}

dev.off()
##### End: saving as a png file

# It's extremely hard to see, but most of the variables show a Gaussian or double Gaussian distribution.



##### Start: saving as a png file
png(filename = "fig_3_data_2.png", width = 1600, height = 1600)

# One box plot per numeric column (the first eight), all in a single row.
par(mfrow = c(1, 8))
for (col_name in names(Smarket)[seq_len(8)]) {
    boxplot(Smarket[[col_name]], main = col_name)
}

dev.off()
##### End: saving as a png file



##### Start: saving as a png file
# The default device size is used; the larger variant is kept for reference.
#png("fig_3_data_3.png", width = 1600, height = 1600)
png(filename = "fig_3_data_3.png")

# Missingness map over every column of the data set, drawn without a legend.
missmap(Smarket, legend = FALSE, col = c("blue", "red"))

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
# The default device size is used; the larger variant is kept for reference.
#png("fig_3_data_4.png", width = 1600, height = 1600)
png(filename = "fig_3_data_4.png")

# Pairwise correlations between the eight numeric columns, shown as circles.
correlations <- cor(Smarket[, seq_len(8)])
corrplot(correlations, method = "circle")

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
# The default device size is used; the larger variant is kept for reference.
#png("fig_3_data_5.png", width = 1600, height = 1600)
png(filename = "fig_3_data_5.png")

# Scatter-plot matrix of all columns; points are coloured by the Direction factor.
pairs(Smarket, col = Smarket[["Direction"]])

dev.off()
##### End: saving as a png file


##### Start: saving as a png file
# The default device size is used; the larger variant is kept for reference.
#png("fig_3_data_6.png", width = 1600, height = 1600)
png("fig_3_data_6.png")

# Predictors (first eight columns) and the class label (ninth column).
x <- Smarket[, 1:8]
y <- Smarket[, 9]
# Let every panel choose its own x and y axis ranges.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
# featurePlot() returns a lattice object. It must be print()ed explicitly,
# because this file is run through source(), which does not auto-print;
# without print() nothing is drawn on the png device.
print(featurePlot(x = x, y = y, plot = "density", scales = scales))

dev.off()
##### End: saving as a png file



4_logistic_regression.r
########## Logistic Regression

# Results are wrapped in print() because this file is run through source(),
# which does not auto-print the value of top-level expressions.
# Columns are referenced as Smarket$<name> rather than via attach(Smarket),
# so the script does not modify the search path and can be re-run cleanly.


##### Building Logistic Regression Model

# Full model: the five lagged returns plus Volume, fitted on every row.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket,
               family = binomial)

print(summary(glm.fit))

# Fitted probabilities for the rows the model was trained on.
# You look at the first 5 probabilities and they are very close to 50%:
glm.probs <- predict(glm.fit, type = "response")
print(glm.probs[1:5])

# Label a day "Up" when its fitted probability exceeds 0.5, otherwise "Down".
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Confusion matrix and accuracy on the training data.
print(table(glm.pred, Direction = Smarket$Direction))

print(mean(glm.pred == Smarket$Direction))



##### Creating Training and Test Samples

# Make training and test set: years before 2005 train, 2005 is held out.
train <- Smarket$Year < 2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket,
               family = binomial,
               subset = train)

# Predict only for the held-out rows.
glm.probs <- predict(glm.fit,
                     newdata = Smarket[!train, ],
                     type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")


# Observed direction for the held-out rows.
Direction.2005 <- Smarket$Direction[!train]
print(table(glm.pred, Direction.2005))

##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44

print(mean(glm.pred == Direction.2005))

## [1] 0.4801587



##### Solving Overfitting

# Fit a smaller model (Lag1, Lag2 and Lag3 only) on the same training rows.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3,
               data = Smarket,
               family = binomial,
               subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
print(table(glm.pred, Direction.2005))

##         Direction.2005
## glm.pred Down  Up
##     Down   39  31
##     Up     72 110

print(mean(glm.pred == Direction.2005))

## [1] 0.5912698



# Coefficient table for the smaller model (expected output is listed below).
print(summary(glm.fit))

##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3, family = binomial,
##     data = Smarket, subset = train)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max
## -1.338  -1.189   1.072   1.163   1.335
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.032230   0.063377   0.509    0.611
## Lag1        -0.055523   0.051709  -1.074    0.283
## Lag2        -0.044300   0.051674  -0.857    0.391
## Lag3         0.008815   0.051495   0.171    0.864
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 994  degrees of freedom
## AIC: 1389.4
##
## Number of Fisher Scoring iterations: 3



No comments:

Post a Comment

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Implementation and Showing Biases and Weights

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Implementation and Showing Biases and Weights ...