The Financial Journal (Global): R: Logistic Regression

0_runme.txt

########## R: Logistic Regression

##### Entry point: run this script from your R console

# Point the working directory at the folder that holds the numbered .r files.
# The path below is only a placeholder - replace it with your own directory.
setwd("/Users/yoshi/Dropbox/Google Drive/Coding/R/logistic_regression/")

# One-time setup: uncomment on the very first run to install the packages.
# source("1_install_packages.r")

# Run the remaining steps in order: attach packages, explore the data,
# then fit the logistic regression models.
invisible(lapply(
  c("2_library.r", "3_data.r", "4_logistic_regression.r"),
  source
))

# Reference
# https://www.datacamp.com/community/tutorials/logistic-regression-R

Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5

1_install_packages.r

########## Install packages (only needed once)

# Install every package used by the later scripts in a single call.
install.packages(c("ISLR", "Amelia", "mlbench", "corrplot", "caret"))

# Alternative: install from a .tgz archive downloaded from the package's
# CRAN page by uncommenting the matching line below.
#
# install.packages('ISLR_1.2.tgz')
#   downloaded from https://cran.r-project.org/web/packages/ISLR/index.html
# install.packages('Amelia_1.7.5.tgz')
#   downloaded from https://cran.r-project.org/web/packages/Amelia/index.html
# install.packages('mlbench_2.1-1.tgz')
#   downloaded from https://cran.r-project.org/web/packages/mlbench/
# install.packages('corrplot_0.84.tgz')
#   downloaded from https://cran.r-project.org/web/packages/corrplot/
# install.packages('caret_6.0-84.tgz')
#   downloaded from https://cran.r-project.org/web/packages/caret/

2_library.r

########## Attach the packages used by the data and modelling scripts

# Attached in the same order as before; each call stops with an error if the
# package has not been installed yet (see 1_install_packages.r).
library("ISLR")
library("Amelia")
library("mlbench")
library("corrplot")
library("caret")

3_data.r

# For this tutorial, you're going to work with the Smarket dataset within RStudio.
# The dataset shows daily percentage returns for the S&P 500 stock index between 2001 and 2005.

# Each result is wrapped in print() because this script is run through
# source(), which does not auto-print the value of top-level expressions.
# Without print() none of the output shown in the comments below would appear.

# Column names
print(names(Smarket))
#[1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5" "Volume" "Today" "Direction"

# First six rows
print(head(Smarket))
# Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
#1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
#2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
#3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
#4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
#5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
#6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up

# Per-column summary statistics
print(summary(Smarket))
# Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
# Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000 Down:602
# 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
# Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500 Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
# Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716 Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
# 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
# Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000

# Response variable: Direction - shows whether the market went up or down since the previous day.

# Visualizing Data
#
# histogram

##### Figure 1 (start): histograms of the first eight columns, saved as PNG
png(filename = "fig_3_data_1.png", width = 1600, height = 1600)

# Arrange the eight panels side by side in a single row.
par(mfrow = c(1, 8))
for (i in seq_len(8)) {
  # Each panel is titled with the name of the column it shows.
  hist(Smarket[, i], main = names(Smarket)[i])
}

# Close the PNG device to finish writing the file.
dev.off()
##### Figure 1 (end)

# It's extremely hard to see, but most of the variables show a Gaussian or double Gaussian distribution.

##### Figure 2 (start): box plots of the first eight columns, saved as PNG
png(filename = "fig_3_data_2.png", width = 1600, height = 1600)

# One row of eight panels, one box plot per column.
par(mfrow = c(1, 8))
for (i in seq_len(8)) {
  boxplot(Smarket[[i]], main = colnames(Smarket)[i])
}

# Close the PNG device to finish writing the file.
dev.off()
##### Figure 2 (end)

##### Figure 3 (start): missing-value map of the whole dataset, saved as PNG
# Default device size is used here; for a larger image use instead:
#   png("fig_3_data_3.png", width = 1600, height = 1600)
png(filename = "fig_3_data_3.png")

# Draw the map with the blue/red colour pair and without a legend.
missmap(
  Smarket,
  col = c("blue", "red"),
  legend = FALSE
)

# Close the PNG device to finish writing the file.
dev.off()
##### Figure 3 (end)

##### Figure 4 (start): correlation plot of the first eight columns, saved as PNG
# Default device size is used here; for a larger image use instead:
#   png("fig_3_data_4.png", width = 1600, height = 1600)
png(filename = "fig_3_data_4.png")

# Pairwise correlations between columns 1 to 8 (column 9, Direction, is left out).
correlations <- cor(Smarket[, seq_len(8)])

# Show the correlation matrix using circles.
corrplot(correlations, method = "circle")

# Close the PNG device to finish writing the file.
dev.off()
##### Figure 4 (end)

##### Figure 5 (start): scatter-plot matrix of all columns, saved as PNG
# Default device size is used here; for a larger image use instead:
#   png("fig_3_data_5.png", width = 1600, height = 1600)
png(filename = "fig_3_data_5.png")

# Points are coloured by the Direction column.
pairs(x = Smarket, col = Smarket[["Direction"]])

# Close the PNG device to finish writing the file.
dev.off()
##### Figure 5 (end)

##### Start: saving as a png file
# Density plot of each of the first eight columns, split by Direction.
#png("fig_3_data_6.png", width = 1600, height = 1600)
png("fig_3_data_6.png")

# Predictors are columns 1 to 8; the class label is column 9 (Direction).
x <- Smarket[, 1:8]
y <- Smarket[, 9]

# Give every panel its own x and y axis range.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))

# featurePlot() returns a lattice (trellis) object, which is drawn only when
# it is printed. This script is run through source(), which does not
# auto-print top-level values, so without print() nothing is drawn on the
# device and the PNG comes out empty or missing.
print(featurePlot(x = x, y = y, plot = "density", scales = scales))

dev.off()
##### End: saving as a png file

4_logistic_regression.r

########## Logistic Regression

# Results are shown with explicit print() calls because this script is run
# through source(), which does not auto-print top-level expressions.

##### Building Logistic Regression Model

# Model Direction on the five lagged returns and Volume, using every row.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)

print(summary(glm.fit))

# You look at the first 5 probabilities and they are very close to 50%:
glm.probs <- predict(glm.fit, type = "response")
print(glm.probs[1:5])

# Label a day "Up" when its fitted probability exceeds 0.5, otherwise "Down".
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Columns are read as Smarket$... instead of calling attach(Smarket): attach()
# puts a copy of every column on the search path, where it can silently mask
# or be masked by other objects. The argument is named so the table keeps its
# "Direction" header.
print(table(glm.pred, Direction = Smarket$Direction))

# Share of matching labels, measured on the same rows the model was fitted on.
print(mean(glm.pred == Smarket$Direction))

##### Creating Training and Test Samples

# Make training and test set: rows before 2005 train the model,
# the remaining rows are held out for testing.
train <- Smarket$Year < 2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket,
               family = binomial,
               subset = train)

# Predict probabilities for the held-out rows only.
glm.probs <- predict(glm.fit,
                     newdata = Smarket[!train, ],
                     type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Observed labels of the held-out rows, compared against the predictions.
Direction.2005 <- Smarket$Direction[!train]
print(table(glm.pred, Direction.2005))

## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44

# Share of held-out rows whose predicted label matches the observed one.
print(mean(glm.pred == Direction.2005))

## [1] 0.4801587

##### Solving Overfitting

# Results are shown with explicit print() calls because this script is run
# through source(), which does not auto-print top-level expressions.

# Fit a smaller model: only Lag1, Lag2 and Lag3, on the same training rows
# (`train`), and evaluate it on the same held-out rows as before.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3,
               data = Smarket, family = binomial, subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
print(table(glm.pred, Direction.2005))

## Direction.2005
## glm.pred Down Up
## Down 39 31
## Up 72 110

# Share of held-out rows whose predicted label matches the observed one.
print(mean(glm.pred == Direction.2005))

## [1] 0.5912698

print(summary(glm.fit))

##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3, family = binomial,
## data = Smarket, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.338 -1.189 1.072 1.163 1.335
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.032230 0.063377 0.509 0.611
## Lag1 -0.055523 0.051709 -1.074 0.283
## Lag2 -0.044300 0.051674 -0.857 0.391
## Lag3 0.008815 0.051495 0.171 0.864
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1383.3 on 997 degrees of freedom
## Residual deviance: 1381.4 on 994 degrees of freedom
## AIC: 1389.4
##
## Number of Fisher Scoring iterations: 3

The Financial Journal (Global)

AdSense

Saturday, July 6, 2019

R: Logistic Regression

No comments:

Post a Comment

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Implementation and Showing Biases and Weights

Report Abuse