0_runme.txt
########## R: Logistic Regression

##### Entry point: run this script from your R console.

# Point R at the folder that holds the .r files listed below.
# The path is only a placeholder - replace it with the directory where you
# saved the scripts.
setwd("/Users/yoshi/Dropbox/Google Drive/Coding/R/logistic_regression/")

# One-time setup: uncomment on the very first run to install the packages.
#source('1_install_packages.r')

# Run the remaining steps in order: load the packages, explore the data,
# then fit the logistic regression models. source() evaluates each file in
# the global environment, so objects created by one step are visible to the
# next.
invisible(lapply(
  c("2_library.r", "3_data.r", "4_logistic_regression.r"),
  source
))

# Reference
# https://www.datacamp.com/community/tutorials/logistic-regression-R
Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5
1_install_packages.r
########## install packages

# Install every package the tutorial needs in a single call.
install.packages(c("ISLR", "Amelia", "mlbench", "corrplot", "caret"))

# Alternative: install each package from a .tgz file downloaded from its
# CRAN page.
#
# install.packages('ISLR_1.2.tgz')
#   https://cran.r-project.org/web/packages/ISLR/index.html
# install.packages('Amelia_1.7.5.tgz')
#   https://cran.r-project.org/web/packages/Amelia/index.html
# install.packages('mlbench_2.1-1.tgz')
#   https://cran.r-project.org/web/packages/mlbench/
# install.packages('corrplot_0.84.tgz')
#   https://cran.r-project.org/web/packages/corrplot/
# install.packages('caret_6.0-84.tgz')
#   https://cran.r-project.org/web/packages/caret/
########## library setting
# NOTE(review): presumably the contents of 2_library.r (the file sourced by
# 0_runme between the install step and 3_data.r); the file-name header is
# missing from this listing - confirm.

# Attach the packages used by 3_data.r and 4_logistic_regression.r.
library("ISLR")      # Smarket data set
library("Amelia")    # missmap()
library("mlbench")   # not referenced by the scripts shown here
library("corrplot")  # corrplot()
library("caret")     # featurePlot()
3_data.r
# For this tutorial, you're going to work with the Smarket dataset within
# RStudio.
# The dataset shows daily percentage returns for the S&P 500 stock index
# between 2001 and 2005.

# The results below are wrapped in print() on purpose: this file is run via
# source() from 0_runme, and source() does not auto-print the value of
# top-level expressions, so bare names()/head()/summary() calls would show
# nothing.

print(names(Smarket))
#[1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5" "Volume" "Today" "Direction"

print(head(Smarket))
#   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
# 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
# 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
# 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
# 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
# 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
# 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up

print(summary(Smarket))
# Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
# Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000 Down:602
# 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
# Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500 Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
# Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716 Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
# 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
# Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000

# Response variable: Direction - that shows whether the market went up or
# down since the previous day.

# Visualizing Data
#
# histogram
##### Start: saving as a png file
png("fig_3_data_1.png", width = 1600, height = 1600)
# One row of eight panels: one histogram per column 1-8 (column 9,
# Direction, holds the Up/Down labels and is skipped).
par(mfrow = c(1, 8))
for (i in 1:8) {
  hist(Smarket[, i], main = names(Smarket)[i])
}
dev.off()
##### End: saving as a png file
# It's extremely hard to see, but most of the variables show a Gaussian or
# double Gaussian distribution.
# box plots: one panel per column 1-8, laid out in a single row
##### Start: saving as a png file
png("fig_3_data_2.png", width = 1600, height = 1600)
par(mfrow = c(1, 8))
for (i in 1:8) {
  boxplot(Smarket[, i], main = names(Smarket)[i])
}
dev.off()
##### End: saving as a png file

# missing-value map of the whole data frame
##### Start: saving as a png file
#png("fig_3_data_3.png", width = 1600, height = 1600)
png("fig_3_data_3.png")
missmap(Smarket, col = c("blue", "red"), legend = FALSE)
dev.off()
##### End: saving as a png file

# correlation matrix of columns 1-8, drawn as circles
##### Start: saving as a png file
#png("fig_3_data_4.png", width = 1600, height = 1600)
png("fig_3_data_4.png")
correlations <- cor(Smarket[, 1:8])
corrplot(correlations, method = "circle")
dev.off()
##### End: saving as a png file

# scatter-plot matrix of all columns, points coloured by Direction
##### Start: saving as a png file
#png("fig_3_data_5.png", width = 1600, height = 1600)
png("fig_3_data_5.png")
pairs(Smarket, col = Smarket$Direction)
dev.off()
##### End: saving as a png file

# density plot of each predictor (columns 1-8), split by Direction (column 9)
##### Start: saving as a png file
#png("fig_3_data_6.png", width = 1600, height = 1600)
png("fig_3_data_6.png")
x <- Smarket[, 1:8]
y <- Smarket[, 9]
# Let every panel pick its own x and y axis range.
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
# featurePlot() returns a lattice (trellis) object, which is only drawn when
# it is printed. This file is run through source(), where top-level results
# are not auto-printed, so without the explicit print() the device would be
# closed before anything was drawn on it.
print(featurePlot(x = x, y = y, plot = "density", scales = scales))
dev.off()
##### End: saving as a png file
4_logistic_regression.r
########## Logistic Regression

# Results are wrapped in print() because this file is run through source()
# from 0_runme, and source() does not auto-print top-level expressions.
# Columns are referenced as Smarket$... instead of calling attach(Smarket):
# attach() would push another copy of the data frame onto the search path
# every time the script is re-run and hides where Direction / Year come from.

##### Building Logistic Regression Model
glm.fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial
)
print(summary(glm.fit))

# You look at the first 5 probabilities and they are very close to 50%:
glm.probs <- predict(glm.fit, type = "response")
print(glm.probs[1:5])

# Classify as "Up" when the predicted probability exceeds 0.5.
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# Confusion table and accuracy on the same data the model was fitted to.
# The second argument is named so the table header still reads "Direction".
print(table(glm.pred, Direction = Smarket$Direction))
print(mean(glm.pred == Smarket$Direction))

##### Creating Training and Test Samples
# Make training and test set: rows before 2005 train the model, rows from
# 2005 are held out for testing.
train <- Smarket$Year < 2005
glm.fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial,
  subset = train
)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
Direction.2005 <- Smarket$Direction[!train]
print(table(glm.pred, Direction.2005))
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
print(mean(glm.pred == Direction.2005))
## [1] 0.4801587

##### Solving Overfitting
# Fit a smaller model
glm.fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3,
  data = Smarket,
  family = binomial,
  subset = train
)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
print(table(glm.pred, Direction.2005))
##         Direction.2005
## glm.pred Down  Up
##     Down   39  31
##     Up     72 110
print(mean(glm.pred == Direction.2005))
## [1] 0.5912698
print(summary(glm.fit))
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3, family = binomial,
##     data = Smarket, subset = train)
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max
## -1.338  -1.189   1.072   1.163   1.335
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.032230   0.063377   0.509    0.611
## Lag1        -0.055523   0.051709  -1.074    0.283
## Lag2        -0.044300   0.051674  -0.857    0.391
## Lag3         0.008815   0.051495   0.171    0.864
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1383.3  on 997  degrees of freedom
## Residual deviance: 1381.4  on 994  degrees of freedom
## AIC: 1389.4
##
## Number of Fisher Scoring iterations: 3
No comments:
Post a Comment