AdSense

Friday, June 28, 2019

R: linear and polynominal regressions - finding an optimal order of polynominal models


0_runme.txt

########## R: linear and polynominal regressions - finding an optimal order of polynominal models

# Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
#setwd('/Users/XXX/Dropbox/Google Drive/Coding/R/polynomial_regression/')

# source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_data_plotting.r')

source('4_linear_regression.r')

source('5_polynomial_regression.r')

source('6_anova.r')



1_install_packages.r
########## install packages

install.packages('labeling')
#install.packages('labeling_0.3.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/labeling/index.html

install.packages('withr')
#install.packages('withr_2.1.2.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/withr/index.html

install.packages('pkgconfig')
#install.packages('pkgconfig_2.0.2.zip')                #zip file dowloaded from https://cran.r-project.org/web/packages/pkgconfig/index.html

install.packages('zeallot')
#install.packages('zeallot_0.1.0.zip')     #zip file dowloaded from https://cran.r-project.org/web/packages/zeallot/index.html

install.packages('vctrs')
#install.packages('vctrs_0.1.0.zip')        #zip file dowloaded from https://cran.r-project.org/web/packages/vctrs/index.html

install.packages('crayon')
#install.packages('crayon_1.3.4.zip')    #zip file dowloaded from https://cran.r-project.org/web/packages/crayon/index.html

install.packages('pillar')
#install.packages('pillar_1.4.1.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/pillar/index.html

install.packages('tibble')
#install.packages('tibble_2.1.3.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/tibble/index.html

install.packages('lazyeval')
#install.packages('lazyeval_0.2.2.zip')  #zip file dowloaded from https://cran.r-project.org/web/packages/lazyeval/index.html

install.packages('colorspace')
#install.packages('colorspace_1.4-1.zip')              #zip file dowloaded from https://cran.r-project.org/web/packages/colorspace/index.html

install.packages('munsell')
#install.packages('munsell_0.5.0.zip')  #zip file dowloaded from https://cran.r-project.org/web/packages/munsell/index.html

install.packages('Rccp')
#install.packages('Rcpp_1.0.1.zip')        #zip file dowloaded from https://cran.r-project.org/web/packages/Rcpp/index.html

install.packages('scales')
#install.packages('scales_1.0.0.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/scales/index.html

install.packages('rlang')
#install.packages('rlang_0.4.0.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/rlang/index.html

install.packages(‘gtable’)
#install.packages('gtable_0.3.0.zip')     #zip file dowloaded from https://cran.r-project.org/web/packages/gtable/index.html

install.packages('ggplot2')
#install.packages('ggplot2_3.2.0.zip')   #zip file dowloaded https://cloud.r-project.org/web/packages/ggplot2/index.html


2_library.r
########## library setting

library('labeling')
library('withr')
library('pkgconfig')
library('zeallot')
library('vctrs')
library('crayon')
library('pillar')
library('tibble')
library('lazyeval')
library('colorspace')
library('munsell')
library('Rcpp')
library('scales')
library('rlang')
library('gtable')
library('ggplot2')


3_data_plotting.r
########## data plotting


gp3 <- ggplot(cars, aes(x = speed, y = dist)) + geom_point(size = 3) + xlim(0,25)
gp3


#draw a figure
print(gp3)


#saving as a png file
ggsave(file = "fig_3_data_plotting.png", plot = gp3)

4_linear_regression.r
########## linear regression
#
#If speed = 0, then dist = 0 in this case, so intercept should be zero
lm1 <- lm(dist ~ speed - 1, cars)           # - 1 to make the intercept zero
summary(lm1)

#Call:
#lm(formula = dist ~ speed - 1, data = cars)
#
#Residuals:
#    Min      1Q  Median      3Q     Max
#-26.183 -12.637  -5.455   4.590  50.181
#
#Coefficients:
#      Estimate Std. Error t value Pr(>|t|)
#speed   2.9091     0.1414   20.58   <2e-16 ***
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 16.26 on 49 degrees of freedom
#Multiple R-squared:  0.8963,    Adjusted R-squared:  0.8942
#F-statistic: 423.5 on 1 and 49 DF,  p-value: < 2.2e-16
#

#coefficient of speed: 2.9091 (dist increases by 2.9091 when speed +1 mph)
#coefficient of speed (std. error): +-0.1414
#p-value is small enough. <2e-16 ***  It is considered to be significant.
#Residual Standard Error 16.26 < (dist average = 42.98), 16.26 is 37% of 42.98
#R-squared:  0.8942 (~1)

gp4 <- gp3 + geom_smooth(method = lm, formula = y ~ x - 1, se = FALSE, fullrange = TRUE)


#draw a figure
print(gp4)


#saving as a png file
ggsave(file = "fig_4_linear_regression.png", plot = gp4)


5_polynomial_regression.r
########## second (2) order polynomial regression

# degree = 2
# raw = TRUE to add "-1" to the model
lm2 <- lm(dist ~ poly(speed, degree = 2, raw = TRUE) - 1, cars)
summary(lm2)

gp5_2 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 2, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)


#draw a figure
print(gp5_2)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_2.png", plot = gp5_2)


#Call:
#lm(formula = dist ~ poly(speed, degree = 2, raw = TRUE) - 1,
#    data = cars)
#
#Residuals:
#    Min      1Q  Median      3Q     Max
#-28.836  -9.071  -3.152   4.570  44.986
#
#Coefficients:
#                                     Estimate Std. Error t value Pr(>|t|)
#poly(speed, degree = 2, raw = TRUE)1  1.23903    0.55997   2.213  0.03171 *
#poly(speed, degree = 2, raw = TRUE)2  0.09014    0.02939   3.067  0.00355 **
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 15.02 on 48 degrees of freedom
#Multiple R-squared:  0.9133,    Adjusted R-squared:  0.9097
#F-statistic: 252.8 on 2 and 48 DF,  p-value: < 2.2e-16

#Residual standard error: 15.02 < 16.26 (previous result)



########## third (3) order polynomial regression
lm3 <- lm(dist ~ poly(speed, degree = 3, raw = TRUE) - 1, cars)
summary(lm3)
gp5_3 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 3, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_3)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_3.png", plot = gp5_3)



########## fifth (5) order polynomial regression
lm5 <- lm(dist ~ poly(speed, degree = 5, raw = TRUE) - 1, cars)
summary(lm5)
gp5_5 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 5, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_5)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_5.png", plot = gp5_5)



########## ninth (9) order polynomial regression
lm9 <- lm(dist ~ poly(speed, degree = 9, raw = TRUE) - 1, cars)
summary(lm9)
gp5_9 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 9, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_9)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_9.png", plot = gp5_9)


#Residual standard error is even smaller, but p-value is now bigger.
#This is like overfitting; too complicated models are not necessarily good.

6_anova.r
########## ANOVA: analysis of variance "So, how can we optimize the number of orders?"

#Option A (the simplest): finding larger number of orders with significant p-value

#Option B (ANOVA: analysis of variance): finding larger number of orders with significant p-value
# We can use ANOVA since regressions here are "nested" - second order polynominal regression is a liner regression plus another term.

capture.output(anova(lm1, lm2, lm3, lm5, lm9), file = "output_6_anova.txt")


#If you run anova on your R Console, then you can use sink() as follows:
#
#sink("output_6_anova.txt") # sink: save results of the following anova command.
#anova(lm1, lm2, lm3, lm5, lm9)
#sink() # This second sink() is to stop saving results.



#Analysis of Variance Table
#
#Model 1: dist ~ speed - 1
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
#Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
#Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
#Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
#  Res.Df   RSS Df Sum of Sq      F   Pr(>F)
#1     49 12954                            
#2     48 10831  1   2122.66 9.1542 0.004272 **
#3     47 10743  1     87.75 0.3784 0.541845
#4     45 10263  2    480.05 1.0351 0.364269
#5     41  9507  4    756.35 0.8155 0.522711
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1

# If adding a variable makes the model have higher explanatory power,
# then you can see ** within Pr(>F) as shown above.
# 0.004272 **
#
# So, in this case, second order polynominal model
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
# is the one we should choose.
#
# We do not go into detail, but this result (degree = 2) is backed by physics.



After running r scripts above, you'll get these png files on your R Console working directory:







After running 6_anova.r above, you'll get a txt file (output_6_anova.txt) on your R Console working directory which include:


Analysis of Variance Table

Model 1: dist ~ speed - 1
Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
  Res.Df   RSS Df Sum of Sq      F   Pr(>F)
1     49 12954                              
2     48 10831  1   2122.66 9.1542 0.004272 **
3     47 10743  1     87.75 0.3784 0.541845
4     45 10263  2    480.05 1.0351 0.364269
5     41  9507  4    756.35 0.8155 0.522711
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


No comments:

Post a Comment

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Impelementation and Showing Biases and Weights

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Impelementation and Showing Biases and Weights ...