The Financial Journal (Global): R: linear and polynominal regressions - finding an optimal order of polynominal models

0_runme.txt

########## R: linear and polynominal regressions - finding an optimal order of polynominal models

# Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
#setwd('/Users/XXX/Dropbox/Google Drive/Coding/R/polynomial_regression/')

# source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_data_plotting.r')

source('4_linear_regression.r')

source('5_polynomial_regression.r')

source('6_anova.r')

1_install_packages.r

########## install packages

install.packages('labeling')
#install.packages('labeling_0.3.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/labeling/index.html

install.packages('withr')
#install.packages('withr_2.1.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/withr/index.html

install.packages('pkgconfig')
#install.packages('pkgconfig_2.0.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/pkgconfig/index.html

install.packages('zeallot')
#install.packages('zeallot_0.1.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/zeallot/index.html

install.packages('vctrs')
#install.packages('vctrs_0.1.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/vctrs/index.html

install.packages('crayon')
#install.packages('crayon_1.3.4.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/crayon/index.html

install.packages('pillar')
#install.packages('pillar_1.4.1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/pillar/index.html

install.packages('tibble')
#install.packages('tibble_2.1.3.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/tibble/index.html

install.packages('lazyeval')
#install.packages('lazyeval_0.2.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/lazyeval/index.html

install.packages('colorspace')
#install.packages('colorspace_1.4-1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/colorspace/index.html

install.packages('munsell')
#install.packages('munsell_0.5.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/munsell/index.html

install.packages('Rccp')
#install.packages('Rcpp_1.0.1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/Rcpp/index.html

install.packages('scales')
#install.packages('scales_1.0.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/scales/index.html

install.packages('rlang')
#install.packages('rlang_0.4.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/rlang/index.html

install.packages(‘gtable’)
#install.packages('gtable_0.3.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/gtable/index.html

install.packages('ggplot2')
#install.packages('ggplot2_3.2.0.zip') #zip file dowloaded https://cloud.r-project.org/web/packages/ggplot2/index.html

2_library.r

########## library setting

library('labeling')
library('withr')
library('pkgconfig')
library('zeallot')
library('vctrs')
library('crayon')
library('pillar')
library('tibble')
library('lazyeval')
library('colorspace')
library('munsell')
library('Rcpp')
library('scales')
library('rlang')
library('gtable')
library('ggplot2')

3_data_plotting.r

########## data plotting

gp3 <- ggplot(cars, aes(x = speed, y = dist)) + geom_point(size = 3) + xlim(0,25)
gp3

#draw a figure
print(gp3)

#saving as a png file
ggsave(file = "fig_3_data_plotting.png", plot = gp3)

4_linear_regression.r

########## linear regression
#
#If speed = 0, then dist = 0 in this case, so intercept should be zero
lm1 <- lm(dist ~ speed - 1, cars) # - 1 to make the intercept zero
summary(lm1)

#Call:
#lm(formula = dist ~ speed - 1, data = cars)
#
#Residuals:
# Min 1Q Median 3Q Max
#-26.183 -12.637 -5.455 4.590 50.181
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#speed 2.9091 0.1414 20.58 <2e-16 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 16.26 on 49 degrees of freedom
#Multiple R-squared: 0.8963, Adjusted R-squared: 0.8942
#F-statistic: 423.5 on 1 and 49 DF, p-value: < 2.2e-16
#

#coefficient of speed: 2.9091 (dist increases by 2.9091 when speed +1 mph)
#coefficient of speed (std. error): +-0.1414
#p-value is small enough. <2e-16 *** It is considered to be significant.
#Residual Standard Error 16.26 < (dist average = 42.98), 16.26 is 37% of 42.98
#R-squared: 0.8942 (~1)

gp4 <- gp3 + geom_smooth(method = lm, formula = y ~ x - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp4)

#saving as a png file
ggsave(file = "fig_4_linear_regression.png", plot = gp4)

5_polynomial_regression.r

########## second (2) order polynomial regression

# degree = 2
# raw = TRUE to add "-1" to the model
lm2 <- lm(dist ~ poly(speed, degree = 2, raw = TRUE) - 1, cars)
summary(lm2)

gp5_2 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 2, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_2)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_2.png", plot = gp5_2)

#Call:
#lm(formula = dist ~ poly(speed, degree = 2, raw = TRUE) - 1,
# data = cars)
#
#Residuals:
# Min 1Q Median 3Q Max
#-28.836 -9.071 -3.152 4.570 44.986
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#poly(speed, degree = 2, raw = TRUE)1 1.23903 0.55997 2.213 0.03171 *
#poly(speed, degree = 2, raw = TRUE)2 0.09014 0.02939 3.067 0.00355 **
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 15.02 on 48 degrees of freedom
#Multiple R-squared: 0.9133, Adjusted R-squared: 0.9097
#F-statistic: 252.8 on 2 and 48 DF, p-value: < 2.2e-16

#Residual standard error: 15.02 < 16.26 (previous result)

########## third (3) order polynomial regression
lm3 <- lm(dist ~ poly(speed, degree = 3, raw = TRUE) - 1, cars)
summary(lm3)
gp5_3 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 3, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_3)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_3.png", plot = gp5_3)

########## fifth (5) order polynomial regression
lm5 <- lm(dist ~ poly(speed, degree = 5, raw = TRUE) - 1, cars)
summary(lm5)
gp5_5 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 5, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_5)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_5.png", plot = gp5_5)

########## ninth (9) order polynomial regression
lm9 <- lm(dist ~ poly(speed, degree = 9, raw = TRUE) - 1, cars)
summary(lm9)
gp5_9 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 9, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_9)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_9.png", plot = gp5_9)

#Residual standard error is even smaller, but p-value is now bigger.
#This is like overfitting; too complicated models are not necessarily good.

6_anova.r

########## ANOVA: analysis of variance "So, how can we optimize the number of orders?"

#Option A (the simplest): finding larger number of orders with significant p-value

#Option B (ANOVA: analysis of variance): finding larger number of orders with significant p-value
# We can use ANOVA since regressions here are "nested" - second order polynominal regression is a liner regression plus another term.

capture.output(anova(lm1, lm2, lm3, lm5, lm9), file = "output_6_anova.txt")

#If you run anova on your R Console, then you can use sink() as follows:
#
#sink("output_6_anova.txt") # sink: save results of the following anova command.
#anova(lm1, lm2, lm3, lm5, lm9)
#sink() # This second sink() is to stop saving results.

#Analysis of Variance Table
#
#Model 1: dist ~ speed - 1
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
#Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
#Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
#Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
# Res.Df RSS Df Sum of Sq F Pr(>F)
#1 49 12954
#2 48 10831 1 2122.66 9.1542 0.004272 **
#3 47 10743 1 87.75 0.3784 0.541845
#4 45 10263 2 480.05 1.0351 0.364269
#5 41 9507 4 756.35 0.8155 0.522711
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1

# If adding a variable makes the model have higher explanatory power,
# then you can see ** within Pr(>F) as shown above.
# 0.004272 **
#
# So, in this case, second order polynominal model
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
# is the one we should choose.
#
# We do not go into detail, but this result (degree = 2) is backed by physics.

After running r scripts above, you'll get these png files on your R Console working directory:

After running 6_anova.r above, you'll get a txt file (output_6_anova.txt) on your R Console working directory which include:

Analysis of Variance Table

Model 1: dist ~ speed - 1
Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
Res.Df RSS Df Sum of Sq F Pr(>F)
1 49 12954
2 48 10831 1 2122.66 9.1542 0.004272 **
3 47 10743 1 87.75 0.3784 0.541845
4 45 10263 2 480.05 1.0351 0.364269
5 41 9507 4 756.35 0.8155 0.522711
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Financial Journal (Global)

AdSense

Friday, June 28, 2019

R: linear and polynominal regressions - finding an optimal order of polynominal models

No comments:

Post a Comment

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Impelementation and Showing Biases and Weights

Report Abuse