0_runme.txt
########## R: linear and polynominal regressions - finding an optimal order of polynominal models # Run this script on your R Console # Set your working directory on your R Console # The following directory is dummy - set to your own directory where you save all the r files below. #setwd('/Users/XXX/Dropbox/Google Drive/Coding/R/polynomial_regression/') # source('1_install_packages.r') # You have to run this r script only for the first time. source('2_library.r') source('3_data_plotting.r') source('4_linear_regression.r') source('5_polynomial_regression.r') source('6_anova.r') |
1_install_packages.r
########## install packages install.packages('labeling') #install.packages('labeling_0.3.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/labeling/index.html install.packages('withr') #install.packages('withr_2.1.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/withr/index.html install.packages('pkgconfig') #install.packages('pkgconfig_2.0.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/pkgconfig/index.html install.packages('zeallot') #install.packages('zeallot_0.1.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/zeallot/index.html install.packages('vctrs') #install.packages('vctrs_0.1.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/vctrs/index.html install.packages('crayon') #install.packages('crayon_1.3.4.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/crayon/index.html install.packages('pillar') #install.packages('pillar_1.4.1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/pillar/index.html install.packages('tibble') #install.packages('tibble_2.1.3.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/tibble/index.html install.packages('lazyeval') #install.packages('lazyeval_0.2.2.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/lazyeval/index.html install.packages('colorspace') #install.packages('colorspace_1.4-1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/colorspace/index.html install.packages('munsell') #install.packages('munsell_0.5.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/munsell/index.html install.packages('Rccp') #install.packages('Rcpp_1.0.1.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/Rcpp/index.html install.packages('scales') #install.packages('scales_1.0.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/scales/index.html install.packages('rlang') #install.packages('rlang_0.4.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/rlang/index.html install.packages(‘gtable’) #install.packages('gtable_0.3.0.zip') #zip file dowloaded from https://cran.r-project.org/web/packages/gtable/index.html install.packages('ggplot2') #install.packages('ggplot2_3.2.0.zip') #zip file dowloaded https://cloud.r-project.org/web/packages/ggplot2/index.html |
########## library setting library('labeling') library('withr') library('pkgconfig') library('zeallot') library('vctrs') library('crayon') library('pillar') library('tibble') library('lazyeval') library('colorspace') library('munsell') library('Rcpp') library('scales') library('rlang') library('gtable') library('ggplot2') |
3_data_plotting.r
########## data plotting gp3 <- ggplot(cars, aes(x = speed, y = dist)) + geom_point(size = 3) + xlim(0,25) gp3 #draw a figure print(gp3) #saving as a png file ggsave(file = "fig_3_data_plotting.png", plot = gp3) |
4_linear_regression.r
########## linear regression # #If speed = 0, then dist = 0 in this case, so intercept should be zero lm1 <- lm(dist ~ speed - 1, cars) # - 1 to make the intercept zero summary(lm1) #Call: #lm(formula = dist ~ speed - 1, data = cars) # #Residuals: # Min 1Q Median 3Q Max #-26.183 -12.637 -5.455 4.590 50.181 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #speed 2.9091 0.1414 20.58 <2e-16 *** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 16.26 on 49 degrees of freedom #Multiple R-squared: 0.8963, Adjusted R-squared: 0.8942 #F-statistic: 423.5 on 1 and 49 DF, p-value: < 2.2e-16 # #coefficient of speed: 2.9091 (dist increases by 2.9091 when speed +1 mph) #coefficient of speed (std. error): +-0.1414 #p-value is small enough. <2e-16 *** It is considered to be significant. #Residual Standard Error 16.26 < (dist average = 42.98), 16.26 is 37% of 42.98 #R-squared: 0.8942 (~1) gp4 <- gp3 + geom_smooth(method = lm, formula = y ~ x - 1, se = FALSE, fullrange = TRUE) #draw a figure print(gp4) #saving as a png file ggsave(file = "fig_4_linear_regression.png", plot = gp4) |
5_polynomial_regression.r
########## second (2) order polynomial regression # degree = 2 # raw = TRUE to add "-1" to the model lm2 <- lm(dist ~ poly(speed, degree = 2, raw = TRUE) - 1, cars) summary(lm2) gp5_2 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 2, raw = TRUE) - 1, se = FALSE, fullrange = TRUE) #draw a figure print(gp5_2) #saving as a png file ggsave(file = "fig_5_polynomial_regression_2.png", plot = gp5_2) #Call: #lm(formula = dist ~ poly(speed, degree = 2, raw = TRUE) - 1, # data = cars) # #Residuals: # Min 1Q Median 3Q Max #-28.836 -9.071 -3.152 4.570 44.986 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #poly(speed, degree = 2, raw = TRUE)1 1.23903 0.55997 2.213 0.03171 * #poly(speed, degree = 2, raw = TRUE)2 0.09014 0.02939 3.067 0.00355 ** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 15.02 on 48 degrees of freedom #Multiple R-squared: 0.9133, Adjusted R-squared: 0.9097 #F-statistic: 252.8 on 2 and 48 DF, p-value: < 2.2e-16 #Residual standard error: 15.02 < 16.26 (previous result) ########## third (3) order polynomial regression lm3 <- lm(dist ~ poly(speed, degree = 3, raw = TRUE) - 1, cars) summary(lm3) gp5_3 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 3, raw = TRUE) - 1, se = FALSE, fullrange = TRUE) #draw a figure print(gp5_3) #saving as a png file ggsave(file = "fig_5_polynomial_regression_3.png", plot = gp5_3) ########## fifth (5) order polynomial regression lm5 <- lm(dist ~ poly(speed, degree = 5, raw = TRUE) - 1, cars) summary(lm5) gp5_5 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 5, raw = TRUE) - 1, se = FALSE, fullrange = TRUE) #draw a figure print(gp5_5) #saving as a png file ggsave(file = "fig_5_polynomial_regression_5.png", plot = gp5_5) ########## ninth (9) order polynomial regression lm9 <- lm(dist ~ poly(speed, degree = 9, raw = TRUE) - 1, cars) summary(lm9) gp5_9 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 9, raw = TRUE) - 1, se = FALSE, fullrange = TRUE) #draw a figure print(gp5_9) #saving as a png file ggsave(file = "fig_5_polynomial_regression_9.png", plot = gp5_9) #Residual standard error is even smaller, but p-value is now bigger. #This is like overfitting; too complicated models are not necessarily good. |
6_anova.r
########## ANOVA: analysis of variance "So, how can we optimize the number of orders?" #Option A (the simplest): finding larger number of orders with significant p-value #Option B (ANOVA: analysis of variance): finding larger number of orders with significant p-value # We can use ANOVA since regressions here are "nested" - second order polynominal regression is a liner regression plus another term. capture.output(anova(lm1, lm2, lm3, lm5, lm9), file = "output_6_anova.txt") #If you run anova on your R Console, then you can use sink() as follows: # #sink("output_6_anova.txt") # sink: save results of the following anova command. #anova(lm1, lm2, lm3, lm5, lm9) #sink() # This second sink() is to stop saving results. #Analysis of Variance Table # #Model 1: dist ~ speed - 1 #Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1 #Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1 #Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1 #Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1 # Res.Df RSS Df Sum of Sq F Pr(>F) #1 49 12954 #2 48 10831 1 2122.66 9.1542 0.004272 ** #3 47 10743 1 87.75 0.3784 0.541845 #4 45 10263 2 480.05 1.0351 0.364269 #5 41 9507 4 756.35 0.8155 0.522711 #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 # If adding a variable makes the model have higher explanatory power, # then you can see ** within Pr(>F) as shown above. # 0.004272 ** # # So, in this case, second order polynominal model #Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1 # is the one we should choose. # # We do not go into detail, but this result (degree = 2) is backed by physics. |
After running r scripts above, you'll get these png files on your R Console working directory:
After running 6_anova.r above, you'll get a txt file (output_6_anova.txt) on your R Console working directory which include:
Analysis of Variance Table
Model 1: dist ~ speed - 1
Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
Res.Df RSS Df Sum of Sq F Pr(>F)
1 49 12954
2 48 10831 1 2122.66 9.1542 0.004272 **
3 47 10743 1 87.75 0.3784 0.541845
4 45 10263 2 480.05 1.0351 0.364269
5 41 9507 4 756.35 0.8155 0.522711
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1