AdSense

Friday, June 28, 2019

R: linear and polynominal regressions - finding an optimal order of polynominal models


0_runme.txt

########## R: linear and polynominal regressions - finding an optimal order of polynominal models

# Run this script on your R Console

# Set your working directory on your R Console
# The following directory is dummy - set to your own directory where you save all the r files below.
#setwd('/Users/XXX/Dropbox/Google Drive/Coding/R/polynomial_regression/')

# source('1_install_packages.r') # You have to run this r script only for the first time.

source('2_library.r')

source('3_data_plotting.r')

source('4_linear_regression.r')

source('5_polynomial_regression.r')

source('6_anova.r')



1_install_packages.r
########## install packages

install.packages('labeling')
#install.packages('labeling_0.3.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/labeling/index.html

install.packages('withr')
#install.packages('withr_2.1.2.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/withr/index.html

install.packages('pkgconfig')
#install.packages('pkgconfig_2.0.2.zip')                #zip file dowloaded from https://cran.r-project.org/web/packages/pkgconfig/index.html

install.packages('zeallot')
#install.packages('zeallot_0.1.0.zip')     #zip file dowloaded from https://cran.r-project.org/web/packages/zeallot/index.html

install.packages('vctrs')
#install.packages('vctrs_0.1.0.zip')        #zip file dowloaded from https://cran.r-project.org/web/packages/vctrs/index.html

install.packages('crayon')
#install.packages('crayon_1.3.4.zip')    #zip file dowloaded from https://cran.r-project.org/web/packages/crayon/index.html

install.packages('pillar')
#install.packages('pillar_1.4.1.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/pillar/index.html

install.packages('tibble')
#install.packages('tibble_2.1.3.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/tibble/index.html

install.packages('lazyeval')
#install.packages('lazyeval_0.2.2.zip')  #zip file dowloaded from https://cran.r-project.org/web/packages/lazyeval/index.html

install.packages('colorspace')
#install.packages('colorspace_1.4-1.zip')              #zip file dowloaded from https://cran.r-project.org/web/packages/colorspace/index.html

install.packages('munsell')
#install.packages('munsell_0.5.0.zip')  #zip file dowloaded from https://cran.r-project.org/web/packages/munsell/index.html

install.packages('Rccp')
#install.packages('Rcpp_1.0.1.zip')        #zip file dowloaded from https://cran.r-project.org/web/packages/Rcpp/index.html

install.packages('scales')
#install.packages('scales_1.0.0.zip')      #zip file dowloaded from https://cran.r-project.org/web/packages/scales/index.html

install.packages('rlang')
#install.packages('rlang_0.4.0.zip')       #zip file dowloaded from https://cran.r-project.org/web/packages/rlang/index.html

install.packages(‘gtable’)
#install.packages('gtable_0.3.0.zip')     #zip file dowloaded from https://cran.r-project.org/web/packages/gtable/index.html

install.packages('ggplot2')
#install.packages('ggplot2_3.2.0.zip')   #zip file dowloaded https://cloud.r-project.org/web/packages/ggplot2/index.html


2_library.r
########## library setting

library('labeling')
library('withr')
library('pkgconfig')
library('zeallot')
library('vctrs')
library('crayon')
library('pillar')
library('tibble')
library('lazyeval')
library('colorspace')
library('munsell')
library('Rcpp')
library('scales')
library('rlang')
library('gtable')
library('ggplot2')


3_data_plotting.r
########## data plotting


gp3 <- ggplot(cars, aes(x = speed, y = dist)) + geom_point(size = 3) + xlim(0,25)
gp3


#draw a figure
print(gp3)


#saving as a png file
ggsave(file = "fig_3_data_plotting.png", plot = gp3)

4_linear_regression.r
########## linear regression
#
#If speed = 0, then dist = 0 in this case, so intercept should be zero
lm1 <- lm(dist ~ speed - 1, cars)           # - 1 to make the intercept zero
summary(lm1)

#Call:
#lm(formula = dist ~ speed - 1, data = cars)
#
#Residuals:
#    Min      1Q  Median      3Q     Max
#-26.183 -12.637  -5.455   4.590  50.181
#
#Coefficients:
#      Estimate Std. Error t value Pr(>|t|)
#speed   2.9091     0.1414   20.58   <2e-16 ***
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 16.26 on 49 degrees of freedom
#Multiple R-squared:  0.8963,    Adjusted R-squared:  0.8942
#F-statistic: 423.5 on 1 and 49 DF,  p-value: < 2.2e-16
#

#coefficient of speed: 2.9091 (dist increases by 2.9091 when speed +1 mph)
#coefficient of speed (std. error): +-0.1414
#p-value is small enough. <2e-16 ***  It is considered to be significant.
#Residual Standard Error 16.26 < (dist average = 42.98), 16.26 is 37% of 42.98
#R-squared:  0.8942 (~1)

gp4 <- gp3 + geom_smooth(method = lm, formula = y ~ x - 1, se = FALSE, fullrange = TRUE)


#draw a figure
print(gp4)


#saving as a png file
ggsave(file = "fig_4_linear_regression.png", plot = gp4)


5_polynomial_regression.r
########## second (2) order polynomial regression

# degree = 2
# raw = TRUE to add "-1" to the model
lm2 <- lm(dist ~ poly(speed, degree = 2, raw = TRUE) - 1, cars)
summary(lm2)

gp5_2 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 2, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)


#draw a figure
print(gp5_2)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_2.png", plot = gp5_2)


#Call:
#lm(formula = dist ~ poly(speed, degree = 2, raw = TRUE) - 1,
#    data = cars)
#
#Residuals:
#    Min      1Q  Median      3Q     Max
#-28.836  -9.071  -3.152   4.570  44.986
#
#Coefficients:
#                                     Estimate Std. Error t value Pr(>|t|)
#poly(speed, degree = 2, raw = TRUE)1  1.23903    0.55997   2.213  0.03171 *
#poly(speed, degree = 2, raw = TRUE)2  0.09014    0.02939   3.067  0.00355 **
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 15.02 on 48 degrees of freedom
#Multiple R-squared:  0.9133,    Adjusted R-squared:  0.9097
#F-statistic: 252.8 on 2 and 48 DF,  p-value: < 2.2e-16

#Residual standard error: 15.02 < 16.26 (previous result)



########## third (3) order polynomial regression
lm3 <- lm(dist ~ poly(speed, degree = 3, raw = TRUE) - 1, cars)
summary(lm3)
gp5_3 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 3, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_3)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_3.png", plot = gp5_3)



########## fifth (5) order polynomial regression
lm5 <- lm(dist ~ poly(speed, degree = 5, raw = TRUE) - 1, cars)
summary(lm5)
gp5_5 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 5, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_5)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_5.png", plot = gp5_5)



########## ninth (9) order polynomial regression
lm9 <- lm(dist ~ poly(speed, degree = 9, raw = TRUE) - 1, cars)
summary(lm9)
gp5_9 <- gp3 + geom_smooth(method = lm, formula = y ~ poly(x, degree = 9, raw = TRUE) - 1, se = FALSE, fullrange = TRUE)

#draw a figure
print(gp5_9)

#saving as a png file
ggsave(file = "fig_5_polynomial_regression_9.png", plot = gp5_9)


#Residual standard error is even smaller, but p-value is now bigger.
#This is like overfitting; too complicated models are not necessarily good.

6_anova.r
########## ANOVA: analysis of variance "So, how can we optimize the number of orders?"

#Option A (the simplest): finding larger number of orders with significant p-value

#Option B (ANOVA: analysis of variance): finding larger number of orders with significant p-value
# We can use ANOVA since regressions here are "nested" - second order polynominal regression is a liner regression plus another term.

capture.output(anova(lm1, lm2, lm3, lm5, lm9), file = "output_6_anova.txt")


#If you run anova on your R Console, then you can use sink() as follows:
#
#sink("output_6_anova.txt") # sink: save results of the following anova command.
#anova(lm1, lm2, lm3, lm5, lm9)
#sink() # This second sink() is to stop saving results.



#Analysis of Variance Table
#
#Model 1: dist ~ speed - 1
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
#Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
#Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
#Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
#  Res.Df   RSS Df Sum of Sq      F   Pr(>F)
#1     49 12954                            
#2     48 10831  1   2122.66 9.1542 0.004272 **
#3     47 10743  1     87.75 0.3784 0.541845
#4     45 10263  2    480.05 1.0351 0.364269
#5     41  9507  4    756.35 0.8155 0.522711
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1

# If adding a variable makes the model have higher explanatory power,
# then you can see ** within Pr(>F) as shown above.
# 0.004272 **
#
# So, in this case, second order polynominal model
#Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
# is the one we should choose.
#
# We do not go into detail, but this result (degree = 2) is backed by physics.



After running r scripts above, you'll get these png files on your R Console working directory:







After running 6_anova.r above, you'll get a txt file (output_6_anova.txt) on your R Console working directory which include:


Analysis of Variance Table

Model 1: dist ~ speed - 1
Model 2: dist ~ poly(speed, degree = 2, raw = TRUE) - 1
Model 3: dist ~ poly(speed, degree = 3, raw = TRUE) - 1
Model 4: dist ~ poly(speed, degree = 5, raw = TRUE) - 1
Model 5: dist ~ poly(speed, degree = 9, raw = TRUE) - 1
  Res.Df   RSS Df Sum of Sq      F   Pr(>F)
1     49 12954                              
2     48 10831  1   2122.66 9.1542 0.004272 **
3     47 10743  1     87.75 0.3784 0.541845
4     45 10263  2    480.05 1.0351 0.364269
5     41  9507  4    756.35 0.8155 0.522711
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


Monday, June 17, 2019

keeping minutes of a meeting

A small tip for keeping minutes of a meeting:

1. If you join a meeting, then please try to take minutes of the meeting by yourself (unless you’re a final decision maker). If you cannot understand words, contexts, etc., ask for other participants’ help after the meeting. (That’s why taking minutes is one of the best ways to learn especially for young professionals.)

2. Finish taking your own minutes, ask other participants (on your end, such as your boss and colleagues) to review, update the minutes when necessary.

3. After completing the minutes on your company’s end, then distribute it to all the participants and other related parties. When sending the completed minutes, please do include executive summary not only for those who cannot attend, but also for the participants who join the meeting. Executive summary is the best way to summarize what needs to be done, how, when, who.. If you really understand the situation, you should be able to do it; otherwise, you understand you did not understand it.


Hope this helps.

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Impelementation and Showing Biases and Weights

Deep Learning (Regression, Multiple Features/Explanatory Variables, Supervised Learning): Impelementation and Showing Biases and Weights ...