0_runme.txt
########## R: L1 Regularization for Logistic Regression
##### Run this script on your R Console

##### Background
#
# We use machine learning models to learn from training data.
# A trained model is expected to predict reliably even on new data (data different from the training data above).
# If the model over-fits the training data (including its noises and outliers),
# its prediction accuracy on new data can drop.
# This is because the model learns the noises and outliers together with the meaningful data points,
# and regards the entire training data as meaningful.
# In order to explain the noises and outliers, the model becomes overly optimized to the training data.
#
# The main causes of over-fitting are: (1) too few data points, (2) too many explanatory variables,
# and (3) too large parameters (coefficients).
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
#   L1 (Lasso): the penalty term is the sum of the absolute parameter values of the model.
#     By setting the weights of certain variables to 0, it deletes unnecessary variables:
#     "dimension compression that deletes unnecessary explanatory variables".
#   L2 (Ridge): the penalty term is the sum of the squared parameter values of the model.
#     This yields a smoother model:
#     "more accurate prediction while avoiding over-fitting".
# Under both L1 and L2 regularization, models with fewer dimensions incur a smaller penalty.
# If the training data contain exceptional points such as noises and outliers,
# a model must increase its dimensions to explain the data including such exceptional points,
# while trying not to be penalized for the increased dimensions.
# (L1 and L2 can also be used simultaneously as a linear sum; this is elastic net regularization.)
#
# Regression:
#   An objective variable Y is predicted from weighted explanatory variables X = {x0, x1, x2, ..., xn}:
#   Predicted Y = hθ(X) = θ0*x0 + θ1*x1 + ... + θn*xn = θ^T X
#
# Logistic regression:
#   In general, hθ(X) above is a continuous value with no upper or lower bound.
#   To force 0 ≤ hθ(X) ≤ 1, we use the logistic function (AKA sigmoid function):
#     g(z) = 1/(1 + e^(-z))
#   For logistic regression:
#     hθ(X) = 1/(1 + e^(-θ^T X))
#
#   If hθ(X) ≥ 0.5, then Y = 1
#   If hθ(X) <  0.5, then Y = 0
#
# (These formulas are shown as a short R sketch right after this script.)

# Set your working directory on your R Console.
# The following directory is a dummy - set it to your own directory where you saved all the R files below.
setwd('/Users/yoshi/Dropbox/Google Drive/Coding/R/regularization_and_logistic_regression/')

# source('1_install_packages.r') # You have to run this R script only the first time.
source('2_library.r')
source('3_quick_start.r')
source('4_logistic_regression_binominal.r')
source('5_logistic_regression_multinominal.r')
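The logistic-regression formulas above translate directly into a few lines of base R. The following is a minimal, self-contained sketch (the names sigmoid, theta, x_example, and alpha are illustrative only and are not used by the scripts below):

# Sketch of the background formulas (illustrative names, standalone)
sigmoid <- function(z) 1 / (1 + exp(-z))      # g(z) = 1/(1 + e^(-z))

theta     <- c(0.5, -1.2, 2.0)                # example parameters θ
x_example <- c(1.0,  0.3, 0.7)                # example input X (x0 = 1 for the intercept)
h         <- sigmoid(sum(theta * x_example))  # hθ(X) = 1/(1 + e^(-θ^T X))
y_hat     <- ifelse(h >= 0.5, 1, 0)           # decision rule: Y = 1 if hθ(X) ≥ 0.5

l1_penalty <- sum(abs(theta))                 # L1 (Lasso): sum of absolute parameter values
l2_penalty <- sum(theta^2)                    # L2 (Ridge): sum of squared parameter values
alpha <- 0.5                                  # elastic net: a linear sum of L1 and L2, as described above
enet_penalty <- alpha * l1_penalty + (1 - alpha) * l2_penalty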
Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5
1_install_packages.r
########## install packages

install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz') # package file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html

# Reference
# http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html

# Download files
#
# QuickStartExample.RData
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData
#
# BinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
#
# MultinomialExample.RData
# https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData
2_library.r
########## library setting

library('glmnet')
3_quick_start.r
# Download QuickStartExample.RData from the following GitHub site:
# https://github.com/cran/glmnet/blob/master/data/QuickStartExample.RData
load("QuickStartExample.RData")

fit = glmnet(x, y)

##### Start: saving as a png file
png("fig_3_quick_start_1.png")
# draw a figure
plot(fit)
# Each curve corresponds to a variable.
# It shows the path of the variable's coefficient against the ℓ1-norm of the whole coefficient vector as λ varies.
# (The tuning parameter λ controls the overall strength of the penalty.)
dev.off()
##### End: saving as a png file

# A summary of the glmnet path at each step is displayed if we just enter the object name or use the print function:
print(fit)

coef(fit, s = 0.1)

nx = matrix(rnorm(10*20), 10, 20)
predict(fit, newx = nx, s = c(0.1, 0.05))

cvfit = cv.glmnet(x, y)

##### Start: saving as a png file
png("fig_3_quick_start_2.png")
plot(cvfit)
dev.off()
##### End: saving as a png file

cvfit$lambda.min
coef(cvfit, s = "lambda.min")
predict(cvfit, newx = x[1:5,], s = "lambda.min")
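The background in 0_runme.txt describes L1 as "dimension compression": some weights are driven exactly to 0. One way to see this on the quick-start example is to count how many coefficients survive at lambda.min. A minimal sketch, assuming cvfit from the script above is still in the workspace:

# Sketch: count how many coefficients L1 kept vs. zeroed out at lambda.min
# (assumes cvfit from 3_quick_start.r is in the workspace)
b <- as.matrix(coef(cvfit, s = "lambda.min"))  # intercept + one coefficient per variable
nonzero <- which(b[-1, 1] != 0)                # surviving explanatory variables
cat("kept:", length(nonzero), "of", nrow(b) - 1, "variables\n")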
4_logistic_regression_binominal.r
# Download BinomialExample.RData from https://github.com/cran/glmnet/blob/master/data/BinomialExample.RData
load("BinomialExample.RData")

fit = glmnet(x, y, family = "binomial")

##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_1.png")
plot(fit, xvar = "dev", label = TRUE)
dev.off()
##### End: saving as a png file

predict(fit, newx = x[1:5,], type = "class", s = c(0.05, 0.01))

cvfit = cv.glmnet(x, y, family = "binomial", type.measure = "class")

##### Start: saving as a png file
png("fig_4_logistic_regression_binominal_2.png")
plot(cvfit)
dev.off()
##### End: saving as a png file

cvfit$lambda.min
cvfit$lambda.1se
coef(cvfit, s = "lambda.min")
predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
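A quick sanity check for the binomial model is the in-sample misclassification rate at lambda.min. Note this is an optimistic estimate (the model has already seen these points), unlike the cross-validated error plotted above. A minimal sketch, assuming x, y, and cvfit from the script above are in the workspace:

# Sketch: in-sample misclassification rate at lambda.min
# (assumes x, y, cvfit from 4_logistic_regression_binominal.r are in the workspace)
pred <- predict(cvfit, newx = x, s = "lambda.min", type = "class")
mean(pred != y)   # fraction of training observations mis-classified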
5_logistic_regression_multinominal.r
# Download MultinomialExample.RData from https://github.com/cran/glmnet/blob/master/data/MultinomialExample.RData
load("MultinomialExample.RData")

fit = glmnet(x, y, family = "multinomial", type.multinomial = "grouped")

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_1.png")
plot(fit, xvar = "lambda", label = TRUE, type.coef = "2norm")
dev.off()
##### End: saving as a png file

# Note: parallel = TRUE only helps if a parallel (foreach) backend is registered first;
# see the sketch after this script.
cvfit = cv.glmnet(x, y, family = "multinomial", type.multinomial = "grouped", parallel = TRUE)

##### Start: saving as a png file
png("fig_5_logistic_regression_multinominal_2.png")
plot(cvfit)
dev.off()
##### End: saving as a png file

predict(cvfit, newx = x[1:10,], s = "lambda.min", type = "class")
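cv.glmnet() with parallel = TRUE requires a registered foreach backend; without one it runs sequentially and warns. A minimal sketch using the doParallel package (one choice of backend among several; it must be installed separately):

# Sketch: register a parallel backend before cv.glmnet(..., parallel = TRUE)
# (assumes doParallel is installed: install.packages("doParallel"))
library(doParallel)
cl <- makeCluster(2)        # 2 worker processes; adjust to your machine
registerDoParallel(cl)
cvfit = cv.glmnet(x, y, family = "multinomial", type.multinomial = "grouped", parallel = TRUE)
stopCluster(cl)             # release the workers when done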
Figures (generated by the scripts above): fig_3_quick_start_1.png, fig_3_quick_start_2.png, fig_4_logistic_regression_binominal_1.png, fig_4_logistic_regression_binominal_2.png, fig_5_logistic_regression_multinominal_1.png, fig_5_logistic_regression_multinominal_2.png