data.csv
number,age,blood_pressure,lung_capacity,sex,illness,weight
1,22,110,4300,M,1,79
2,23,128,4500,M,1,65
3,24,104,3900,F,0,53
4,25,112,3000,F,0,45
5,27,108,4800,M,0,80
6,28,126,3800,F,0,50
7,28,126,3800,F,1,43
8,29,104,4000,F,1,55
9,30,125,3600,F,1,47
10,31,120,3400,F,1,49
11,32,116,3600,M,1,64
12,32,124,3900,M,0,61
13,33,106,3100,F,0,48
14,33,134,2900,F,0,41
15,34,128,4100,M,1,70
16,36,128,3420,M,1,55
17,37,116,3800,M,1,70
18,37,132,4150,M,1,90
19,38,134,2700,F,0,39
20,39,116,4550,M,1,86
21,40,120,2900,F,1,50
22,42,130,3950,F,1,65
23,46,126,3100,M,0,58
24,49,140,3000,F,0,45
25,50,156,3400,M,1,60
26,53,124,3400,M,1,71
27,56,118,3470,M,1,62
28,58,144,2800,M,0,51
29,64,142,2500,F,1,40
30,65,144,2350,F,0,42
0_runme.txt
########## R: Logistic Regression
##### Run this script on your R Console

##### Background
#
# We use machine learning models to learn from training data.
# A trained model is expected to predict reliably even on new data (data different from the training data above).
# If the model over-fits the training data (including its noise and outliers),
# its prediction accuracy on new data can suffer.
# This is because the model learns the noise, outliers, and other non-meaningful points in the training data,
# and regards the entire data set as meaningful.
# In other words, the model is overly optimized so that it can explain the noise and outliers.
#
# The main causes of over-fitting are (1) too few data points, (2) too many explanatory variables, and (3) parameters (coefficients) that are too large.
#
# To avoid over-fitting, we can use regularization. This method is widely used in various machine learning models.
#
# Regularization: a way to fit a model while avoiding over-fitting
# L1 (Lasso): the penalty term is the sum of the absolute values of the model's parameters.
#             By setting the weights of certain variables to 0, it deletes unnecessary variables.
#             "Dimension compression that deletes unnecessary explanatory variables"
# L2 (Ridge): the penalty term is the sum of the squared values of the model's parameters.
#             This yields a smoother model.
#             "More accurate prediction while avoiding over-fitting"
# Under both L1 and L2 regularization, models with smaller parameters (lower dimensions) incur a smaller penalty.
# If the training data contain exceptional points such as noise and outliers,
# the model would have to inflate its dimensions to explain them,
# and the penalty for doing so discourages exactly that.
# (L1 and L2 can also be combined as a linear sum; this is elastic net regularization.)
#
#
# Regression:
# An objective variable Y is predicted from weighted explanatory variables X = {x0, x1, x2, ..., xn}:
# Predicted Y = hθ(X) = θ0*x0 + θ1*x1 + ... + θn*xn = θ^T X
#
# Logistic regression:
# In general, hθ(X) above is a continuous value with no upper or lower bound.
# To force 0 ≤ hθ(X) ≤ 1, we use the logistic function (AKA sigmoid function):
# g(z) = 1/(1 + e^(−z))
# For logistic regression, this gives
# hθ(X) = 1/(1 + e^(−θ^T X))
#
# If hθ(X) ≥ 0.5, then Y = 1
# If hθ(X) < 0.5, then Y = 0

# Set your working directory on your R Console
##### The following directory is a dummy - set it to your own directory where you saved all the r files below.
setwd('/Users/XXX/Downloads/')

#source('1_install_packages.r') # You have to run this r script only the first time.
#source('2_library.r')
source('3_logi_fun.r')
source('4_logistic_regression.r')
Source: https://qiita.com/katsu1110/items/e4ef613559f02f183af5
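To make the sigmoid and the 0.5 decision rule above concrete, here is a minimal R sketch (the name sigmoid is illustrative and is not used by the scripts below):

# g(z) = 1/(1 + e^(−z)) from the notes above
sigmoid <- function(z) 1 / (1 + exp(-z))

sigmoid(0)   # 0.5    -> the decision boundary between Y = 1 and Y = 0
sigmoid(4)   # ~0.982 -> a large θ^T X maps to a probability near 1 (Y = 1)
sigmoid(-4)  # ~0.018 -> a small θ^T X maps to a probability near 0 (Y = 0)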
1_install_packages.r
########## install packages
#install.packages("glmnet")
#install.packages('glmnet_2.0-18.tgz') # zip file downloaded from https://cran.r-project.org/web/packages/glmnet/index.html
2_library.r
########## library setting
#library('glmnet')
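The scripts keep glmnet commented out, but the L1/L2 penalties described in 0_runme.txt can be tried directly on this data set. A minimal sketch, assuming glmnet is installed and data.csv is the file shown above (column choices mirror 4_logistic_regression.r):

########## Sketch: regularized logistic regression with glmnet (not part of the original scripts)
library(glmnet)

df <- read.csv("data.csv", header = TRUE, row.names = 1)
x  <- as.matrix(df[, c("age", "blood_pressure", "weight")])  # explanatory variables
y  <- df$illness                                             # target variable (0/1)

# alpha = 1 -> L1 (lasso); alpha = 0 -> L2 (ridge); 0 < alpha < 1 -> elastic net.
# cv.glmnet picks the penalty strength lambda by cross-validation.
cv_l1 <- cv.glmnet(x, y, family = "binomial", alpha = 1)
cv_l2 <- cv.glmnet(x, y, family = "binomial", alpha = 0)

coef(cv_l1, s = "lambda.min")  # lasso may set some coefficients exactly to 0
coef(cv_l2, s = "lambda.min")  # ridge shrinks coefficients toward 0 instead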
3_logi_fun.r
logi_fun <- function(data, file, disease){
  # family=binomial for logistic regression
  ans <- glm(Y ~ ., data = data, family = binomial)
  s.ans <- summary(ans)
  coe <- s.ans$coefficient
  # exp(coefficient) gives the odds ratio (labeled RR here) and its Wald 95% CI
  RR <- exp(coe[,1])
  RRlow <- exp(coe[,1] - 1.96*coe[,2])
  RRup <- exp(coe[,1] + 1.96*coe[,2])
  N <- nrow(data)
  aic <- AIC(ans)
  result <- cbind(coe, RR, RRlow, RRup, aic, N)
  colnames(result)[6:7] <- c("RR95%CI.low", "RR95%CI.up")
  # aic and N are the same for every row; keep them on the first row only
  if(nrow(result) >= 2){
    result[2:nrow(result), 8:9] <- ""
  }
  write.table(disease, file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(matrix(c("", colnames(result)), nrow=1), file, append=T, quote=F, sep=",", row.names=F, col.names=F)
  write.table(result, file, append=T, quote=F, sep=",", row.names=T, col.names=F)
  write.table("", file, append=T, quote=F, sep=",", row.names=F, col.names=F)
}
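As a cross-check, the odds ratios and Wald 95% CIs that logi_fun assembles by hand can be reproduced with base R. A sketch, assuming a data frame dat prepared as in 4_logistic_regression.r below (confint.default uses qnorm(0.975) ≈ 1.96, so the results match up to rounding):

fit <- glm(Y ~ ., data = dat, family = binomial)
exp(coef(fit))             # odds ratios: the "RR" column written by logi_fun
exp(confint.default(fit))  # Wald 95% CIs, i.e. exp(coef +/- 1.96 * SE)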
4_logistic_regression.r
df <- read.csv("data.csv", header=T, row.names=1)
dat <- df[, c(5,1,2,6)]
# 5th column (illness): explained (target) variable
# 1 (age), 2 (blood_pressure), 6 (weight): explanatory variables
colnames(dat)[1] <- "Y"

logi_fun(dat, "results_logistic_reg.csv", "illness")
# See this csv file.
# At a significance level of 0.05, only "weight" has a p-value Pr(>|z|) below 0.05 (0.038665).
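The script stops at the coefficient table, but the 0.5 decision rule from 0_runme.txt can also be applied to the fitted model. A minimal sketch (p_hat and y_hat are illustrative names, not part of the original scripts):

fit <- glm(Y ~ ., data = dat, family = binomial)
p_hat <- predict(fit, type = "response")   # hθ(X), a probability in [0, 1]
y_hat <- ifelse(p_hat >= 0.5, 1, 0)        # Y = 1 if hθ(X) >= 0.5, else Y = 0
table(predicted = y_hat, actual = dat$Y)   # in-sample confusion matrix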
results_logistic_reg.csv
illness
,Estimate,Std. Error,z value,Pr(>|z|),RR,RR95%CI.low,RR95%CI.up,aic,N
(Intercept),-6.27037164366909,5.6269544356187,-1.11434555147231,0.265130972267466,0.00189152547909671,3.06938865463388e-08,116.566164818212,42.6982377386664,30
age,0.00171984691269398,0.0447696844915921,0.0384154351817462,0.969356454570465,1.00172132669761,0.91756786481243,1.09359280641977,,
blood_pressure,0.016973167573557,0.0445691192249947,0.380827978400756,0.703330897109487,1.01711803021436,0.932037428183773,1.10996517532894,,
weight,0.0801901371302698,0.0387817328646643,2.06772960378298,0.038665456531477,1.08349306035207,1.004186680477,1.16906271976586,,
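Note that the RR column is simply exp(Estimate); strictly speaking, exp(coefficient) from a logistic regression is an odds ratio rather than a relative risk. A quick check for weight:

exp(0.0801901371302698)  # 1.083493..., the RR value reported for weight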