The Financial Journal (Global): Multiple Linear Regression in Python

Multiple Linear Regression in Python

**0_MacOS_Python_setup.txt**
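# Install the required packages from the macOS Terminal.
# 1. pandas
#pip3 install -U pandas
# 2. matplotlib (also provides: from mpl_toolkits.mplot3d import Axes3D)
#pip3 install -U matplotlib
# 3. scikit-learn (sklearn)
#pip3 install -U scikit-learn
# 4. statsmodels
#pip3 install -U statsmodels
# 5. tkinter (note: tkinter is part of the Python standard library and normally cannot be installed with pip; if it is missing, install it through your Python distribution instead)
#pip3 install -U tkinter
# 6. NumPy
#pip3 install -U numpy
# 7. seaborn
#pip3 install -U seaborn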

1_MacOS_Terminal.txt

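########## Open Terminal on macOS and run the commands below
### Replace YOUR_WORKING_DIRECTORY with the folder that contains lrm01.py, training.csv, and test.csv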
cd "YOUR_WORKING_DIRECTORY"

python3 lrm01.py training.csv test.csv

Data files

training.csv

Year,Month,x1,x2,y
2017,12,2.75,5.3,1464.00
2017,11,2.5,5.3,1394.00
2017,10,2.5,5.3,1357.00
2017,9,2.5,5.3,1293.00
2017,8,2.5,5.4,1256.00
2017,7,2.5,5.6,1254.00
2017,6,2.5,5.5,1234.00
2017,5,2.25,5.5,1195.00
2017,4,2.25,5.5,1159.00
2017,3,2.25,5.6,1167.00
2017,2,2,5.7,1130.00
2017,1,2,5.9,1075.00
2016,12,2,6,1047.00
2016,11,1.75,5.9,965.00
2016,10,1.75,5.8,943.00
2016,9,1.75,6.1,958.00
2016,8,1.75,6.2,971.00
2016,7,1.75,6.1,949.00
2016,6,1.75,6.1,884.00
2016,5,1.75,6.1,866.00
2016,4,1.75,5.9,876.00
2016,3,1.75,6.2,822.00
2016,2,1.75,6.2,704.00
2016,1,1.75,6.1,719.00

test.csv

Year,Month,x1,x2,y
2019,12,3.75,4.5,1831.75
2019,11,3.5,4.5,1744.17
2019,10,3.5,4.7,1697.88
2019,9,3.5,4.7,1786.3
2019,8,3.25,4.8,1735.18
2019,7,3.25,4.8,1732.42
2019,6,3.25,4.7,1704.79
2019,5,3.25,4.7,1650.91
2019,4,3.25,4.7,1601.18
2019,3,3.25,4.8,1612.23
2019,2,3,4.9,1561.11
2019,1,3,4.9,1485.13
2018,12,3,5,1446.45
2018,11,3,4.9,1580.77
2018,10,3,4.8,1544.73
2018,9,3,5.1,1569.3
2018,8,3,5.2,1590.6
2018,7,3,5.1,1554.56
2018,6,3,5.3,1586.29
2018,5,3,5.3,1553.99
2018,4,2.75,5.1,1571.93
2018,3,2.75,5.4,1475.03
2018,2,2.75,5.4,1404.79
2018,1,2.75,5.3,1434.72

Python files

**lrm01.py**
"""Multiple linear regression of y on x1 and x2 (lrm01.py).

Run this script from a terminal as follows:

    python3 lrm01.py training.csv test.csv

Both CSV files must contain the columns ``x1``, ``x2`` and ``y``:

* ``y``  (dependent variable)     : e.g., Stock_Index_Price
* ``x1`` (independent variable 1) : e.g., Interest_Rate
* ``x2`` (independent variable 2) : e.g., Unemployment_Rate

The script
  1. draws y-vs-x1 and y-vs-x2 scatter plots of the training data
     (checking for linearity),
  2. fits the model with scikit-learn and with statsmodels,
  3. writes coef.txt, X.csv, YYPRED.csv, resultstmp.csv and RESULTS.csv,
  4. draws a 3D plot of the training data, the in-sample predictions and
     the test data.

Reference:
    Example of Multiple Linear Regression in Python
    https://datatofish.com/multiple-linear-regression-python/
"""

import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
# Kept from the original script: importing Axes3D registers the "3d"
# projection on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn import linear_model

FEATURES = ['x1', 'x2']
TARGET = 'y'
USAGE = "Usage: python3 lrm01.py training.csv test.csv"


def load_data(path: str) -> pd.DataFrame:
    """Read a CSV file and check that it has the x1, x2 and y columns.

    The default integer index is used on purpose: taking the first column
    (Year) as the index would give many rows the same label, which makes
    the column-wise ``pd.concat`` calls further down fragile.

    Raises:
        ValueError: if one of the required columns is missing.
    """
    frame = pd.read_csv(path)
    missing = [col for col in FEATURES + [TARGET] if col not in frame.columns]
    if missing:
        raise ValueError(path + " is missing column(s): " + ", ".join(missing))
    return frame


def plot_scatter(train: pd.DataFrame, feature: str, color: str,
                 filename: str) -> None:
    """Save and show a scatter plot of y against one feature."""
    # One explicit figure per plot, so consecutive plots never end up on
    # the same axes when plt.show() does not block.
    fig, ax = plt.subplots()
    ax.scatter(train[feature], train[TARGET], color=color,
               label="Training Data " + feature)
    ax.legend()
    ax.set_title('y vs ' + feature, fontsize=14)
    ax.set_xlabel(feature, fontsize=14)
    ax.set_ylabel('y', fontsize=14)
    ax.grid(True)
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


def fit_sklearn(train: pd.DataFrame) -> linear_model.LinearRegression:
    """Fit y ~ x1 + x2 with scikit-learn and return the fitted estimator."""
    regr = linear_model.LinearRegression()
    regr.fit(train[FEATURES], train[TARGET])
    return regr


def report_coefficients(regr: linear_model.LinearRegression,
                        path: str = 'coef.txt') -> None:
    """Print the intercept and coefficients and write them to *path*."""
    print('\n')
    print('==============================')
    print('Prediction model: \n')
    print('Intercept: \n', regr.intercept_)
    print('Coefficients: \n', regr.coef_)
    with open(path, 'w', encoding='utf-8') as f:
        print('Intercept: ' + str(regr.intercept_), file=f)
        print('Coefficients:' + str(regr.coef_), file=f)


def fit_statsmodels(train: pd.DataFrame):
    """Fit the same model with statsmodels OLS.

    Returns:
        (model, design, y_pred) where *design* is the training data with
        the columns const (=1), x1 and x2, and *y_pred* holds the
        in-sample predictions as a Series named ``y_pred``.
    """
    design = sm.add_constant(train[FEATURES])  # adding a constant
    model = sm.OLS(train[TARGET], design).fit()
    y_pred = model.predict(design).rename('y_pred')
    return model, design, y_pred


def export_results(train: pd.DataFrame, design: pd.DataFrame,
                   y_pred: pd.Series,
                   regr: linear_model.LinearRegression) -> None:
    """Write X.csv, YYPRED.csv, resultstmp.csv and RESULTS.csv.

    RESULTS.csv has the columns
        y, y_pred, const, x1, x2, coef_const, coef_x1, coef_x2
    where the last three repeat the fitted intercept and the coefficients
    of x1 and x2 on every row.

    For instance, with the spreadsheet columns A..H of RESULTS.csv:
        y_pred (B2) = F2*C2 + G2*D2 + H2*E2
        y_pred (B2) = coef_const(F2)*const(C2) + coef_x1(G2)*x1(D2)
                      + coef_x2(H2)*x2(E2)
        1422.862389 = 1798.403978*1 + 345.540087*2.75 + (-250.1465714)*5.3
    """
    # X: raw training data that has const=1, x1, and x2.
    design.to_csv("X.csv", header=True, index=False)

    # y: raw data; y_pred: predicted from the raw training x1 and x2 plus
    # the intercept.
    yypred = pd.concat([train[TARGET], y_pred], axis=1)
    yypred.to_csv("YYPRED.csv", header=True, index=False)

    # y, y_pred, const, x1, x2; everything except y_pred and const=1 is
    # raw training data.
    resultstmp = pd.concat([yypred, design], axis=1)
    resultstmp.to_csv("resultstmp.csv", header=True, index=False)

    results = resultstmp.assign(
        coef_const=regr.intercept_,
        coef_x1=regr.coef_[0],
        coef_x2=regr.coef_[1],
    )
    results.to_csv("RESULTS.csv", header=True, index=False)


def plot_3d(train: pd.DataFrame, test: pd.DataFrame, y_pred: pd.Series,
            regr: linear_model.LinearRegression,
            filename: str = "Figure_3_y_x1_x2_training_predicted_and_test_data.png",
            ) -> None:
    """Save and show a 3D plot of training, predicted and test data."""
    sns.set_style("darkgrid")

    fig = plt.figure()
    # Axes3D(fig) no longer adds the axes to the figure on current
    # matplotlib, which leaves the plot empty; add_subplot always does.
    ax = fig.add_subplot(111, projection="3d")

    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
    ax.set_zlabel("Y")

    x1 = train['x1'].to_numpy()
    x2 = train['x2'].to_numpy()

    # linestyle='None' means no line: only the markers are drawn.
    ax.plot(x1, x2, train[TARGET].to_numpy(),
            marker="o", linestyle='None', label='Training Data')
    ax.plot(x1, x2, y_pred.to_numpy(),
            marker="*", linestyle='None',
            label='Predictions by Training Data')
    ax.plot(test['x1'].to_numpy(), test['x2'].to_numpy(),
            test[TARGET].to_numpy(),
            marker="+", linestyle='None', label='Test Data')
    ax.legend()

    # Show the prediction equation on the plot, anchored at the corner
    # with the smallest coordinates (as in the original script, the x1
    # anchor only looks at the training data).
    minx = train['x1'].min()
    miny = min(train['x2'].min(), test['x2'].min())
    minz = min(train[TARGET].min(), y_pred.min(), test[TARGET].min())
    equation = (
        'y_pred = ' + str(np.round(regr.intercept_, decimals=2))
        + ' + (' + str(np.round(regr.coef_[0], decimals=2))
        + '* x1) + (' + str(np.round(regr.coef_[1], decimals=2))
        + '* x2)'
    )
    ax.text(minx, miny, minz, equation, color='black')

    fig.savefig(filename)
    plt.show()
    plt.close(fig)


def main(argv=None) -> None:
    """Run the whole analysis.

    Args:
        argv: command-line arguments without the script name; defaults to
            ``sys.argv[1:]``. The first is the training CSV, the second
            the test CSV. Extra arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(USAGE)
    training_csv, test_csv = args[0], args[1]

    train = load_data(training_csv)
    test = load_data(test_csv)

    ##### Checking for Linearity
    plot_scatter(train, 'x1', 'red', "Figure_1_y_x1_training.png")
    plot_scatter(train, 'x2', 'green', "Figure_2_y_x2_training.png")

    ##### Performing the Multiple Linear Regression
    ### with sklearn
    regr = fit_sklearn(train)
    report_coefficients(regr)

    ### with statsmodels
    model, design, y_pred = fit_statsmodels(train)
    export_results(train, design, y_pred, regr)
    print(model.summary())

    ##### 3D plot
    plot_3d(train, test, y_pred, regr)


# Sample output, as captured in the original article. The "Predicted y"
# line came from an sklearn prediction for x1=2.75, x2=5.3; the dependent
# variable appears to have been named Stock_Index_Price in that run.
#
# Intercept:
#  1798.4039776258546
# Coefficients:
#  [ 345.54008701 -250.14657137]
# Predicted y:
#  [1422.86238865]
#
#                             OLS Regression Results
# ==============================================================================
# Dep. Variable:      Stock_Index_Price   R-squared:                       0.898
# Model:                            OLS   Adj. R-squared:                  0.888
# Method:                 Least Squares   F-statistic:                     92.07
# Date:                Fri, 29 May 2020   Prob (F-statistic):           4.04e-11
# Time:                        08:43:16   Log-Likelihood:                -134.61
# No. Observations:                  24   AIC:                             275.2
# Df Residuals:                      21   BIC:                             278.8
# Df Model:                           2
# Covariance Type:            nonrobust
# ==============================================================================
#                  coef    std err          t      P>|t|      [0.025      0.975]
# ------------------------------------------------------------------------------
# const       1798.4040    899.248      2.000      0.059     -71.685    3668.493
# x1           345.5401    111.367      3.103      0.005     113.940     577.140
# x2          -250.1466    117.950     -2.121      0.046    -495.437      -4.856
# ==============================================================================
# Omnibus:                        2.691   Durbin-Watson:                   0.530
# Prob(Omnibus):                  0.260   Jarque-Bera (JB):                1.551
# Skew:                          -0.612   Prob(JB):                        0.461
# Kurtosis:                       3.226   Cond. No.                         394.
# ==============================================================================
#
# Warnings:
# [1] Standard Errors assume that the covariance matrix of the errors is
#     correctly specified.


if __name__ == "__main__":
    main()