The data can be downloaded from the GitHub repository:
This dataset is all about South West Airlines flights, including information about the fare, duration, and type of delays between Quarter 3 of 1996 and Quarter 2 of 1997. The variables in the table above are believed to be important in predicting FARE (outcome variable). We are required to study the effects of the presence or absence of SW Airlines on FARE
library(fastDummies)
library(MASS)
library(AICcmodavg)
#Importing the csv file to a books DataFrame
airfare <- read.csv("Route-Airfares.csv")
#Viewing the DataFrame
head(airfare,n=6)
#To begin our analysis, we are removing S_CITY, E_CITY from the DataFrame
airfare <- airfare[,-c(1:2)]
#Viewing the DataFrame
head(airfare,n=6)
# One Hot Encoding on Categorical Variables
dummy_airfare <- dummy_cols(airfare[,c(3:4,10:11)],remove_first_dummy=TRUE)
dummy_airfare <- dummy_airfare[,-c(1:4)]
airfare <- cbind(airfare,dummy_airfare)
airfare <- airfare[,-c(3:4,10:11)]
head(airfare,n=5)
# Use stepwise regression
linear_model <- lm(FARE~.,data=airfare)
options(scipen=999)
summary(linear_model)
step_model <- stepAIC(linear_model,direction="backward",trace=FALSE)
summary(step_model)
Call:
lm(formula = FARE ~ ., data = airfare)
Residuals:
Min 1Q Median 3Q Max
-106.329 -22.707 -2.329 21.135 128.694
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.6994118001 27.3794275438 0.464 0.642931
COUPON 3.7548860460 12.1940745200 0.308 0.758241
NEW -2.3955300091 1.8754235621 -1.277 0.201962
HI 0.0084257891 0.0009900663 8.510 < 0.0000000000000002 ***
S_INCOME 0.0012066778 0.0005171071 2.334 0.019938 *
E_INCOME 0.0013742730 0.0003749187 3.666 0.000268 ***
S_POP 0.0000034009 0.0000006523 5.213 0.0000002525 ***
E_POP 0.0000043631 0.0000007547 5.781 0.0000000117 ***
DISTANCE 0.0749842554 0.0035795488 20.948 < 0.0000000000000002 ***
PAX -0.0008709429 0.0001459072 -5.969 0.0000000040 ***
VACATION_Yes -35.6444414923 3.6170502952 -9.855 < 0.0000000000000002 ***
SW_Yes -40.9696003501 3.7437293472 -10.944 < 0.0000000000000002 ***
SLOT_Free -16.2447682308 3.8468797056 -4.223 0.0000277170 ***
GATE_Free -20.5792299506 4.0015843898 -5.143 0.0000003629 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 35.47 on 624 degrees of freedom
Multiple R-squared: 0.7868, Adjusted R-squared: 0.7823
F-statistic: 177.1 on 13 and 624 DF, p-value: < 0.00000000000000022
Call:
lm(formula = FARE ~ HI + S_INCOME + E_INCOME + S_POP + E_POP +
DISTANCE + PAX + VACATION_Yes + SW_Yes + SLOT_Free + GATE_Free,
data = airfare)
Residuals:
Min 1Q Median 3Q Max
-106.097 -22.633 -1.566 20.704 128.504
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.7597724739 20.9214529374 0.610 0.542156
HI 0.0082581922 0.0009464056 8.726 < 0.0000000000000002 ***
S_INCOME 0.0011770289 0.0005141766 2.289 0.022403 *
E_INCOME 0.0013375253 0.0003733298 3.583 0.000366 ***
S_POP 0.0000034108 0.0000006504 5.244 0.0000002147768 ***
E_POP 0.0000043848 0.0000007532 5.822 0.0000000093095 ***
DISTANCE 0.0754922692 0.0025487565 29.619 < 0.0000000000000002 ***
PAX -0.0008898289 0.0001344086 -6.620 0.0000000000771 ***
VACATION_Yes -35.5596967094 3.6086880137 -9.854 < 0.0000000000000002 ***
SW_Yes -40.9960384252 3.7271479312 -10.999 < 0.0000000000000002 ***
SLOT_Free -16.1397070979 3.8249647249 -4.220 0.0000280988352 ***
GATE_Free -20.6002972047 3.9990619190 -5.151 0.0000003471197 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 35.46 on 626 degrees of freedom
Multiple R-squared: 0.7862, Adjusted R-squared: 0.7824
F-statistic: 209.2 on 11 and 626 DF, p-value: < 0.00000000000000022
#Apply log transformation to numerical variables and repeating stepwise regression
log_airfare <- log(airfare[,c(1:10)])
# Remove rows with Infinity
log_airfare <- log_airfare[!is.infinite(rowSums(log_airfare)),]
log_linear_model <- lm(FARE~.,data=log_airfare)
summary(log_linear_model)
options(scipen=999)
log_step_model <- stepAIC(linear_model,direction="backward",trace=FALSE)
summary(log_step_model)
Call:
lm(formula = FARE ~ ., data = log_airfare)
Residuals:
Min 1Q Median 3Q Max
-0.84121 -0.16409 0.02604 0.18574 0.83595
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -19.46603 1.24889 -15.587 < 0.0000000000000002 ***
COUPON 0.28555 0.14216 2.009 0.045 *
NEW 0.01484 0.05184 0.286 0.775
HI 0.16635 0.03395 4.900 0.00000123611919243 ***
S_INCOME 1.10259 0.10096 10.921 < 0.0000000000000002 ***
E_INCOME 0.62588 0.07762 8.063 0.00000000000000412 ***
S_POP 0.14108 0.01460 9.661 < 0.0000000000000002 ***
E_POP 0.19450 0.01404 13.858 < 0.0000000000000002 ***
DISTANCE 0.34451 0.02548 13.522 < 0.0000000000000002 ***
PAX -0.21147 0.02226 -9.499 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2707 on 594 degrees of freedom
Multiple R-squared: 0.7127, Adjusted R-squared: 0.7083
F-statistic: 163.7 on 9 and 594 DF, p-value: < 0.00000000000000022
Call:
lm(formula = FARE ~ HI + S_INCOME + E_INCOME + S_POP + E_POP +
DISTANCE + PAX + VACATION_Yes + SW_Yes + SLOT_Free + GATE_Free,
data = airfare)
Residuals:
Min 1Q Median 3Q Max
-106.097 -22.633 -1.566 20.704 128.504
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.7597724739 20.9214529374 0.610 0.542156
HI 0.0082581922 0.0009464056 8.726 < 0.0000000000000002 ***
S_INCOME 0.0011770289 0.0005141766 2.289 0.022403 *
E_INCOME 0.0013375253 0.0003733298 3.583 0.000366 ***
S_POP 0.0000034108 0.0000006504 5.244 0.0000002147768 ***
E_POP 0.0000043848 0.0000007532 5.822 0.0000000093095 ***
DISTANCE 0.0754922692 0.0025487565 29.619 < 0.0000000000000002 ***
PAX -0.0008898289 0.0001344086 -6.620 0.0000000000771 ***
VACATION_Yes -35.5596967094 3.6086880137 -9.854 < 0.0000000000000002 ***
SW_Yes -40.9960384252 3.7271479312 -10.999 < 0.0000000000000002 ***
SLOT_Free -16.1397070979 3.8249647249 -4.220 0.0000280988352 ***
GATE_Free -20.6002972047 3.9990619190 -5.151 0.0000003471197 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 35.46 on 626 degrees of freedom
Multiple R-squared: 0.7862, Adjusted R-squared: 0.7824
F-statistic: 209.2 on 11 and 626 DF, p-value: < 0.00000000000000022
library(AICcmodavg)
models <- list(linear_model,step_model,log_linear_model,log_step_model)
model.names <- c("Linear Regressor","Stepwise Regressor","Linear Regressor with Log","Stepwise Regressor with Log")
aictab(cand.set=models,modnames=model.names)
Model selection based on AICc:
K AICc Delta_AICc AICcWt Cum.Wt LL
Linear Regressor with Log 11 147.77 0.00 1 1 -62.66
Stepwise Regressor 13 6378.39 6230.62 0 1 -3175.90
Stepwise Regressor with Log 13 6378.39 6230.62 0 1 -3175.90
Linear Regressor 15 6380.77 6233.00 0 1 -3175.00
K: the value of K shows the number of parameters required by the model. (Default value of 2 shows one parameter)
AICc: information score of the model. c indicates that the AIC value has been corrected for small sample sizes. The smaller the AICc value, the better the model
Delta_AICc: Difference between the best model and the model being compared. Lower value is preferred
AICcWt: Proportion of total amount of predictive power explained by full set of models contained in the model being assessed. Value ranges between 0 and 1
Cum.Wt: Sum of the AICc weights. Best score is close to 100%
LL: Log Likelihood describes the model, given the data. AIC is calculated from LL and K
summary(log_linear_model)
Call:
lm(formula = FARE ~ ., data = log_airfare)
Residuals:
Min 1Q Median 3Q Max
-0.84121 -0.16409 0.02604 0.18574 0.83595
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -19.46603 1.24889 -15.587 < 0.0000000000000002 ***
COUPON 0.28555 0.14216 2.009 0.045 *
NEW 0.01484 0.05184 0.286 0.775
HI 0.16635 0.03395 4.900 0.00000123611919243 ***
S_INCOME 1.10259 0.10096 10.921 < 0.0000000000000002 ***
E_INCOME 0.62588 0.07762 8.063 0.00000000000000412 ***
S_POP 0.14108 0.01460 9.661 < 0.0000000000000002 ***
E_POP 0.19450 0.01404 13.858 < 0.0000000000000002 ***
DISTANCE 0.34451 0.02548 13.522 < 0.0000000000000002 ***
PAX -0.21147 0.02226 -9.499 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2707 on 594 degrees of freedom
Multiple R-squared: 0.7127, Adjusted R-squared: 0.7083
F-statistic: 163.7 on 9 and 594 DF, p-value: < 0.00000000000000022
# Partition DataFrame into 65% training set and 35% validation/test set
set.seed(101)
train_prt <- sample.split(log_airfare,SplitRatio=0.65)
train_df <- subset(log_airfare,train_prt==TRUE)
test_df <- subset(log_airfare,train_prt==FALSE)
# Fit the best model with training data
log_linear_model <- lm(FARE~.,data=train_df)
# Predict the result
preds <- predict(log_linear_model,test_df)
# Display the first 6 rows of predicted and actual values and residuals
err_df <- data.frame(Actual=test_df$FARE,Prediction=preds,Residual=test_df$FARE-preds)
head(err_df,n=6)
# Histogram for Residuals of the Prediction model
hist(err_df$Residual,c="Blue")
nSW_set <- data.frame(COUPON=1.202,NEW=3,HI=4442.141,S_INCOME=28760,E_INCOME=27664,S_POP=4557004,E_POP=3195503,DISTANCE=1976,PAX=12782,VACATION_Yes=0,SW_Yes=0,SLOT_Free=1,GATE_Free=1)
head(nSW_set,n=6)
# Predict the average FARE price for the sample above
sprintf("Average FARE price without SW: %.2f",predict(log_linear_model,nSW_set))
[1] "Average FARE price without SW: 1260547.47"
# Predict the same if SouthWest Airlines were flying this route
SW_set <- data.frame(COUPON=1.202,NEW=3,HI=4442.141,S_INCOME=28760,E_INCOME=27664,S_POP=4557004,E_POP=3195503,DISTANCE=1976,PAX=12782,VACATION_Yes=0,SW_Yes=1,SLOT_Free=1,GATE_Free=1)
sprintf("Reduction in average FARE price with SW: %.2f",predict(log_linear_model,nSW_set)-predict(log_linear_model,SW_set))
[1] "Reduction in average FARE price with SW: 0.00"