#==========
#Set-Up Enviroment
#===

#Set Working Directiory
setwd("~/Documents/Classes/Year 5/Fall 2019/Predictive Analytics/Final Proj")

#Load Libraries
library(readr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
#Import Data Set
census <- read_csv("adult-1.csv")
## Parsed with column specification:
## cols(
##   age = col_double(),
##   workclass = col_character(),
##   fnlwgt = col_double(),
##   education = col_character(),
##   `educational-num` = col_double(),
##   `marital-status` = col_character(),
##   occupation = col_character(),
##   relationship = col_character(),
##   race = col_character(),
##   gender = col_character(),
##   `capital-gain` = col_double(),
##   `capital-loss` = col_double(),
##   `hours-per-week` = col_double(),
##   `native-country` = col_character(),
##   income = col_character()
## )
#Review Data
summary(census)
##       age         workclass             fnlwgt         education        
##  Min.   :17.00   Length:48842       Min.   :  12285   Length:48842      
##  1st Qu.:28.00   Class :character   1st Qu.: 117550   Class :character  
##  Median :37.00   Mode  :character   Median : 178144   Mode  :character  
##  Mean   :38.64                      Mean   : 189664                     
##  3rd Qu.:48.00                      3rd Qu.: 237642                     
##  Max.   :90.00                      Max.   :1490400                     
##  educational-num marital-status      occupation        relationship      
##  Min.   : 1.00   Length:48842       Length:48842       Length:48842      
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :10.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :10.08                                                           
##  3rd Qu.:12.00                                                           
##  Max.   :16.00                                                           
##      race              gender           capital-gain    capital-loss   
##  Length:48842       Length:48842       Min.   :    0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:    0   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Median :    0   Median :   0.0  
##                                        Mean   : 1079   Mean   :  87.5  
##                                        3rd Qu.:    0   3rd Qu.:   0.0  
##                                        Max.   :99999   Max.   :4356.0  
##  hours-per-week  native-country        income         
##  Min.   : 1.00   Length:48842       Length:48842      
##  1st Qu.:40.00   Class :character   Class :character  
##  Median :40.00   Mode  :character   Mode  :character  
##  Mean   :40.42                                        
##  3rd Qu.:45.00                                        
##  Max.   :99.00
#==========
#Clean The Data & Select Trainig Variables
#===

#Remove vairable fnl-weight
census <- census[,-3]
#Remove variable eduation-String (education-Number will remain)
census <- census[,-3]
#Remove variable occupation
census <- census[,-5]

#Remove rows where native-country is not equal to United States
census <- census[grepl("United-States", census$`native-country`),]
#Remove the now irrelevent native-country variable
census <- census[,-11]

#Consolidate Work-Class Variabe: Self-Employed
census$workclass <- gsub('^Self-emp-inc', 'Self-Employed', census$workclass)
census$workclass <- gsub('^Self-emp-not-inc', 'Self-Employed', census$workclass)

#Rename Work-Calass Variable: ? to Unknown
census$workclass <- gsub('?', 'Unknown', census$workclass, fixed = TRUE)

#Set income classification
census$income <- as.factor(census$income)

#Review Data
summary(census)
##       age        workclass         educational-num marital-status    
##  Min.   :17.0   Length:43832       Min.   : 1.00   Length:43832      
##  1st Qu.:28.0   Class :character   1st Qu.: 9.00   Class :character  
##  Median :37.0   Mode  :character   Median :10.00   Mode  :character  
##  Mean   :38.7                      Mean   :10.17                     
##  3rd Qu.:48.0                      3rd Qu.:12.00                     
##  Max.   :90.0                      Max.   :16.00                     
##  relationship           race              gender           capital-gain  
##  Length:43832       Length:43832       Length:43832       Min.   :    0  
##  Class :character   Class :character   Class :character   1st Qu.:    0  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0  
##                                                           Mean   : 1090  
##                                                           3rd Qu.:    0  
##                                                           Max.   :99999  
##   capital-loss     hours-per-week    income     
##  Min.   :   0.00   Min.   : 1.00   <=50K:33138  
##  1st Qu.:   0.00   1st Qu.:40.00   >50K :10694  
##  Median :   0.00   Median :40.00                
##  Mean   :  88.79   Mean   :40.44                
##  3rd Qu.:   0.00   3rd Qu.:45.00                
##  Max.   :4356.00   Max.   :99.00
#==========
#Divide Data into training & testing
#Set seed for randomization
set.seed(101)

#Determine the size of the training set
trainingSize <- round(.9 * dim(census)[1])

#Create training and testing set
trainingSet <- census[1:trainingSize, ]
testingSet <- census[-(1:trainingSize), ]




#==========
#Create the models
# M1 : income = a*age + b*educational-num + c*marital-status + d*race + e*gender
m1 <- glm(income ~ age + `educational-num` + `marital-status` + race + gender, data = trainingSet, family = binomial('logit'))

# M2:income = a*age + b*educational-num + c*marital-status + d*race + e*gender + f*workclass + g*capital-gain + h*capital-loss + d*hours-per-week
m2 <- glm(income ~ age + `educational-num` + `marital-status` + race + gender + `workclass` + `capital-gain` + `capital-loss` + `hours-per-week`, data = trainingSet, family = binomial('logit'))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Summary of M1
print(m1)
## 
## Call:  glm(formula = income ~ age + `educational-num` + `marital-status` + 
##     race + gender, family = binomial("logit"), data = trainingSet)
## 
## Coefficients:
##                           (Intercept)  
##                             -8.109155  
##                                   age  
##                              0.022612  
##                     `educational-num`  
##                              0.403040  
##     `marital-status`Married-AF-spouse  
##                              1.910756  
##    `marital-status`Married-civ-spouse  
##                              2.055712  
## `marital-status`Married-spouse-absent  
##                             -0.005905  
##         `marital-status`Never-married  
##                             -0.607624  
##             `marital-status`Separated  
##                             -0.078741  
##               `marital-status`Widowed  
##                             -0.249626  
##                raceAsian-Pac-Islander  
##                              0.748683  
##                             raceBlack  
##                              0.179416  
##                             raceOther  
##                             -0.006941  
##                             raceWhite  
##                              0.431557  
##                            genderMale  
##                              0.293403  
## 
## Degrees of Freedom: 39448 Total (i.e. Null);  39435 Residual
## Null Deviance:       43770 
## Residual Deviance: 29960     AIC: 29980
# Summary of M2
print(m2)
## 
## Call:  glm(formula = income ~ age + `educational-num` + `marital-status` + 
##     race + gender + workclass + `capital-gain` + `capital-loss` + 
##     `hours-per-week`, family = binomial("logit"), data = trainingSet)
## 
## Coefficients:
##                           (Intercept)  
##                             -9.010330  
##                                   age  
##                              0.026921  
##                     `educational-num`  
##                              0.374198  
##     `marital-status`Married-AF-spouse  
##                              2.338750  
##    `marital-status`Married-civ-spouse  
##                              2.306621  
## `marital-status`Married-spouse-absent  
##                              0.077243  
##         `marital-status`Never-married  
##                             -0.442011  
##             `marital-status`Separated  
##                              0.015193  
##               `marital-status`Widowed  
##                             -0.078553  
##                raceAsian-Pac-Islander  
##                              0.856751  
##                             raceBlack  
##                              0.185059  
##                             raceOther  
##                              0.116247  
##                             raceWhite  
##                              0.456359  
##                            genderMale  
##                              0.078719  
##                    workclassLocal-gov  
##                             -0.629048  
##                 workclassNever-worked  
##                             -7.937035  
##                      workclassPrivate  
##                             -0.515943  
##                workclassSelf-Employed  
##                             -0.875965  
##                    workclassState-gov  
##                             -0.688284  
##                      workclassUnknown  
##                             -1.462238  
##                  workclassWithout-pay  
##                             -1.726116  
##                        `capital-gain`  
##                              0.000318  
##                        `capital-loss`  
##                              0.000679  
##                      `hours-per-week`  
##                              0.029113  
## 
## Degrees of Freedom: 39448 Total (i.e. Null);  39425 Residual
## Null Deviance:       43770 
## Residual Deviance: 26400     AIC: 26450
#==========
#Perform 10-fold cross validation
#Set trainContol settings
trainControlSettings <- trainControl(method = "cv", number = 10, verboseIter = TRUE)

# M1 : income = a*age + b*educational-num + c*marital-status + d*race + e*gender
m1 <- train(income ~ age + `educational-num` + `marital-status` + race + gender, trainingSet, method = "glm", trControl = trainControlSettings)
## + Fold01: parameter=none 
## - Fold01: parameter=none 
## + Fold02: parameter=none 
## - Fold02: parameter=none 
## + Fold03: parameter=none 
## - Fold03: parameter=none 
## + Fold04: parameter=none 
## - Fold04: parameter=none 
## + Fold05: parameter=none 
## - Fold05: parameter=none 
## + Fold06: parameter=none 
## - Fold06: parameter=none 
## + Fold07: parameter=none 
## - Fold07: parameter=none 
## + Fold08: parameter=none 
## - Fold08: parameter=none 
## + Fold09: parameter=none 
## - Fold09: parameter=none 
## + Fold10: parameter=none 
## - Fold10: parameter=none 
## Aggregating results
## Fitting final model on full training set
# M2:income = a*age + b*educational-num + c*marital-status + d*race + e*gender + f*workclass + g*capital-gain + h*capital-loss + d*hours-per-week
m2 <- train(income ~ age + `educational-num` + `marital-status` + race + gender + `workclass` + `capital-gain` + `capital-loss` + `hours-per-week`, trainingSet, method = "glm", trControl = trainControlSettings)
## + Fold01: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold01: parameter=none 
## + Fold02: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold02: parameter=none 
## + Fold03: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold03: parameter=none 
## + Fold04: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold04: parameter=none 
## + Fold05: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold05: parameter=none 
## + Fold06: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold06: parameter=none 
## + Fold07: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold07: parameter=none 
## + Fold08: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold08: parameter=none 
## + Fold09: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold09: parameter=none 
## + Fold10: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## - Fold10: parameter=none 
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Summary of 10-fold M1
print(m1)
## Generalized Linear Model 
## 
## 39449 samples
##     5 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 35503, 35505, 35503, 35505, 35504, 35505, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8168519  0.4445791
# Summary of 10-fold M2
print(m2)
## Generalized Linear Model 
## 
## 39449 samples
##     9 predictor
##     2 classes: '<=50K', '>50K' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 35504, 35503, 35504, 35505, 35504, 35504, ... 
## Resampling results:
## 
##   Accuracy   Kappa   
##   0.8420238  0.537627
#Summary of 10-fold model shows that Model 2 has an improvment of Accuracy (~3%)
modelAccuracy <- data.frame((m1$results$Accuracy * 100), (m2$results$Accuracy * 100))

#Store performance in data frame
colnames(modelAccuracy) <- c("Model 1", "Model 2")
rownames(modelAccuracy) <- c("Accuracy")
modelAccuracy
##           Model 1  Model 2
## Accuracy 81.68519 84.20238
#==========
#Test-set performance
#Create prediction using Model 1
pred_m1 <- predict(m1, testingSet)

#Create prediction using Model 2
pred_m2 <- predict(m2, testingSet)

#Generate a confusion matrix to evaluate the performance of each model
#Model 1
cm_m1 <- confusionMatrix(testingSet$income, pred_m1)

#Model 2
cm_m2 <- confusionMatrix(testingSet$income, pred_m2)




#==========
#Evaluate each model's performance
#Print Model 1 Performacne
cm_m1$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.138261e-01   4.472375e-01   8.019808e-01   8.252518e-01   8.270591e-01 
## AccuracyPValue  McnemarPValue 
##   9.897684e-01   1.748714e-32
cm_m1$table
##           Reference
## Prediction <=50K >50K
##      <=50K  3047  238
##      >50K    578  520
#Print Model 2 Performacne
cm_m2$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##   8.384668e-01   5.411786e-01   8.272327e-01   8.492479e-01   7.964864e-01 
## AccuracyPValue  McnemarPValue 
##   7.429586e-13   1.314885e-14
cm_m2$table
##           Reference
## Prediction <=50K >50K
##      <=50K  3034  251
##      >50K    457  641
#==========
# Create Visualizations in ggplot2 for Part 3 of final proj...