Titanic Kaggle

Titanic: Machine Learning from Disaster. My R notebook for the Kaggle competition.

library(knitr)
library(rmdformats)
## Global options
options(max.print = 75)
opts_chunk$set(echo=TRUE,  # flip to echo=FALSE to hide code
               cache=TRUE,
               prompt=FALSE,
               tidy=TRUE,
               comment=NA,
               message=FALSE,
               warning=FALSE, 
               results='asis'
               )
opts_knit$set(width=75)
print("notebook options imported successfully!")
## [1] "notebook options imported successfully!"
# set the working directory
setwd("~/Documents/GitHub/titanic_kaggle_ml")

# read data files
test.data <- read.csv("test.csv", na.strings = c("NA",""))
train.data <- read.csv("train.csv", na.strings = c("NA",""))

# convert integer columns to factors
train.data$Survived = factor(train.data$Survived)
train.data$Pclass = factor(train.data$Pclass)
# test.data$Survived = factor(test.data$Survived)
# test.data$Pclass = factor(test.data$Pclass)


### DETECTING MISSING VALUES ###
sum(is.na(train.data$Embarked))

[1] 2

sapply(train.data, function(col) {
    sum(is.na(col)) / length(col)
    })

PassengerId    Survived      Pclass        Name         Sex         Age 
0.000000000 0.000000000 0.000000000 0.000000000 0.000000000 0.198653199 
      SibSp       Parch      Ticket        Fare       Cabin    Embarked 
0.000000000 0.000000000 0.000000000 0.000000000 0.771043771 0.002244669 
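
The same per-column proportions can be computed more directly with base R's colMeans, since averaging a logical matrix yields proportions; a minimal equivalent sketch:

# equivalent one-liner: the mean of a logical column is its NA proportion
colMeans(is.na(train.data))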


library(Amelia)
missmap(train.data, main="Missing Map")

# AmeliaView()
### IMPUTING MISSING VALUES ###
as.data.frame(table(train.data$Embarked, useNA = "always"))

  Var1 Freq
1    C  168
2    Q   77
3    S  644
4 <NA>    2

# Change NA values to 'S', the most common port of embarkation
train.data$Embarked[which(is.na(train.data$Embarked))] = 'S'  # use which() to subset
# get table of salutations from names using grep
train.data$Name = as.character(train.data$Name) 
table_words = table(unlist(strsplit(train.data$Name, "\\s+")))
sort(table_words [grep('\\.',names(table_words))], decreasing = TRUE)
      Mr.     Miss.      Mrs.   Master.       Dr.      Rev.      Col. 
      517       182       125        40         7         6         2 
   Major.     Mlle.     Capt. Countess.      Don. Jonkheer.        L. 
        2         2         1         1         1         1         1 
    Lady.      Mme.       Ms.      Sir. 
        1         1         1         1 

# count missing Age values by title
library(stringr)
tb = cbind(train.data$Age, str_match(train.data$Name, "[a-zA-Z]+\\."))
table(tb[is.na(tb[, 1]), 2])
Dr. Master.   Miss.     Mr.    Mrs. 
  1       4      36     119      17 
# impute mean value for missing ages
mean.mr = mean(train.data$Age[grepl(" Mr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.mrs = mean(train.data$Age[grepl(" Mrs\\.", train.data$Name) & !is.na(train.data$Age)])
mean.dr = mean(train.data$Age[grepl(" Dr\\.", train.data$Name) & !is.na(train.data$Age)])
mean.miss = mean(train.data$Age[grepl(" Miss\\.", train.data$Name) & !is.na(train.data$Age)])
mean.master = mean(train.data$Age[grepl(" Master\\.", train.data$Name) & !is.na(train.data$Age)])

# assign each missing Age the mean value for its title
train.data$Age[grepl(" Mr\\.", train.data$Name) & is.na(train.data$Age)] = mean.mr
train.data$Age[grepl(" Mrs\\.", train.data$Name) & is.na(train.data$Age)] = mean.mrs
train.data$Age[grepl(" Dr\\.", train.data$Name) & is.na(train.data$Age)] = mean.dr
train.data$Age[grepl(" Miss\\.", train.data$Name) & is.na(train.data$Age)] = mean.miss
train.data$Age[grepl(" Master\\.", train.data$Name) & is.na(train.data$Age)] = mean.master
### DATA VISUALIZATION ###
barplot(table(train.data$Survived), main="Passenger Survival", names = c("Perished", "Survived"))

barplot(table(train.data$Pclass), main = "Passenger Class", names = c("first", "second", "third"))

barplot(table(train.data$Sex), main = "Passenger Gender")

hist(train.data$Age, main = "Passenger Age", xlab = "Age")

barplot(table(train.data$SibSp), main = "Passenger Siblings/Spouses (SibSp)")

barplot(table(train.data$Parch), main = "Passenger Parents/Children (Parch)")

hist(train.data$Fare, main = "Passenger Fare", xlab = "Fare")

barplot(table(train.data$Embarked), main = "Port of Embarkation")

counts = table(train.data$Survived, train.data$Sex)
counts

    female male
  0     81  468
  1    233  109

barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Sex")

# does Pclass affect survival rate? YUP!
counts = table( train.data$Survived, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = c("Perished", "Survived"), main = "Passenger Survival by Class")

# Gender Composition of Class
counts = table( train.data$Sex, train.data$Pclass)
barplot(counts, col = c("darkblue", "red"), legend = rownames(counts), main = "Passenger Gender by Class")

# What does age distribution look like? - Age Histogram
hist(train.data$Age[which(train.data$Survived == "0")], main = "Passenger Age Histogram",xlab = "Age", ylab = "Count", col = "blue", breaks = seq(0,80,by=2))
hist(train.data$Age[which(train.data$Survived == "1")], col = "red", add = T, breaks = seq(0,80,by=2))
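
With the two histograms overlaid, a legend clarifies the colors; a small addition matching the calls above:

legend("topright", c("Perished", "Survived"), fill = c("blue", "red"))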

# what's the relationship between age and survival rate?
boxplot(train.data$Age ~ train.data$Survived , main = "Passenger Survival by Age" , xlab = "Survived", ylab = "Age" )

# categorize people into different age groups
train.child = train.data$Survived[train.data$Age < 13]
paste( 'child survival rate = ', length(train.child[which(train.child == 1)]) / length(train.child))

[1] "child survival rate = 0.575342465753425"

train.youth = train.data$Survived[train.data$Age >= 15 & train.data$Age <25 ]
paste('youth survival rate = ', length(train.youth[which(train.youth == 1 )]) / length(train.youth))

[1] "youth survival rate = 0.402542372881356"

train.adult = train.data$Survived[train.data$Age >= 25 & train.data$Age <65 ]
paste('adult survival rate = ', length(train.adult[which(train.adult == 1 )]) / length(train.adult)) 

[1] "adult survival rate = 0.354092526690391"

train.senior = train.data$Survived[train.data$Age >= 65 ]
paste('senior survival rate = ', length(train.senior[which(train.senior == 1 )]) / length(train.senior)) 

[1] "senior survival rate = 0.0909090909090909"
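
The four rates above can be computed in one pass with cut() and tapply(); the breaks below are illustrative and also close the 13-15 age gap the manual bins leave uncovered:

# bin ages, then take the survival proportion within each bin
age.group = cut(train.data$Age, breaks = c(0, 13, 25, 65, Inf),
                labels = c("child", "youth", "adult", "senior"), right = FALSE)
tapply(train.data$Survived == "1", age.group, mean)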

library(vcd)  # mosaicplot() itself ships with base graphics
mosaicplot(train.data$Pclass ~ train.data$Survived, 
  main = "Passenger Survival Class", color = TRUE,
  xlab = "Pclass", ylab = "Survived")

split.data = function(data, p = 0.7, s = 666){
  set.seed(s)
  index = sample(1:dim(data)[1])
  n.train = floor(dim(data)[1] * p)
  train = data[index[1:n.train], ]
  test = data[index[(n.train + 1):dim(data)[1]], ]  # n.train + 1, so no row is silently dropped
  return(list(train = train, test = test))
}

allset = split.data(train.data)
trainset = allset$train
testset = allset$test
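
split.data draws a purely random split; caret (used later for the confusion matrices) also offers a stratified alternative that preserves the Survived class ratio in both partitions. A sketch, with illustrative object names:

# stratified 70/30 split on the outcome variable
library(caret)
set.seed(666)
idx = createDataPartition(train.data$Survived, p = 0.7, list = FALSE)
trainset.strat = train.data[idx, ]
testset.strat = train.data[-idx, ]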
library(party)
train.ctree = ctree(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset)
train.ctree
 Conditional inference tree with 7 terminal nodes

Response:  Survived 
Inputs:  Pclass, Sex, Age, SibSp, Fare, Parch, Embarked 
Number of observations:  623 

1) Sex == {male}; criterion = 1, statistic = 173.672
  2) Pclass == {2, 3}; criterion = 1, statistic = 30.951
    3) Age <= 9; criterion = 0.993, statistic = 10.923
      4) SibSp <= 1; criterion = 0.999, statistic = 14.856
        5)*  weights = 10 
      4) SibSp > 1
        6)*  weights = 13 
    3) Age > 9
      7)*  weights = 280 
  2) Pclass == {1}
    8)*  weights = 87 
1) Sex == {female}
  9) Pclass == {1, 2}; criterion = 1, statistic = 59.504
    10)*  weights = 125 
  9) Pclass == {3}
    11) Fare <= 23.25; criterion = 0.997, statistic = 12.456
      12)*  weights = 85 
    11) Fare > 23.25
      13)*  weights = 23 
plot(train.ctree, main = "Conditional inference tree of Titanic Dataset")

# Now use svm to generate a prediction model using the same dataset as above
library('e1071')
svm.model = svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, data = trainset, probability=TRUE)
svm.model

Call:
svm(formula = Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked, 
    data = trainset, probability = TRUE)

Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 
      gamma:  0.1 

Number of Support Vectors:  324
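
The fit uses e1071's defaults (cost = 1, gamma = 0.1); the same package provides tune.svm for a cross-validated grid search over those hyperparameters. A sketch with illustrative, untuned ranges:

# grid-search cost and gamma with 10-fold CV (tune.svm's default)
svm.tuned = tune.svm(Survived ~ Pclass + Sex + Age + SibSp + Fare + Parch + Embarked,
                     data = trainset, gamma = 10^(-2:0), cost = 10^(0:2))
summary(svm.tuned)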

# Validate the power of the prediction model using a confusion matrix
ctree.predict = predict(train.ctree, testset)
library('caret')
# test the accuracy of the ctree model using confusion matrix via caret package
confusionMatrix(ctree.predict, testset$Survived)

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 160  23
         1  16  68
                                          
               Accuracy : 0.8539          
                 95% CI : (0.8058, 0.894) 
    No Information Rate : 0.6592          
    P-Value [Acc > NIR] : 5.347e-13       
                                          
                  Kappa : 0.6688          
 Mcnemar's Test P-Value : 0.3367          
                                          
            Sensitivity : 0.9091          
            Specificity : 0.7473          
         Pos Pred Value : 0.8743          
         Neg Pred Value : 0.8095          
             Prevalence : 0.6592          
         Detection Rate : 0.5993          
   Detection Prevalence : 0.6854          
      Balanced Accuracy : 0.8282          
                                          
       'Positive' Class : 0               
# Validate the power of the svm prediction model using a confusion matrix
svm.predict = predict(svm.model, testset, na.action = na.pass)
library('caret')
# test the accuracy of the svm model using a confusion matrix via the caret package
confusionMatrix(table(svm.predict, testset$Survived))

Confusion Matrix and Statistics

           
svm.predict   0   1
          0 161  24
          1  15  67
                                          
               Accuracy : 0.8539          
                 95% CI : (0.8058, 0.894) 
    No Information Rate : 0.6592          
    P-Value [Acc > NIR] : 5.347e-13       
                                          
                  Kappa : 0.667           
 Mcnemar's Test P-Value : 0.2002          
                                          
            Sensitivity : 0.9148          
            Specificity : 0.7363          
         Pos Pred Value : 0.8703          
         Neg Pred Value : 0.8171          
             Prevalence : 0.6592          
         Detection Rate : 0.6030          
   Detection Prevalence : 0.6929          
      Balanced Accuracy : 0.8255          
                                          
       'Positive' Class : 0               

Assessing Performance with the ROC Curve

Prepare the Probability Matrix

train.ctree.pred = predict(train.ctree, testset)
# treeresponse() returns, per row, a vector of class probabilities (P(0), P(1));
# take every other element to get P(0) and flip it to P(Survived = 1)
train.ctree.prob = 1 - unlist(treeresponse(train.ctree, testset), use.names = F)[seq(1, nrow(testset)*2, 2)]

Create an ROCR prediction object from the probabilities and plot the ROC curve

library(ROCR)
train.ctree.prob.rocr = prediction(train.ctree.prob, testset$Survived)
train.ctree.perf = performance(train.ctree.prob.rocr, "tpr", "fpr")
train.ctree.auc.perf = performance(train.ctree.prob.rocr, measure = "auc", x.measure = "cutoff")
plot(train.ctree.perf, col=2, colorize=T, main=paste("AUC:", train.ctree.auc.perf@y.values))
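
Since svm.model was trained with probability = TRUE, the same ROCR construction applies to it as well; a sketch for comparison, with illustrative object names:

# pull P(Survived = 1) out of the svm predictions and plot its ROC curve
svm.pred.prob = predict(svm.model, testset, probability = TRUE)
svm.prob = attr(svm.pred.prob, "probabilities")[, "1"]
svm.prob.rocr = prediction(svm.prob, testset$Survived)
svm.perf = performance(svm.prob.rocr, "tpr", "fpr")
svm.auc.perf = performance(svm.prob.rocr, measure = "auc")
plot(svm.perf, col = 3, colorize = T, main = paste("SVM AUC:", svm.auc.perf@y.values))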

Comments welcome!