This is a code for an online hackathon to predict likelihood of someone suffering from stroke. Various algorithms are used to get predictions. Since the time was limited, the documentation in this code is sloppy.
library(tidyverse)
library(data.table)
library(verification)
library(randomForest)
library(nnet)
library(caret)
library(gbm)
library(xgboost)
library(MatrixModels)
library(Matrix)
library(MASS)
library(neuralnet)
library('h2o')
h2o.init(ip='localhost',port=54321,max_mem_size = '2600m')
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 9 minutes 35 seconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 15 days
## H2O cluster name: H2O_started_from_R_Swapnil_gmu724
## H2O cluster total nodes: 1
## H2O cluster total memory: 2.25 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.3 (2017-11-30)
dim(train)
## [1] 43400 12
str(train)
## Classes 'data.table' and 'data.frame': 43400 obs. of 12 variables:
## $ id : int 30669 30468 16523 56543 46136 32257 52800 41413 15266 28674 ...
## $ gender : chr "Male" "Male" "Female" "Female" ...
## $ age : num 3 58 8 70 14 47 52 75 32 74 ...
## $ hypertension : int 0 1 0 0 0 0 0 0 0 1 ...
## $ heart_disease : int 0 0 0 0 0 0 0 1 0 0 ...
## $ ever_married : chr "No" "Yes" "No" "Yes" ...
## $ work_type : chr "children" "Private" "Private" "Private" ...
## $ Residence_type : chr "Rural" "Urban" "Urban" "Rural" ...
## $ avg_glucose_level: num 95.1 88 110.9 69 161.3 ...
## $ bmi : num 18 39.2 17.6 35.9 19.1 50.1 17.7 27 32.3 54.6 ...
## $ smoking_status : chr "" "never smoked" "" "formerly smoked" ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
## Formattin##
train$gender<-as.factor(train$gender)
train$ever_married<-as.factor(train$ever_married)
train$work_type<-as.factor(train$work_type)
train$Residence_type<-as.factor(train$Residence_type)
#train$smoking_status<-as.factor(train$smoking_status)
train$hypertension<-as.factor(train$hypertension)
train$heart_disease<-as.factor(train$heart_disease)
train$stroke<-as.factor(train$stroke)
summary(train)
## id gender age hypertension heart_disease
## Min. : 1 Female:25665 Min. : 0.08 0:39339 0:41338
## 1st Qu.:18038 Male :17724 1st Qu.:24.00 1: 4061 1: 2062
## Median :36352 Other : 11 Median :44.00
## Mean :36326 Mean :42.22
## 3rd Qu.:54514 3rd Qu.:60.00
## Max. :72943 Max. :82.00
##
## ever_married work_type Residence_type avg_glucose_level
## No :15462 children : 6156 Rural:21644 Min. : 55.00
## Yes:27938 Govt_job : 5440 Urban:21756 1st Qu.: 77.54
## Never_worked : 177 Median : 91.58
## Private :24834 Mean :104.48
## Self-employed: 6793 3rd Qu.:112.07
## Max. :291.05
##
## bmi smoking_status stroke
## Min. :10.10 Length:43400 0:42617
## 1st Qu.:23.20 Class :character 1: 783
## Median :27.70 Mode :character
## Mean :28.61
## 3rd Qu.:32.90
## Max. :97.60
## NA's :1462
colSums(is.na(train))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 1462 0 0
## Test##
str(test)
## Classes 'data.table' and 'data.frame': 18601 obs. of 11 variables:
## $ id : int 36306 61829 14152 12997 40801 9348 51550 60512 31309 39199 ...
## $ gender : chr "Male" "Female" "Female" "Male" ...
## $ age : num 80 74 14 28 63 66 49 46 75 75 ...
## $ hypertension : int 0 0 0 0 0 1 0 0 0 0 ...
## $ heart_disease : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "No" "No" ...
## $ work_type : chr "Private" "Self-employed" "children" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 83.8 179.5 95.2 94.8 83.6 ...
## $ bmi : num 21.1 26 21.2 23.4 27.6 32.2 25.1 32.5 28 25.7 ...
## $ smoking_status : chr "formerly smoked" "formerly smoked" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
test$gender<-as.factor(test$gender)
test$ever_married<-as.factor(test$ever_married)
test$work_type<-as.factor(test$work_type)
test$Residence_type<-as.factor(test$Residence_type)
#train$smoking_status<-as.factor(train$smoking_status)
test$hypertension<-as.factor(test$hypertension)
test$heart_disease<-as.factor(test$heart_disease)
summary(test)
## id gender age hypertension heart_disease
## Min. : 2 Female:10957 Min. : 0.08 0:16868 0:17707
## 1st Qu.:18542 Male : 7642 1st Qu.:24.00 1: 1733 1: 894
## Median :36717 Other : 2 Median :43.00
## Mean :36747 Mean :42.06
## 3rd Qu.:55114 3rd Qu.:60.00
## Max. :72942 Max. :82.00
##
## ever_married work_type Residence_type avg_glucose_level
## No : 6662 children : 2613 Rural:9291 Min. : 55.00
## Yes:11939 Govt_job : 2302 Urban:9310 1st Qu.: 77.55
## Never_worked : 75 Median : 91.83
## Private :10750 Mean :104.39
## Self-employed: 2861 3rd Qu.:112.31
## Max. :275.72
##
## bmi smoking_status
## Min. :10.20 Length:18601
## 1st Qu.:23.30 Class :character
## Median :27.70 Mode :character
## Mean :28.55
## 3rd Qu.:32.80
## Max. :88.30
## NA's :591
colSums(is.na(test))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status
## 0 591 0
testBmi<-test%>%filter(!is.na(bmi))%>%
group_by(work_type)%>%
summarise(med=median(bmi))
testBmi
## # A tibble: 5 x 2
## work_type med
## <fct> <dbl>
## 1 children 18.7
## 2 Govt_job 29.5
## 3 Never_worked 24.4
## 4 Private 28.7
## 5 Self-employed 29.2
bmiCommplete<-test[!is.na(test$bmi),c(7,10)]
test$bmi[is.na(test$bmi) & test$work_type=='children'] <- 18.7
test$bmi[is.na(test$bmi) & test$work_type=='Govt_job'] <- 29.50
test$bmi[is.na(test$bmi) & test$work_type=='Never_worked'] <- 24.40
test$bmi[is.na(test$bmi) & test$work_type=='Private'] <- 28.70
test$bmi[is.na(test$bmi) & test$work_type=='Self-employed'] <- 29.20
summary(test)
## id gender age hypertension heart_disease
## Min. : 2 Female:10957 Min. : 0.08 0:16868 0:17707
## 1st Qu.:18542 Male : 7642 1st Qu.:24.00 1: 1733 1: 894
## Median :36717 Other : 2 Median :43.00
## Mean :36747 Mean :42.06
## 3rd Qu.:55114 3rd Qu.:60.00
## Max. :72942 Max. :82.00
## ever_married work_type Residence_type avg_glucose_level
## No : 6662 children : 2613 Rural:9291 Min. : 55.00
## Yes:11939 Govt_job : 2302 Urban:9310 1st Qu.: 77.55
## Never_worked : 75 Median : 91.83
## Private :10750 Mean :104.39
## Self-employed: 2861 3rd Qu.:112.31
## Max. :275.72
## bmi smoking_status
## Min. :10.20 Length:18601
## 1st Qu.:23.40 Class :character
## Median :28.00 Mode :character
## Mean :28.54
## 3rd Qu.:32.60
## Max. :88.30
plot.ecdf(train$bmi)
plot.ecdf(train$avg_glucose_level)
plot.ecdf(train$age)
summary(train$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10.10 23.20 27.70 28.61 32.90 97.60 1462
quantile(train$bm, 0.95,na.rm = T)
## 95%
## 42.6
quantile(train$avg_glucose_level, 0.95)
## 95%
## 212.0415
length(train$bmi[train$bmi>60])
## [1] 1551
summary(train$avg_glucose_level)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 55.00 77.54 91.58 104.48 112.07 291.05
ggplot(data = train,aes(x = stroke,y = bmi))+geom_boxplot()
## Smoking
test$smoking_status[test$smoking_status==''] <- 'Unknown'
test$smoking_status<-as.factor(test$smoking_status)
summary(test)
## id gender age hypertension heart_disease
## Min. : 2 Female:10957 Min. : 0.08 0:16868 0:17707
## 1st Qu.:18542 Male : 7642 1st Qu.:24.00 1: 1733 1: 894
## Median :36717 Other : 2 Median :43.00
## Mean :36747 Mean :42.06
## 3rd Qu.:55114 3rd Qu.:60.00
## Max. :72942 Max. :82.00
## ever_married work_type Residence_type avg_glucose_level
## No : 6662 children : 2613 Rural:9291 Min. : 55.00
## Yes:11939 Govt_job : 2302 Urban:9310 1st Qu.: 77.55
## Never_worked : 75 Median : 91.83
## Private :10750 Mean :104.39
## Self-employed: 2861 3rd Qu.:112.31
## Max. :275.72
## bmi smoking_status
## Min. :10.20 formerly smoked:3260
## 1st Qu.:23.40 never smoked :6833
## Median :28.00 smokes :2757
## Mean :28.54 Unknown :5751
## 3rd Qu.:32.60
## Max. :88.30
## BMI ##
medianBmi<-train%>%filter(!is.na(bmi))%>%
group_by(work_type)%>%
summarise(med=median(bmi))
bmiCommplete<-train[!is.na(train$bmi),c(7,10)]
train$bmi[is.na(train$bmi) & train$work_type=='children'] <- 18.80
train$bmi[is.na(train$bmi) & train$work_type=='Govt_job'] <- 29.50
train$bmi[is.na(train$bmi) & train$work_type=='Never_worked'] <- 24.20
train$bmi[is.na(train$bmi) & train$work_type=='Private'] <- 28.75
train$bmi[is.na(train$bmi) & train$work_type=='Self-employed'] <- 29.20
summary(train)
## id gender age hypertension heart_disease
## Min. : 1 Female:25665 Min. : 0.08 0:39339 0:41338
## 1st Qu.:18038 Male :17724 1st Qu.:24.00 1: 4061 1: 2062
## Median :36352 Other : 11 Median :44.00
## Mean :36326 Mean :42.22
## 3rd Qu.:54514 3rd Qu.:60.00
## Max. :72943 Max. :82.00
## ever_married work_type Residence_type avg_glucose_level
## No :15462 children : 6156 Rural:21644 Min. : 55.00
## Yes:27938 Govt_job : 5440 Urban:21756 1st Qu.: 77.54
## Never_worked : 177 Median : 91.58
## Private :24834 Mean :104.48
## Self-employed: 6793 3rd Qu.:112.07
## Max. :291.05
## bmi smoking_status stroke
## Min. :10.10 Length:43400 0:42617
## 1st Qu.:23.40 Class :character 1: 783
## Median :28.00 Mode :character
## Mean :28.59
## 3rd Qu.:32.60
## Max. :97.60
## Smoking
table(train$smoking_status,train$work_type)
##
## children Govt_job Never_worked Private Self-employed
## 5525 1094 76 5249 1348
## formerly smoked 104 1091 9 4629 1660
## never smoked 518 2291 85 10452 2707
## smokes 9 964 7 4504 1078
table(train$heart_disease,train$smoking_status)
##
## formerly smoked never smoked smokes
## 0 12908 6846 15449 6135
## 1 384 647 604 427
train$smoking_status[train$smoking_status==''] <- 'Unknown'
train$smoking_status<-as.factor(train$smoking_status)
summary(train)
## id gender age hypertension heart_disease
## Min. : 1 Female:25665 Min. : 0.08 0:39339 0:41338
## 1st Qu.:18038 Male :17724 1st Qu.:24.00 1: 4061 1: 2062
## Median :36352 Other : 11 Median :44.00
## Mean :36326 Mean :42.22
## 3rd Qu.:54514 3rd Qu.:60.00
## Max. :72943 Max. :82.00
## ever_married work_type Residence_type avg_glucose_level
## No :15462 children : 6156 Rural:21644 Min. : 55.00
## Yes:27938 Govt_job : 5440 Urban:21756 1st Qu.: 77.54
## Never_worked : 177 Median : 91.58
## Private :24834 Mean :104.48
## Self-employed: 6793 3rd Qu.:112.07
## Max. :291.05
## bmi smoking_status stroke
## Min. :10.10 formerly smoked: 7493 0:42617
## 1st Qu.:23.40 never smoked :16053 1: 783
## Median :28.00 smokes : 6562
## Mean :28.59 Unknown :13292
## 3rd Qu.:32.60
## Max. :97.60
## Write##
write.csv(train,'trainclean.csv')
#write.csv(test,'test.csv')
### Logistic ##
logtrain<-train
logtrain$bmi<-log(logtrain$bmi)
logModel <- glm(data = logtrain[,-1],stroke~.-ever_married,family=binomial(link="logit"))
summary(logModel)
##
## Call:
## glm(formula = stroke ~ . - ever_married, family = binomial(link = "logit"),
## data = logtrain[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8054 -0.1972 -0.1045 -0.0514 4.1376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.386e+00 9.102e-01 -9.213 < 2e-16 ***
## genderMale 7.716e-02 7.608e-02 1.014 0.310445
## genderOther -1.111e+01 6.930e+02 -0.016 0.987206
## age 7.024e-02 3.134e-03 22.411 < 2e-16 ***
## hypertension1 3.088e-01 8.766e-02 3.523 0.000427 ***
## heart_disease1 6.185e-01 9.429e-02 6.560 5.38e-11 ***
## work_typeGovt_job 2.949e-01 7.449e-01 0.396 0.692222
## work_typeNever_worked -9.477e+00 1.712e+02 -0.055 0.955861
## work_typePrivate 4.054e-01 7.399e-01 0.548 0.583764
## work_typeSelf-employed 3.692e-01 7.453e-01 0.495 0.620363
## Residence_typeUrban 2.687e-02 7.392e-02 0.364 0.716210
## avg_glucose_level 3.675e-03 6.571e-04 5.592 2.24e-08 ***
## bmi -1.837e-01 1.888e-01 -0.973 0.330634
## smoking_statusnever smoked -4.508e-02 9.434e-02 -0.478 0.632752
## smoking_statussmokes 1.935e-01 1.154e-01 1.677 0.093613 .
## smoking_statusUnknown -1.276e-02 1.116e-01 -0.114 0.908967
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7839.4 on 43399 degrees of freedom
## Residual deviance: 6446.7 on 43384 degrees of freedom
## AIC: 6478.7
##
## Number of Fisher Scoring iterations: 15
logPredict<-predict(logModel,type = 'response')
summary(logPredict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.001473 0.005992 0.018041 0.020766 0.319165
roc.plot(x = logtrain$stroke=='1',pred =logPredict,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8523632 2.098851e-251 NA
probitModel <- glm(data = train[,-1],stroke~.,family=binomial(link="probit"))
summary(probitModel)
##
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "probit"),
## data = train[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7420 -0.2026 -0.0996 -0.0418 4.2078
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.722e+00 2.077e-01 -17.921 < 2e-16 ***
## genderMale 3.439e-02 3.369e-02 1.021 0.307386
## genderOther -2.831e+00 1.730e+02 -0.016 0.986947
## age 2.917e-02 1.310e-03 22.266 < 2e-16 ***
## hypertension1 1.500e-01 4.081e-02 3.676 0.000237 ***
## heart_disease1 3.070e-01 4.616e-02 6.650 2.93e-11 ***
## ever_marriedYes -5.622e-02 5.459e-02 -1.030 0.303008
## work_typeGovt_job -1.727e-01 2.158e-01 -0.801 0.423333
## work_typeNever_worked -2.735e+00 4.201e+01 -0.065 0.948089
## work_typePrivate -1.136e-01 2.120e-01 -0.536 0.592078
## work_typeSelf-employed -1.214e-01 2.158e-01 -0.563 0.573589
## Residence_typeUrban 2.047e-02 3.277e-02 0.625 0.532140
## avg_glucose_level 1.697e-03 3.034e-04 5.591 2.25e-08 ***
## bmi -4.041e-03 2.639e-03 -1.531 0.125744
## smoking_statusnever smoked -2.978e-02 4.230e-02 -0.704 0.481367
## smoking_statussmokes 7.644e-02 5.129e-02 1.491 0.136084
## smoking_statusUnknown -8.582e-03 4.976e-02 -0.172 0.863061
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7839.4 on 43399 degrees of freedom
## Residual deviance: 6430.6 on 43383 degrees of freedom
## AIC: 6464.6
##
## Number of Fisher Scoring iterations: 15
probitPredict<-predict(probitModel,type = 'response')
summary(probitPredict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0009931 0.0055093 0.0180496 0.0219777 0.2648691
roc.plot(x = train$stroke=='1',pred =probitPredict,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8530403 2.312941e-252 NA
probitModel <- glm(data = train[,-1],stroke~.,family=binomial(link="cloglog"))
summary(probitModel)
##
## Call:
## glm(formula = stroke ~ ., family = binomial(link = "cloglog"),
## data = train[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8276 -0.1961 -0.1054 -0.0539 4.1346
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.751e+00 7.285e-01 -12.013 < 2e-16 ***
## genderMale 7.399e-02 7.368e-02 1.004 0.315318
## genderOther -1.105e+01 6.642e+02 -0.017 0.986729
## age 6.896e-02 3.069e-03 22.472 < 2e-16 ***
## hypertension1 2.952e-01 8.431e-02 3.501 0.000464 ***
## heart_disease1 5.833e-01 8.987e-02 6.491 8.55e-11 ***
## ever_marriedYes -7.612e-02 1.204e-01 -0.632 0.527369
## work_typeGovt_job 4.309e-01 7.463e-01 0.577 0.563649
## work_typeNever_worked -9.343e+00 1.653e+02 -0.057 0.954919
## work_typePrivate 5.352e-01 7.412e-01 0.722 0.470274
## work_typeSelf-employed 4.937e-01 7.460e-01 0.662 0.508085
## Residence_typeUrban 2.324e-02 7.160e-02 0.325 0.745508
## avg_glucose_level 3.589e-03 6.332e-04 5.669 1.44e-08 ***
## bmi -8.027e-03 5.980e-03 -1.342 0.179515
## smoking_statusnever smoked -4.069e-02 9.131e-02 -0.446 0.655859
## smoking_statussmokes 1.897e-01 1.116e-01 1.700 0.089174 .
## smoking_statusUnknown -1.198e-02 1.082e-01 -0.111 0.911810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7839.4 on 43399 degrees of freedom
## Residual deviance: 6448.2 on 43383 degrees of freedom
## AIC: 6482.2
##
## Number of Fisher Scoring iterations: 15
probitPredict<-predict(probitModel,type = 'response')
summary(probitPredict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.001605 0.006060 0.018008 0.020605 0.343364
roc.plot(x = train$stroke=='1',pred =probitPredict,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8524762 1.452702e-251 NA
## test prediction
log.test.pred<-predict(logModel,test[,-1],type='response')
summary(log.test.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 1.651e-05 4.546e-05 2.380e-04 1.761e-04 1.532e-02
submission<-cbind(test$id,log.test.pred)
#write.csv(as.data.frame(submission),'SwapnilSub.csv')
###RandomForest
logtrain<-train
logtrain$a<-logtrain$bmi*logtrain$avg_glucose_level
logtrain$b<-logtrain$bmi/logtrain$avg_glucose_level
logtrain$c<-logtrain$bmi*logtrain$age
logtrain$d<-logtrain$avg_glucose_level*logtrain$age
rfmodel<-randomForest(data=logtrain[,-1],stroke~.,ntree=100)
summary(rfmodel)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 43400 factor numeric
## err.rate 300 -none- numeric
## confusion 6 -none- numeric
## votes 86800 matrix numeric
## oob.times 43400 -none- numeric
## classes 2 -none- character
## importance 14 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 43400 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
predicted.rf<-predict(rfmodel,type = 'prob')
roc.plot(x = logtrain$stroke == "1", pred = predicted.rf[,2],thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.751206 3.511642e-204 NA
varImpPlot(rfmodel)
## Neural network
nntrain<-train
nntrain$work_type<-as.character(nntrain$work_type)
nntrain$smoking_status<-as.character(nntrain$smoking_status)
nntrain$work_type[nntrain$work_type=='Self-employed']<-'self_employed'
nntrain$smoking_status[nntrain$smoking_status=='formerly smoked']<-'formerly_smoked'
nntrain$smoking_status[nntrain$smoking_status=='never smoked']<-'never_smoked'
nntrain$work_type<-as.factor(nntrain$work_type)
nntrain$smoking_status<-as.factor(nntrain$smoking_status)
nntrain$age<-scale(nntrain$age)
nntrain$bmi<-scale(nntrain$bmi)
nntrain$avg_glucose_level<-scale(nntrain$avg_glucose_level)
summary(nntrain)
## id gender age hypertension
## Min. : 1 Female:25665 Min. :-1.87116 0:39339
## 1st Qu.:18038 Male :17724 1st Qu.:-0.80898 1: 4061
## Median :36352 Other : 11 Median : 0.07914
## Mean :36326 Mean : 0.00000
## 3rd Qu.:54514 3rd Qu.: 0.78963
## Max. :72943 Max. : 1.76655
## heart_disease ever_married work_type Residence_type
## 0:41338 No :15462 children : 6156 Rural:21644
## 1: 2062 Yes:27938 Govt_job : 5440 Urban:21756
## Never_worked : 177
## Private :24834
## self_employed: 6793
##
## avg_glucose_level bmi smoking_status stroke
## Min. :-1.1478 Min. :-2.41683 formerly_smoked: 7493 0:42617
## 1st Qu.:-0.6250 1st Qu.:-0.67881 never_smoked :16053 1: 783
## Median :-0.2993 Median :-0.07769 smokes : 6562
## Mean : 0.0000 Mean : 0.00000 Unknown :13292
## 3rd Qu.: 0.1760 3rd Qu.: 0.52343
## Max. : 4.3275 Max. : 9.01751
nntrain_mat<-model.matrix(data = nntrain[,-1],stroke~.-1)
head(nntrain_mat)
## genderFemale genderMale genderOther age hypertension1
## 1 0 1 0 -1.7414967 0
## 2 0 1 0 0.7008149 1
## 3 1 0 0 -1.5194684 0
## 4 1 0 0 1.2336829 0
## 5 0 1 0 -1.2530344 0
## 6 1 0 0 0.2123526 0
## heart_disease1 ever_marriedYes work_typeGovt_job work_typeNever_worked
## 1 0 0 0 0
## 2 0 1 0 0
## 3 0 0 0 0
## 4 0 1 0 0
## 5 0 0 0 1
## 6 0 1 0 0
## work_typePrivate work_typeself_employed Residence_typeUrban
## 1 0 0 0
## 2 1 0 1
## 3 1 0 1
## 4 1 0 0
## 5 0 0 0
## 6 1 0 1
## avg_glucose_level bmi smoking_statusnever_smoked
## 1 -0.2171740 -1.3844702 0
## 2 -0.3832540 1.3859071 1
## 3 0.1486196 -1.4367415 0
## 4 -0.8221134 0.9546691 0
## 5 1.3174424 -1.2407242 0
## 6 2.4695645 2.8102992 0
## smoking_statussmokes smoking_statusUnknown
## 1 0 1
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 1
## 6 0 1
stroke<-as.numeric(nntrain$stroke)-1
nntrain_mat<-cbind(nntrain_mat,stroke)
head(nntrain_mat)
## genderFemale genderMale genderOther age hypertension1
## 1 0 1 0 -1.7414967 0
## 2 0 1 0 0.7008149 1
## 3 1 0 0 -1.5194684 0
## 4 1 0 0 1.2336829 0
## 5 0 1 0 -1.2530344 0
## 6 1 0 0 0.2123526 0
## heart_disease1 ever_marriedYes work_typeGovt_job work_typeNever_worked
## 1 0 0 0 0
## 2 0 1 0 0
## 3 0 0 0 0
## 4 0 1 0 0
## 5 0 0 0 1
## 6 0 1 0 0
## work_typePrivate work_typeself_employed Residence_typeUrban
## 1 0 0 0
## 2 1 0 1
## 3 1 0 1
## 4 1 0 0
## 5 0 0 0
## 6 1 0 1
## avg_glucose_level bmi smoking_statusnever_smoked
## 1 -0.2171740 -1.3844702 0
## 2 -0.3832540 1.3859071 1
## 3 0.1486196 -1.4367415 0
## 4 -0.8221134 0.9546691 0
## 5 1.3174424 -1.2407242 0
## 6 2.4695645 2.8102992 0
## smoking_statussmokes smoking_statusUnknown stroke
## 1 0 1 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 1 0
## 6 0 1 0
#nntrain_dMatrix<-as.matrix(data = nntrain_mat,label=train_label)
a<-paste(colnames(nntrain_mat[,-18]),collapse=" + ")
b<-paste('stroke',a,sep='~')
f<-as.formula(b)
nnet.full<-neuralnet(f,data=nntrain_mat,hidden=c(1,1),linear.output=F)
plot(nnet.full)
nnet.pred<-nnet.full$net.result
nnet.pred<-as.numeric(nnet.pred[[1]])
roc.plot(x = train$stroke == "1", pred = nnet.pred,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8521605205 4.057303567e-251 NA
### GBM boosting
gbm.model<-gbm(data=train[,-1],as.character(stroke)~.,distribution = "bernoulli",n.trees = 1000,
interaction.depth = 8)
gbm.model.pred<-predict(gbm.model,newdata = train[,-1],n.trees = 1000,type = 'response')
summary(gbm.model.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.009061867 0.009061867 0.010256984 0.018009278 0.016631662 0.233936250
roc.plot(x = train$stroke == "1", pred = gbm.model.pred,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8624224661 2.349628625e-280 NA
## XGB boosting
set.seed(22334455)
logtrain<-train
logtrain$bmi[logtrain$bmi>42.5 & logtrain$work_type=='children'] <- 18.80
logtrain$bmi[logtrain$bmi>42.5 & logtrain$work_type=='Govt_job'] <- 29.50
logtrain$bmi[logtrain$bmi>42.5 & logtrain$work_type=='Never_worked'] <- 24.20
logtrain$bmi[logtrain$bmi>42.5 & logtrain$work_type=='Private'] <- 28.75
logtrain$bmi[logtrain$bmi>42.5 & logtrain$work_type=='Self-employed'] <- 29.20
logtrain$avg_glucose_level[logtrain$avg_glucose_level>213] <- 91.58
#bt<-sample(nrow(train),6200,replace = T)
#xgtrain<-train[bt,]
flag<-sample(nrow(logtrain),0.8*nrow(logtrain),replace = F)
xtrain<-logtrain[flag,]
xtest<-logtrain[-flag,]
train_mat<-sparse.model.matrix(data = xtrain[,-1],stroke~.-1)
head(train_mat)
## 6 x 17 sparse Matrix of class "dgCMatrix"
##
## 1 1 . . 23 . . 1 . . 1 . 1 64.22 38.8 . 1 .
## 2 . 1 . 27 . . . . . 1 . 1 90.41 22.7 . . 1
## 3 1 . . 78 . . 1 1 . . . . 71.94 23.6 . . 1
## 4 1 . . 20 . . . . . 1 . . 96.57 34.1 1 . .
## 5 . 1 . 12 . . . . . . . . 74.34 20.0 1 . .
## 6 1 . . 18 . . . . . . 1 . 64.56 26.1 1 . .
test_mat<-sparse.model.matrix(data = xtest[,-1],stroke~.-1)
head(test_mat)
## 6 x 17 sparse Matrix of class "dgCMatrix"
##
## 1 1 . . 74 1 . 1 . . . 1 1 205.84 29.2 1 . .
## 2 1 . . 44 . . 1 1 . . . 1 57.33 24.6 . 1 .
## 3 1 . . 79 . 1 1 . . . 1 1 67.84 25.2 . 1 .
## 4 1 . . 65 1 . 1 . . 1 . . 75.70 41.8 . . 1
## 5 1 . . 49 . . 1 . . 1 . . 60.22 31.5 . 1 .
## 6 1 . . 25 . . 1 . . 1 . 1 60.84 24.5 1 . .
train_label<-as.numeric(xtrain$stroke)-1
test_label<-as.numeric(xtest$stroke)-1
# We need to conver data to DMatrix form
train_dMatrix<-xgb.DMatrix(data = as.matrix(train_mat),label=train_label)
test_dMatrix<-xgb.DMatrix(data = as.matrix(test_mat),label=test_label)
## Modeling
params <- list("objective" = "reg:logistic",
"eval_metric" = "auc")
watchlist <- list(train = train_dMatrix, test = test_dMatrix)
# eXtreme Gradient Boosting Model.
## outputs a LONG list of results. hiding it for readability
xgb_model <- xgb.train(params = params,
data = train_dMatrix,
nrounds = 500,
watchlist = watchlist,
eta = 0.015,
max.depth = 5,
gamma = 0,
subsample = 1,
colsample_bytree = 1,
missing = NA)
tunning<-as.data.frame(xgb_model$evaluation_log)
ggplot(data = NULL,aes(x = tunning$iter,y = tunning$train_auc,col='train'))+geom_line()+
geom_line(aes(y = tunning$test_auc,col='test'))
### Training prediction-
train_matrix<-sparse.model.matrix(data = logtrain[,-1],stroke~.-1)
train_label<-as.numeric(train$stroke)-1
train_matrix<-xgb.DMatrix(data = as.matrix(train_matrix),label=train_label)
xgb_prediction.train<-predict(xgb_model, newdata = train_matrix)
summary(xgb_prediction.train)
## Min. 1st Qu. Median Mean 3rd Qu.
## 0.0003822189 0.0004090883 0.0061154272 0.0185546014 0.0219228165
## Max.
## 0.4289977252
roc.plot(x = train$stroke == "1", pred = xgb_prediction.train,thresholds = seq(0,1,0.001))$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.8926586777 0 NA
## Prediction on test data-
# creating test Matrix
xgtest<-test
xgtest$bmi[xgtest$bmi>42.5 & xgtest$work_type=='children'] <- 18.7
xgtest$bmi[xgtest$bmi>42.5 & xgtest$work_type=='Govt_job'] <- 29.50
xgtest$bmi[xgtest$bmi>42.5 & xgtest$work_type=='Never_worked'] <- 24.40
xgtest$bmi[xgtest$bmi>42.5 & xgtest$work_type=='Private'] <- 28.70
xgtest$bmi[xgtest$bmi>42.5 & xgtest$work_type=='Self-employed'] <- 29.20
quantile(xgtest$avg_glucose_level,0.95)
## 95%
## 210.89
xgtest$avg_glucose_level[xgtest$bmi>211] <- median(xgtest$avg_glucose_level)
xgtest$stroke<-0
test_matrix<-sparse.model.matrix(data = xgtest[,-1],stroke~.-1)
test_label<-as.numeric(test$stroke)
test_matrix<-xgb.DMatrix(data = as.matrix(test_matrix))
test_xgb_prediction<-predict(xgb_model, newdata = test_matrix)
summary(test_xgb_prediction)
## Min. 1st Qu. Median Mean 3rd Qu.
## 0.0003822189 0.0004090883 0.0059279129 0.0171217211 0.0213156752
## Max.
## 0.3133027852
submission<-cbind(test$id,test_xgb_prediction)
#colnames(submission)<c('id','stroke')
write.csv(as.data.frame(submission),'SwapnilSub.csv')