기타/R

[R]KNN-Titanic

stonesy 2021. 11. 26. 05:07
728x90

# 1. Setup: attach the packages used by the rest of the script.
# Install 'titanic' only when it is missing, so re-running the script does not
# reinstall the package (or need a package mirror) every time.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}
library(titanic)
library(dplyr)
library(caret)
library(e1071)

# Data preprocessing
# Missing-value handling
# NOTE(review): machine-specific directory. The relative file names used by
# this script ('train.csv', 'test.csv', 'submission.csv') resolve against it.
setwd('C:\\Users\\Cho\\Desktop\\단국\\2학년 2학기\\데이터사이언스\\데이터사이언스_과제6자료')
titanic_train2 <- read.csv("train.csv")
titanic_test2 <- read.csv("test.csv")
# Number of missing values per column in each input file.
colSums(is.na(titanic_train2))
colSums(is.na(titanic_test2))
# Stack both files so that preprocessing is applied to all rows at once;
# columns present in only one file are filled with NA by bind_rows().
titanic_full <- dplyr::bind_rows(titanic_train2, titanic_test2)
titanic_full
colSums(is.na(titanic_full))
# The spreadsheet viewer only makes sense in an interactive session.
if (interactive()) {
  View(titanic_full)
}

# Impute missing Age values with the mean Age of the passenger's class.
# ave() applies the function within each Pclass group and returns the values
# in the original row order, so only the NA entries are changed.
titanic_full$Age <- ave(
  titanic_full$Age, titanic_full$Pclass,
  FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
)

# Impute missing Fare values.
# Show the affected rows first.
titanic_full[is.na(titanic_full$Fare), ]
# Fill each missing Fare with the mean Fare of that row's own Pclass, the same
# rule as for Age. The previous code always used the Pclass 3 mean.
# NOTE(review): presumably every row with a missing Fare is a Pclass 3
# passenger, in which case the result is unchanged -- confirm from the rows
# printed above.
titanic_full$Fare <- ave(
  titanic_full$Fare, titanic_full$Pclass,
  FUN = function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
)
# Re-check the remaining missing values per column.
colSums(is.na(titanic_full))

# Factorization / numeric conversion
# Survived stays a factor: it is used as the class label further down.
titanic_full$Survived <- as.factor(titanic_full$Survived)

# Categorical predictors become numeric level codes (1, 2, ...), numbered in
# the order of the factor levels.
titanic_full$Pclass <- as.numeric(as.factor(titanic_full$Pclass))
titanic_full$Embarked <- as.numeric(as.factor(titanic_full$Embarked))
titanic_full$Sex <- as.numeric(as.factor(titanic_full$Sex))

# Age and Fare are measurements, not categories. as.numeric(as.factor(x))
# returns the index of each value among the sorted distinct values, not the
# value itself, so these two columns must not go through a factor.
titanic_full$Age <- as.numeric(titanic_full$Age)
titanic_full$Fare <- as.numeric(titanic_full$Fare)

# The spreadsheet viewer only makes sense in an interactive session.
if (interactive()) {
  View(titanic_full)
}

# Visualization: which variables matter for survival?
# Split the rows by outcome; which() also drops rows whose Survived is NA.
grp0 <- titanic_full[which(titanic_full$Survived == 0), ]
grp1 <- titanic_full[which(titanic_full$Survived == 1), ]

# Pclass: compute both histograms without drawing them (plot = FALSE), then
# overlay them in one figure: red for Survived == 0, blue for Survived == 1.
p0 <- hist(grp0$Pclass, plot = FALSE)
p1 <- hist(grp1$Pclass, plot = FALSE)
plot(p0, col = rgb(1, 0, 0, 1/4), xlim = c(1, 3), ylim = c(0, 600))
plot(p1, col = rgb(0, 0, 1, 1/4), xlim = c(1, 3), ylim = c(0, 600), add = TRUE)
# Two-sample t-test comparing the two groups' means.
t.test(grp0$Pclass, grp1$Pclass)

# Sex: same overlay and test.
# NOTE(review): the x range is the same as for Pclass; the Sex codes
# presumably only span 1-2 -- confirm and narrow xlim if so.
p0 <- hist(grp0$Sex, plot = FALSE)
p1 <- hist(grp1$Sex, plot = FALSE)
plot(p0, col = rgb(1, 0, 0, 1/4), xlim = c(1, 3), ylim = c(0, 600))
plot(p1, col = rgb(0, 0, 1, 1/4), xlim = c(1, 3), ylim = c(0, 600), add = TRUE)
t.test(grp0$Sex, grp1$Sex)

# Age: same overlay and test, drawn with narrow figure margins.
par(mar = c(1, 1, 1, 1))
p0 <- hist(grp0$Age, plot = FALSE)
p1 <- hist(grp1$Age, plot = FALSE)
plot(p0, col = rgb(1, 0, 0, 1/4), xlim = c(1, 80), ylim = c(0, 150))
plot(p1, col = rgb(0, 0, 1, 1/4), xlim = c(1, 80), ylim = c(0, 150), add = TRUE)
t.test(grp0$Age, grp1$Age)
# Back to a single-panel layout with margins c(5, 4, 4, 2) + 0.1.
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + .1)

# Keep only the class label and the five candidate predictors.
titanic_full2 <- titanic_full[
  c("Survived", "Pclass", "Sex", "Age", "Fare", "Embarked")
]

# Point JAVA_HOME at a local JRE before attaching FSelector.
# NOTE(review): machine-specific path; presumably needed because FSelector
# relies on Java -- confirm.
Sys.setenv(JAVA_HOME = "C:/Program Files/Java/jre1.8.0_40")
library(FSelector)

# Score every predictor against Survived with symmetrical uncertainty and
# show the resulting weights.
weights <- FSelector::symmetrical.uncertainty(Survived ~ ., titanic_full2)
print(weights)

# Modeling
# Rows with a known Survived label form the training set; rows whose Survived
# is NA (presumably the rows that came from test.csv) are the ones to predict.
train <- subset(titanic_full2, !is.na(titanic_full2$Survived))
ds.train <- train[-1]        # predictors only: drop the first column (Survived)
cl.train <- train$Survived   # class labels
test <- subset(titanic_full2, is.na(titanic_full2$Survived))
ds.test <- test[-1]
# The spreadsheet viewer only makes sense in an interactive session.
if (interactive()) {
  View(test)
}

# KNN
library(class)
# Original author's note: k = 1 gives the best result here.
# NOTE(review): the predictions are made for the same rows the model was
# fitted on, so these are training-set accuracies, not generalization
# estimates.
for (i in 1:10) {
  pred <- knn(ds.train, ds.train, cl.train, k = i, prob = FALSE)
  acc <- mean(pred == cl.train)
  # confusionMatrix() takes the predictions as `data` and the true labels as
  # `reference`; passing them the other way round swaps the roles of the rows
  # and columns and of the per-class statistics derived from them.
  m <- caret::confusionMatrix(data = pred, reference = cl.train)
  cat("k=", i, "\n", acc, "\n")
  print(m)
  cat("\n---------------------------------------\n")
}
# K-fold evaluation. Original author's note: k = 1 gave the best result.
# Labelled rows only; the first column (Survived) is the class label and the
# remaining columns are the predictors.
train <- titanic_full2[!is.na(titanic_full2$Survived), ]
ds.train2 <- train[, -1]
cl.train2 <- train[["Survived"]]

# Fixed seed so that the fold assignment is the same on every run.
set.seed(100)
# Five folds, each a vector of held-out row indices.
fold <- caret::createFolds(
  y = as.factor(cl.train2),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
# Mean cross-validated accuracy of a classifier over pre-computed folds.
#
# Args:
#   classifier: "svm" or "knn".
#   ds:   data frame (or matrix) of predictors, one row per observation.
#   cl:   class labels for the rows of `ds`.
#   fold: list of held-out row-index vectors, one per fold.
#   i:    number of neighbours (k) for the "knn" classifier; not used by "svm".
#
# Returns:
#   The mean, over all folds, of the accuracy on the held-out rows.
kfold <- function(classifier, ds, cl, fold, i) {
  if (!classifier %in% c("svm", "knn")) {
    stop("unknown classifier: ", classifier, call. = FALSE)
  }

  acc <- numeric(length(fold))
  # The fold counter has its own name so that it cannot overwrite `i`, which
  # carries the caller's k for knn.
  for (f in seq_along(fold)) {
    held_out <- fold[[f]]
    # drop = FALSE keeps a single-column `ds` two-dimensional.
    ds.train <- ds[-held_out, , drop = FALSE]
    ds.test <- ds[held_out, , drop = FALSE]
    cl.train <- cl[-held_out]
    cl.test <- cl[held_out]

    if (classifier == "svm") {
      model <- svm(ds.train, cl.train)
      pred <- predict(model, ds.test)
    } else {
      pred <- knn(ds.train, ds.test, cl.train, k = i, prob = FALSE)
    }

    # Compare as character: `==` on two factors fails when their level sets
    # differ, e.g. when a fold's predictions contain only one class.
    acc[f] <- mean(as.character(pred) == as.character(cl.test))
  }

  mean(acc)
}
# Print the cross-validated knn accuracy for each setting i = 1..10.
for (i in seq_len(10)) {
  score <- kfold(
    classifier = "knn",
    ds = ds.train2,
    cl = cl.train2,
    fold = fold,
    i = i
  )
  cat('k=', i, 'acc=', score, '\n')
}
# Result: classify the unlabelled rows with k = 3 and write the submission
# file, pairing each prediction with the PassengerId from test.csv.
pred <- knn(
  train = ds.train,
  test = ds.test,
  cl = cl.train,
  k = 3,
  prob = FALSE
)
df <- data.frame(
  PassengerId = titanic_test2$PassengerId,
  Survived = pred
)
write.csv(df, 'submission.csv', row.names = FALSE)
728x90