https://www.kaggle.com/c/titanic/data
1.강의내용
#Titanic
#install.packages('titanic')
library(titanic)
library(dplyr)
library(caret)
library(e1071)
### Data Preprocessing ###
# Work on a copy so the packaged `titanic_train` data set is left untouched.
titanic_train2 <- titanic_train

# A. Feature selection: keep the identifier, the label and six predictors
#    (Pclass, Age, Sex, SibSp, Parch, Embarked).
titanic_train2 <- select(
  titanic_train2,
  PassengerId, Survived, Pclass, Age, Sex, SibSp, Parch, Embarked
)
head(titanic_train2)

# B. Missing ages are replaced by the mean age of the passenger's sex.
age.mean.male <- mean(
  titanic_train$Age[titanic_train$Sex %in% "male"],
  na.rm = TRUE
)
age.mean.male
age.mean.female <- mean(
  titanic_train$Age[titanic_train$Sex %in% "female"],
  na.rm = TRUE
)
age.mean.female

# Vectorised imputation: one logical mask per sex instead of a row-by-row loop.
# `%in%` never yields NA, so the masks are always valid assignment indices.
age_missing <- is.na(titanic_train2$Age)
titanic_train2$Age[age_missing & titanic_train2$Sex %in% "male"] <- age.mean.male
titanic_train2$Age[age_missing & titanic_train2$Sex %in% "female"] <- age.mean.female
summary(titanic_train2)

# C. Factorization / numeric conversion.
# Survived becomes the class label; Pclass, Sex and Embarked are turned into
# factors and then into their integer level codes; SibSp and Parch stay factors.
titanic_train2$Survived <- as.factor(titanic_train2$Survived)
titanic_train2$Pclass <- as.numeric(as.factor(titanic_train2$Pclass))
titanic_train2$Sex <- as.numeric(as.factor(titanic_train2$Sex))
titanic_train2$Embarked <- as.numeric(as.factor(titanic_train2$Embarked))
titanic_train2$SibSp <- as.factor(titanic_train2$SibSp)
titanic_train2$Parch <- as.factor(titanic_train2$Parch)
### End of Data Preprocessing ###
### Exploratory Data Analysis (EDA) on titanic_train2 ###
# Split the passengers by outcome: grp0 = Survived == 0, grp1 = Survived == 1.
grp0 <- subset(titanic_train2, Survived == 0)
grp1 <- subset(titanic_train2, Survived == 1)

# Semi-transparent fills so the two overlaid histograms stay readable.
fill0 <- rgb(1, 0, 0, 1 / 4)
fill1 <- rgb(0, 0, 1, 1 / 4)

# Pclass: overlaid histograms, then a two-sample t-test on the class codes.
p0 <- hist(grp0$Pclass)
p1 <- hist(grp1$Pclass)
plot(p0, col = fill0, xlim = c(1, 3), ylim = c(0, 600))
plot(p1, col = fill1, xlim = c(1, 3), ylim = c(0, 600), add = TRUE)
t.test(grp0$Pclass, grp1$Pclass)

# Sex: same comparison on the numeric sex codes.
p0 <- hist(grp0$Sex)
p1 <- hist(grp1$Sex)
plot(p0, col = fill0, xlim = c(1, 2), ylim = c(0, 600))
plot(p1, col = fill1, xlim = c(1, 2), ylim = c(0, 600), add = TRUE)
t.test(grp0$Sex, grp1$Sex)

# Age: same comparison on the (imputed) ages.
p0 <- hist(grp0$Age)
p1 <- hist(grp1$Age)
plot(p0, col = fill0, xlim = c(1, 80), ylim = c(0, 150))
plot(p1, col = fill1, xlim = c(1, 80), ylim = c(0, 150), add = TRUE)
t.test(grp0$Age, grp1$Age)
### Use FSelector for best feature selection ###
# install.packages('FSelector')
# Sys.setenv(JAVA_HOME="C:/Program Files/Java/jre1.8.0_40")
library(FSelector)
# Compute the importance of every predictor with respect to Survived.
# The first column (PassengerId) is left out of the scoring.
weights <- FSelector::symmetrical.uncertainty(
  formula = Survived ~ .,
  data = titanic_train2[, -1]
)
print(weights)
### SVM ###
# Build a purely numeric predictor matrix.
# - PassengerId is an identifier, not a feature, so it is dropped together
#   with the label (the feature-weight step also leaves PassengerId out).
# - Factor columns (SibSp, Parch) are converted back to their numeric values;
#   calling as.matrix() on a frame that still holds factors would produce a
#   character matrix.
predictors <- titanic_train2[
  , setdiff(names(titanic_train2), c("PassengerId", "Survived")),
  drop = FALSE
]
predictors[] <- lapply(predictors, function(column) {
  if (is.factor(column)) as.numeric(as.character(column)) else column
})
ds.train <- as.matrix(predictors)
cl.train <- titanic_train2$Survived

# Classification: C-SVC on the factor label, evaluated on the training rows.
model.svmC <- svm(ds.train, cl.train, type = "C-classification")
pred.train <- predict(model.svmC, ds.train)
# confusionMatrix() expects the predictions as `data` and the truth as
# `reference`; naming both avoids transposing the table.
caret::confusionMatrix(data = pred.train, reference = cl.train)

# Regression: nu-SVR needs a numeric response, so the factor label is mapped
# back to its 0/1 values. `cl.train` itself stays a factor because later
# sections reuse it.
y.train <- as.numeric(as.character(cl.train))
model.svmR <- svm(ds.train, y.train, type = "nu-regression")
# Predict with the regression model (not the classifier) and cut the
# continuous score at 0.5 to obtain class labels.
score.train <- predict(model.svmR, ds.train)
pred.train <- factor(
  ifelse(score.train >= 0.5, 1, 0),
  levels = levels(cl.train)
)
caret::confusionMatrix(data = pred.train, reference = cl.train)
## KNN
# install.packages("class")
library(class)
# 3-nearest-neighbour classifier. The training rows are also used as the
# query set, so `acc` is a resubstitution (training) accuracy.
pred <- knn(
  train = ds.train, test = ds.train, cl = cl.train,
  k = 3, prob = TRUE
)
acc <- mean(pred == cl.train)
acc
# Predictions go in `data`, the true labels in `reference`.
m <- caret::confusionMatrix(data = pred, reference = cl.train)
m
## iris
# Training rows: 1-25, 51-75 and 101-125; every other row is held out.
tr.idx <- as.vector(outer(1:25, c(0L, 50L, 100L), "+"))
is_train <- seq_len(nrow(iris)) %in% tr.idx

# Columns 1-4 are the measurements, column 5 is the species label.
ds.tr <- iris[is_train, 1:4]
ds.ts <- iris[!is_train, 1:4]
cl.tr <- iris[is_train, 5]
cl.ts <- iris[!is_train, 5]

# 3-nearest-neighbour prediction for the held-out rows.
pred <- knn(train = ds.tr, test = ds.ts, cl = cl.tr, k = 3, prob = TRUE)
pred
acc <- mean(pred == cl.ts)
acc
table(pred, cl.ts)
2.과제
# Titanic
# All three Kaggle files are read from the same folder; the folder is spelled
# once and the file name is appended with the same "\\" separator.
data_dir <- "C:\\Users\\Cho\\Desktop\\단국\\2학년 2학기\\데이터사이언스\\데이터사이언스_과제5자료"
trainData <- read.csv(paste0(data_dir, "\\", "train.csv"))
testData <- read.csv(paste0(data_dir, "\\", "test.csv"))
testLabel <- read.csv(paste0(data_dir, "\\", "gender_submission.csv"))
# Open the raw tables in the data viewer for a quick look.
View(trainData)
View(testData)
# Data Preprocessing ####################################################
library(dplyr)
# Stack train and test rows so both receive identical preprocessing.
titanic_full <- dplyr::bind_rows(trainData, testData)

# A. Age: missing values are filled with the mean age of the passenger class.
age.p1 <- mean(titanic_full$Age[titanic_full$Pclass %in% 1], na.rm = TRUE)
age.p2 <- mean(titanic_full$Age[titanic_full$Pclass %in% 2], na.rm = TRUE)
age.p3 <- mean(titanic_full$Age[titanic_full$Pclass %in% 3], na.rm = TRUE)
age.mean <- mean(titanic_full$Age, na.rm = TRUE)
age_missing <- is.na(titanic_full$Age)
titanic_full$Age[age_missing & titanic_full$Pclass %in% 1] <- age.p1
titanic_full$Age[age_missing & titanic_full$Pclass %in% 2] <- age.p2
titanic_full$Age[age_missing & titanic_full$Pclass %in% 3] <- age.p3
colSums(is.na(titanic_full))

# B. Fare: missing values are filled with the overall mean fare.
# (Uses fare.mean; filling fares with the mean *age* was a copy-paste slip.)
fare.mean <- mean(titanic_full$Fare, na.rm = TRUE)
titanic_full$Fare[is.na(titanic_full$Fare)] <- fare.mean
colSums(is.na(titanic_full))
View(titanic_full)

# C. Factorization / numeric conversion.
# Survived becomes the class label; Pclass, Sex and Embarked are turned into
# factors and then into their integer level codes; SibSp and Parch stay factors.
titanic_full$Survived <- as.factor(titanic_full$Survived)
titanic_full$Pclass <- as.numeric(as.factor(titanic_full$Pclass))
titanic_full$Sex <- as.numeric(as.factor(titanic_full$Sex))
titanic_full$Embarked <- as.numeric(as.factor(titanic_full$Embarked))
titanic_full$SibSp <- as.factor(titanic_full$SibSp)
titanic_full$Parch <- as.factor(titanic_full$Parch)
########################################################
# model
# Drop every column that is not used by the model in a single step.
drop_cols <- c(
  "Cabin", "Name", "Ticket", "PassengerId",
  "Fare", "Embarked", "SibSp", "Parch"
)
titanic_full <- titanic_full[
  , setdiff(names(titanic_full), drop_cols),
  drop = FALSE
]
# titanic_full <- subset(titanic_full, select = -Age)
library(e1071)

# The training rows are the first nrow(trainData) rows of the stacked frame.
# Deriving the index from the data avoids the hard-coded 0:891 (with its
# stray zero index) and keeps working if the training file changes size.
trainIdx <- seq_len(nrow(trainData))
svm.model <- svm(
  formula = as.factor(Survived) ~ .,
  subset = trainIdx,
  data = titanic_full,
  type = "C-classification"
)

# Predict the rows that came from testData (everything after the train rows).
test <- titanic_full[-trainIdx, ]
test <- subset(test, select = -Survived)
Survived <- predict(svm.model, test)
PassengerId <- testData$PassengerId
result <- cbind(data.frame(PassengerId), data.frame(Survived))

# Write the submission straight to its target folder instead of changing the
# session's working directory with setwd().
out_dir <- "C:\\Users\\Cho\\Desktop\\단국\\2학년 2학기\\데이터사이언스\\수업자료, 복습"
write.csv(result, file.path(out_dir, "submission.csv"), row.names = FALSE)
# knn
# Training rows only; the first column is dropped to form the predictor
# matrix and the Survived column is used as the class label.
train_rows <- titanic_full[trainIdx, ]
ds.train <- as.matrix(train_rows[, -1])
cl.train <- train_rows$Survived
# 3-nearest-neighbour prediction on the same rows that were used for training.
pred <- knn(
  train = ds.train, test = ds.train, cl = cl.train,
  k = 3, prob = TRUE
)
pred
acc <- mean(pred == cl.train)
acc
table(pred, cl.train)
install.packages('titanic')
library(titanic)
library(dplyr)
library(caret)
library(e1071)
# Data Preprocessing
# Missing-value handling
titanic_train2 <- titanic_train
titanic_test2 <- titanic_test
colSums(is.na(titanic_train2))
colSums(is.na(titanic_test2))
# Stack train and test rows so both receive identical preprocessing.
titanic_full <- dplyr::bind_rows(titanic_train2, titanic_test2)
titanic_full
colSums(is.na(titanic_full))
View(titanic_full)

# Age: missing values are filled with the mean age of the passenger class.
age.p1 <- mean(titanic_full$Age[titanic_full$Pclass %in% 1], na.rm = TRUE)
age.p2 <- mean(titanic_full$Age[titanic_full$Pclass %in% 2], na.rm = TRUE)
age.p3 <- mean(titanic_full$Age[titanic_full$Pclass %in% 3], na.rm = TRUE)
age_missing <- is.na(titanic_full$Age)
titanic_full$Age[age_missing & titanic_full$Pclass %in% 1] <- age.p1
titanic_full$Age[age_missing & titanic_full$Pclass %in% 2] <- age.p2
titanic_full$Age[age_missing & titanic_full$Pclass %in% 3] <- age.p3

# Fare: show the rows with a missing fare, then fill them with the mean fare
# of third-class passengers.
titanic_full[is.na(titanic_full$Fare), ]
titanic_full$Fare[is.na(titanic_full$Fare)] <- mean(
  titanic_full$Fare[titanic_full$Pclass %in% 3],
  na.rm = TRUE
)
colSums(is.na(titanic_full))

# Factorization / numeric conversion.
# Survived becomes the class label; the categorical columns Pclass, Embarked
# and Sex are replaced by their integer level codes.
titanic_full$Survived <- as.factor(titanic_full$Survived)
titanic_full$Pclass <- as.numeric(as.factor(titanic_full$Pclass))
titanic_full$Embarked <- as.numeric(as.factor(titanic_full$Embarked))
titanic_full$Sex <- as.numeric(as.factor(titanic_full$Sex))
# Age and Fare are already numeric and are deliberately left as they are:
# as.numeric(as.factor(x)) would replace every value by the rank of its
# factor level, discarding the real ages and fares.
View(titanic_full)
# Visualization
# Split the rows by outcome; rows whose Survived is NA fall into neither group.
grp0 <- subset(titanic_full, Survived == 0)
grp1 <- subset(titanic_full, Survived == 1)

# Semi-transparent fills so the two overlaid histograms stay readable.
fill0 <- rgb(1, 0, 0, 1 / 4)
fill1 <- rgb(0, 0, 1, 1 / 4)

# Pclass: overlaid histograms plus a two-sample t-test.
p0 <- hist(grp0$Pclass)
p1 <- hist(grp1$Pclass)
plot(p0, col = fill0, xlim = c(1, 3), ylim = c(0, 600))
plot(p1, col = fill1, xlim = c(1, 3), ylim = c(0, 600), add = TRUE)
t.test(grp0$Pclass, grp1$Pclass)

# Sex: same comparison on the numeric sex codes.
p0 <- hist(grp0$Sex)
p1 <- hist(grp1$Sex)
plot(p0, col = fill0, xlim = c(1, 3), ylim = c(0, 600))
plot(p1, col = fill1, xlim = c(1, 3), ylim = c(0, 600), add = TRUE)
t.test(grp0$Sex, grp1$Sex)

# Age: drawn with narrow margins, which are reset to the defaults afterwards.
par(mar = c(1, 1, 1, 1))
p0 <- hist(grp0$Age)
p1 <- hist(grp1$Age)
plot(p0, col = fill0, xlim = c(1, 80), ylim = c(0, 150))
plot(p1, col = fill1, xlim = c(1, 80), ylim = c(0, 150), add = TRUE)
t.test(grp0$Age, grp1$Age)
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + .1)
# Keep the label plus the five candidate predictors.
model_cols <- c("Survived", "Pclass", "Sex", "Age", "Fare", "Embarked")
titanic_full2 <- titanic_full[, model_cols]
# Set JAVA_HOME before FSelector is attached.
# NOTE(review): machine-specific path - confirm it matches the local Java install.
Sys.setenv(JAVA_HOME = "C:/Program Files/Java/jre1.8.0_40")
library(FSelector)
# Score every predictor against Survived.
weights <- FSelector::symmetrical.uncertainty(
  formula = Survived ~ .,
  data = titanic_full2
)
print(weights)
# model
# Rows with a known label form the training set; rows whose Survived is NA
# are kept aside as `test`.
train <- subset(titanic_full2, !is.na(titanic_full2$Survived))
ds.train <- train[-1] # every column except the first one (Survived)
cl.train <- train$Survived
test <- subset(titanic_full2, is.na(titanic_full2$Survived))

# Classification: C-SVC evaluated on the rows it was trained on.
model.SVMC <- svm(ds.train, cl.train, type = "C-classification")
pred.train <- predict(model.SVMC, ds.train)
# confusionMatrix() expects the predictions as `data` and the truth as
# `reference`; naming both avoids transposing the table.
caret::confusionMatrix(data = pred.train, reference = cl.train)
# Regression
# nu-SVR needs a numeric response.
# NOTE(review): when Survived is still the two-level factor, as.numeric()
# yields the level codes 1 and 2 (not 0/1); the labels below rely on that.
titanic_full$Survived <- as.numeric(titanic_full$Survived)
titanic_full2 <- titanic_full
titanic_full2 <- titanic_full2[
  , c("Survived", "Pclass", "Sex", "Age", "Fare", "Embarked")
]
train <- subset(titanic_full2, !is.na(titanic_full2$Survived))
ds.train <- train[-1]
cl.train <- train$Survived
test <- subset(titanic_full2, is.na(titanic_full2$Survived))

model.SVMR <- svm(ds.train, cl.train, type = "nu-regression")
pred.train <- predict(model.SVMR, ds.train)

# Turn the continuous score into a class code in one vectorised step:
# below the 1.4 cut-off -> 1, otherwise -> 2.
pred.train <- data.frame(Survived = ifelse(pred.train < 1.4, 1, 2))

# Pin both factors to the same two levels so the call still works when every
# prediction lands in one class; predictions are `data`, truth is `reference`.
# caret::confusionMatrix(cl.train, pred.train)
confusionMatrix(
  data = factor(pred.train[, "Survived"], levels = c(1, 2)),
  reference = factor(cl.train, levels = c(1, 2))
)
# KNN
library(class)
# 3-nearest-neighbour classifier. The training rows are also used as the
# query set, so `acc` is a resubstitution (training) accuracy.
pred <- knn(
  train = ds.train, test = ds.train, cl = cl.train,
  k = 3, prob = TRUE
)
acc <- mean(pred == cl.train)
acc
# Predictions go in `data`, the true labels in `reference`; the reference is
# given the prediction's levels so both factors are comparable.
m <- caret::confusionMatrix(
  data = pred,
  reference = factor(cl.train, levels = levels(pred))
)
m
# iris
# Column 5 is the species label; the remaining columns are the measurements.
label_col <- 5
ds.train <- iris[, -label_col]
cl.train <- iris[, label_col]
# 3-nearest-neighbour prediction on the same rows that were used for training.
pred <- knn(
  train = ds.train, test = ds.train, cl = cl.train,
  k = 3, prob = TRUE
)
acc <- mean(pred == cl.train)
table(pred, cl.train)
Sensitivity, Specificity 계산하기
# Evaluate the fitted SVM on the training rows.
train_rows <- titanic_full[trainIdx, ]
actual <- train_rows$Survived
# The model is given the training rows without the label column.
train <- subset(train_rows, select = -Survived)
predicted <- predict(svm.model, train)
# Predictions are `data`, true labels are `reference`; the printed summary
# includes Sensitivity and Specificity.
m <- caret::confusionMatrix(
  data = as.factor(predicted),
  reference = as.factor(actual)
)
m
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 480 102
1 69 240
Accuracy : 0.8081
95% CI : (0.7807, 0.8334)
No Information Rate : 0.6162
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5867
Mcnemar's Test P-Value : 0.0144
Sensitivity : 0.8743
Specificity : 0.7018
Pos Pred Value : 0.8247
Neg Pred Value : 0.7767
Prevalence : 0.6162
Detection Rate : 0.5387
Detection Prevalence : 0.6532
Balanced Accuracy : 0.7880
'기타 > R' 카테고리의 다른 글
[R]난생처음 R코딩&데이터 분석-9장 연습문제 (0) | 2021.11.21 |
---|---|
[R]난생처음 R코딩&데이터 분석-8장 연습문제 (0) | 2021.11.21 |
[R]난생처음 R코딩&데이터 분석-10장 개념 (0) | 2021.11.18 |
[R]난생처음 R코딩&데이터 분석-8장 개념 (0) | 2021.11.05 |
[R]난생처음 R코딩&데이터 분석-6장 연습문제 (0) | 2021.10.14 |