소품집

[다변량 분석] 캐글 Mushrooms Data Classification 본문

Statistics

[다변량 분석] 캐글 Mushrooms Data Classification

sodayeong 2021. 9. 29. 23:37
728x90
# Switch to the folder that holds mushrooms.csv when it exists on this
# machine. On any other machine the script stays in the current working
# directory instead of aborting with "cannot change working directory".
data_dir <- '/Users/dayeong/Desktop/21-2/전공/다변량 분석'
if (dir.exists(data_dir)) {
  setwd(data_dir)
}

# Package
library(dplyr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)

# Data set loading.
# stringsAsFactors = TRUE is spelled out because read.csv() has kept text
# columns as character since R 4.0, and the models fitted later in this script
# need categorical (factor) predictors.
mushrooms <- read.csv('mushrooms.csv', stringsAsFactors = TRUE)

# Chi-squared test between the target (class) and every other column.
# Only results significant at the 5% level are printed, preceded by the name
# of the variable that was tested. The result is kept in `chi_res` so that it
# does not leave behind an object called `test`, which this script later uses
# for the hold-out data frame.
for (i in seq_len(ncol(mushrooms))[-1]) {
  chi_res <- chisq.test(table(mushrooms$class, mushrooms[, i]))
  if (!is.na(chi_res$p.value) && chi_res$p.value < 0.05) {
    cat('Variable:', names(mushrooms)[i], '\n')
    print(chi_res)
  }
}

# Inspect the distribution of the target variable.
ggplot(data = mushrooms, aes(x = class, fill = class)) +
  geom_bar() +
  labs(title = 'Mushroom Class Count', subtitle = 'Edible vs Poisonous')

# veil.type takes a single level ('p') in every row (per the original note),
# so it carries no information and is removed. The column is dropped by name
# rather than by position 17 so that a reordered file cannot silently drop
# the wrong variable.
# NOTE(review): the original note read 'p' as "poisonous"; for veil.type it
# presumably encodes the veil kind rather than the class - confirm against the
# data dictionary.
mushrooms <- mushrooms[, setdiff(names(mushrooms), 'veil.type'), drop = FALSE]

# Put 'p' first so it is the first level of the target factor.
mushrooms$class <- factor(mushrooms$class, levels = c('p', 'e'))
summary(mushrooms)

# Train / test split (70% / 30%).
# A fixed seed makes the partition, and therefore the reported metrics,
# reproducible from run to run.
set.seed(2021)
n_obs <- nrow(mushrooms)
idx <- sample(seq_len(n_obs), floor(n_obs * 0.7))
train <- mushrooms[idx, ]
test <- mushrooms[-idx, ]

# Random forest: fit on the training rows, predict the held-out rows and
# summarise the agreement with the true labels.
rf_model <- randomForest(class ~ ., data = train)
pred <- predict(rf_model, newdata = test)
confusionMatrix(pred, test$class)

# Decision tree
# The train/test partition created for the random forest is reused here.
# Drawing a second random split would evaluate the two models on different
# hold-out rows, making their confusion matrices incomparable.
tree <- rpart(class ~ ., data = train)
summary(tree)

# type = 'class' returns predicted labels rather than class probabilities.
pred <- predict(tree, newdata = test, type = 'class')
confusionMatrix(pred, test$class)

# Variable importance.
# importance() and varImpPlot() belong to randomForest and stop with an error
# when given an rpart fit. An rpart object keeps its scores in the
# `variable.importance` component, so that is read and plotted directly.
importance(rf_model)
tree$variable.importance
varImpPlot(rf_model)
if (!is.null(tree$variable.importance)) {
  # NULL when the tree has no splits; nothing to plot in that case.
  dotchart(sort(tree$variable.importance),
           main = 'tree', xlab = 'Variable importance')
}

# Store the tree predictions next to the true labels. test$class must stay
# untouched: overwriting it with predictions would make the plot below compare
# the predictions with themselves.
test$pred <- pred

# Actual class (x) versus predicted class (y); points off the diagonal are
# misclassified observations.
ggplot(data = test, aes(class, pred)) +
  geom_jitter(width = 0.2, height = 0.1, size = 2)

(HW02)다변량분석-20181478 소다영.pdf
1.39MB

 

728x90
Comments