소품집

[다변량분석] 회귀분석 - Prestige Data 잔차 분석, 모델 성능 비교 본문

Statistics

[다변량분석] 회귀분석 - Prestige Data 잔차 분석, 모델 성능 비교

sodayeong 2022. 12. 14. 10:42

티스토리에 다변량 유입이 많아서 ...

졸업전에 올려둡니당

 

 

(HW06)다변량분석-20181478_소다영-2 (1).pdf
3.76MB
(HW05)다변량분석-20181478소다영-2.pdf
6.54MB

library(psych)
library(MASS)
library(PerformanceAnalytics)
library(forecast)
library(Metrics)
library(MLmetrics)
library(carData)

# 데이터 확인 
str(Prestige)
summary(Prestige)

# 결측치 제거 
data <- na.omit(Prestige)

# 종속변수 income log적용 
data$income
summary(data$income)

data$income <- log(data$income)
summary(data$income)

data <- data[,-6]

str(data)
summary(data)

# 상관관계 확인 
chart.Correlation(data, histogram = T)
cor(data)

# 데이터 나누기 
set.seed(1606)
n <- nrow(data)
idx <- 1:n
training_idx <- sample(idx, n * .60)
idx <- setdiff(idx, training_idx)
validate_idx <- sample(idx, n * .20) 
test_idx <- setdiff(idx, validate_idx)
training <- data[training_idx,]
validation <- data[validate_idx,]
test <- data[test_idx,]

# 1차항만 고려한 선형회귀(1) 
data_lm_full <- lm(income~., data=training)
summary(data_lm_full)

data_lm_full_val <- lm(income~., data=validation)
summary(data_lm_full_val)

# training 잔차
par(mfrow=c(1,3))
hist(data_lm_full$residuals)
plot(data_lm_full, which=1)
plot(data_lm_full, which=2)

par(mfrow=c(1,3))
hist(data_lm_full_val$residuals)
plot(data_lm_full_val, which=1)
plot(data_lm_full_val, which=2)

# validation 예측
predict_full <- predict(data_lm_full, newdata = validation)
# 잔차평가
vali_residuals_full<- validation$income - predict_full
hist(vali_residuals_full)
plot(predict_full,vali_residuals_full)
abline(h = 0, col = "red", lty =2)
qqnorm(vali_residuals_full)
qqline(vali_residuals_full, col = "blue") 

# 상호작용항까지 고려한 선형회귀(2)
data_lm_2 <- lm(income~.^2, data=training)
data_lm_2_val <- lm(income~.^2, data=validation)
summary(data_lm_2_val)

par(mfrow=c(1,3))
hist(data_lm_2$residuals)
plot(data_lm_2, which=1)
plot(data_lm_2, which=2)

# validation 예측
predict_full_2 <- predict(data_lm_2, newdata = validation)
# 잔차 평가(validation)
vali_residuals_full_2 <- validation$income - predict_full_2
hist(vali_residuals_full_2)
plot(predict_full_2,vali_residuals_full_2)
abline(h = 0, col = "red", lty =2)
qqnorm(vali_residuals_full_2)
qqline(vali_residuals_full_2, col = "blue")  


# 변수 선택법 활용하기 
# stepAIC 함수 활용 

# step both
data_step_both <- stepAIC(data_lm_full, direction = 'both',scope=list(upper=~.^2, lower=~1))
data_step_both_val <- stepAIC(data_lm_full_val, direction = 'both',scope=list(upper=~.^2, lower=~1))
summary(data_step_both)

par(mfrow=c(1,3))
hist(data_step_both$residuals)
plot(data_step_both, which = 1)
plot(data_step_both, which=2)

# validation 예측
predict_both <- predict(data_step_both, newdata = validation)
# 잔차 평가(validation)
vali_residuals_both <- validation$income - predict_both
hist(vali_residuals_both)
plot(predict_both,vali_residuals_both)
abline(h = 0, col = "red", lty =2)
qqnorm(vali_residuals_both)
qqline(vali_residuals_both, col = "blue") 


# step backward
data_step_back <- stepAIC(data_lm_full, direction = 'backward', scope=list(upper=~.^2, lower=~1))
data_step_back_val <- stepAIC(data_lm_full_val, direction = 'backward', scope=list(upper=~.^2, lower=~1))
summary(data_step_back)

par(mfrow=c(1,3))
hist(data_step_back$residuals)
plot(data_step_back, which = 1)
plot(data_step_back, which=2)

# validation 예측
predict_backward <- predict(data_step_back, newdata = validation)
# 잔차 평가(validation)
vali_residuals_backward <- validation$income - predict_backward
hist(vali_residuals_backward)
plot(predict_backward,vali_residuals_backward)
abline(h = 0, col = "red", lty =2)
qqnorm(vali_residuals_backward)
qqline(vali_residuals_backward, col = "blue") 


# step forward
data_step_forward <- stepAIC(data_lm_full, direction = 'forward', scope=list(upper=~.^2, lower=~1))
data_step_forward_val <- stepAIC(data_lm_full_val, direction = 'forward', scope=list(upper=~.^2, lower=~1))

summary(data_step_forward)
par(mfrow=c(1,3))
hist(data_step_forward$residuals)
plot(data_step_forward, which = 1)
plot(data_step_forward, which=2)

# validation 예측
predict_forward <- predict(data_step_forward, newdata = validation)
# 잔차 평가(validation)
vali_residuals_forward <- validation$income - predict_forward
hist(vali_residuals_forward)
plot(predict_forward,vali_residuals_forward)
abline(h = 0, col = "red", lty =2)
qqnorm(vali_residuals_forward)
qqline(vali_residuals_forward, col = "blue") 

# 성능평가 
# RMSE 함수
RMSE <- function(actual, predict){
  sqrt(mean((actual - predict)^2))
}

# 모형 비교 
library(rsq)
# predict 
predict_full <- predict(data_lm_full, newdata = validation)
predict_full_2 <- predict(data_lm_2, newdata = validation)
predict_both <- predict(data_step_both, newdata = validation)
predict_back <- predict(data_step_back, newdata = validation)
predict_forward <- predict(data_step_forward, newdata = validation)

full.model <- c(length(coef(data_lm_full)), rsq(data_lm_full), rsq(data_lm_full, adj=TRUE), 
                RMSE(training$income, predict(data_lm_full)), RMSE(validation$income, predict_full))

full2.model <- c(length(coef(data_lm_2)), rsq(data_lm_2), rsq(data_lm_2, adj=TRUE), 
                 RMSE(training$income, predict(data_lm_2)), RMSE(validation$income, predict_full_2))

both.model <- c(length(coef(data_step_both)), rsq(data_step_both), rsq(data_step_both, adj=TRUE), 
                RMSE(training$income, predict(data_step_both)), RMSE(validation$income, predict_both))

backward.model <- c(length(coef(data_step_back)), rsq(data_step_back), rsq(data_step_back, adj=TRUE), 
                    RMSE(training$income, predict(data_step_back)), RMSE(validation$income, predict_backward))

forward.model <- c(length(coef(data_step_forward)), rsq(data_step_forward), rsq(data_step_forward, adj=TRUE), 
                   RMSE(training$income, predict(data_step_forward)), RMSE(validation$income, predict_forward))

model_test <- data.frame(full.model, full2.model,both.model, backward.model, forward.model)

names(model_test) <- c("선형회귀", "이차항 선형회귀", "Both", "Backward", "Forward")
rownames(model_test) <- c("계수 개수", "결정계수", "수정된 결정계수", "RMSE 학습", "RMSE 검증")

View(model_test)

회귀분석 - Prestige Data

728x90
Comments