소품집

[Kaggle] Airbnb Data 시각화 및 regression (1) 본문

AI

[Kaggle] Airbnb Data 시각화 및 regression (1)

sodayeong 2020. 8. 26. 16:20
728x90

1일 1캐글 하깅

로깅하깅 

 

 

# Print the directory the session starts in, for reference.
getwd()

# Folder that holds the input CSV. The original author's path stays the
# default; set the AIRBNB_DATA_DIR environment variable to run the script on
# another machine without editing this file.
default_data_dir <- '/Users/dayeong/Desktop/reserch/data'
data_dir <- Sys.getenv('AIRBNB_DATA_DIR', unset = default_data_dir)
if (!nzchar(data_dir)) {
  # An empty override would make setwd() fail, so fall back to the default.
  data_dir <- default_data_dir
}
setwd(data_dir)


# kaggle 1DAY 
# https://www.kaggle.com/josipdomazet/mining-nyc-airbnb-data-using-r

library(tidyverse)
library(ggthemes)
library(GGally)
library(ggExtra)
library(caret)
library(glmnet)
library(corrplot)
library(leaflet)
library(kableExtra)
library(RColorBrewer)
library(plotly)
library(ggplot2)
library(knitr) # 간단하게 데이터 프레임 생성이 가능해짐!

# Load the NYC Airbnb listings. Empty strings are read as NA and text columns
# are kept as character vectors.
airbnb <- read.csv('AB_NYC_2019.csv', encoding = 'UTF-8', na.strings = c(''), stringsAsFactors = FALSE)

# Look at the basic summary statistics.
summary(airbnb)
# View() opens a data viewer, which is only useful in an interactive session.
if (interactive()) {
  View(airbnb)
}

# id and host_id are not needed for the analysis, so drop them.
# subset() takes the columns to drop through `select`, as unquoted names; the
# misspelled `delect` argument was silently ignored, so nothing was removed.
airbnb <- subset(airbnb, select = -c(id, host_id))

# Character columns that should be treated as categorical variables.
names_to_factor <- c('host_name', "neighbourhood_group", "neighbourhood", "room_type")
# Replace each listed column with its factor version.
airbnb[names_to_factor] <- lapply(airbnb[names_to_factor], as.factor)

# Sanity check: the four columns above should now be listed as factors.
glimpse(airbnb)

# Missing data: fraction of NA values in every column, reshaped to one row
# per column (variables = column name, percent_missing = NA fraction).
missing_airbnb <- airbnb %>%
  summarise_all(~(sum(is.na(.))/n())) %>%
  gather(key = 'variables', value = 'percent_missing')

# Horizontal bar chart of the missing fraction per column, with a note about
# the two columns whose missing share is very small.
ggplot(missing_airbnb, aes(x = variables, y = percent_missing)) +
  geom_col(aes(color = I('white')), fill = 'red', size = 0.3) +
  annotate("text", x = 1.5, y = 0.1,
           label = "host_name and name have less than 0.001\n percentage missing",
           color = "slateblue", size = 5) +
  coord_flip() +
  labs(title = "Missing Data", x = "Column name", y = "Percentage missing")


# Data visualisation
# Price (nightly rate)

# Histogram plus density curve of price. Mapping y to ..density.. puts the
# histogram on the density scale so both layers share one y-axis.
# The overall mean is computed once and reused for the marker line and label.
mean_price <- round(mean(airbnb$price), 2)

ggplot(data = airbnb, aes(price)) +
  geom_histogram(bins = 30, aes(y = ..density..), fill = 'purple') +
  geom_density(alpha = 0.2, fill = 'purple') +
  # 'log'[10] (subscript outside the quotes) lets plotmath render a real
  # subscript; quoting the whole 'log[10]' would print the brackets literally.
  ggtitle('Transformed distribution of price',
          subtitle = expression("With" ~ 'log'[10] ~ "transformation of x-axis")) +
  # Dotted vertical line at the mean price.
  geom_vline(xintercept = mean_price, size = 2, linetype = 3) +
  scale_x_log10() +
  annotate("text", x = 1800, y = 0.75, label = paste("Mean price = ", paste0(mean_price, "$")),
           color = "#32CD32", size = 8)

# Histogram and density of price on a log10 x-axis, split by neighbourhood
# group, to compare the different areas of New York.

# Mean price per neighbourhood group (rounded to 2 decimals), highest first.
airbnb_nh <- airbnb %>%
  group_by(neighbourhood_group) %>%
  summarise(price = round(mean(price), 2)) %>%
  arrange(desc(price))
airbnb_nh

# One panel per neighbourhood group; each panel gets a dotted line at that
# group's mean price and a text label placed to the right of the line.
ggplot(airbnb, aes(price)) +
  facet_wrap(~neighbourhood_group) +
  scale_x_log10() +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "purple") +
  geom_density(fill = "purple", alpha = 0.2) +
  geom_vline(aes(xintercept = price), data = airbnb_nh, linetype = 3, size = 2) +
  geom_text(aes(x = price + 1400, label = paste("Mean  = ",price)),
            data = airbnb_nh, y = 1.5, size = 4, color = "darkgreen") +
  labs(title = "Transformed distribution of price\n by neighbourhood groups",
       subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis"))

# Above-average-price listings by neighbourhood group.

# Keep listings priced at or above the overall mean, then count them per
# neighbourhood group and room type (the count lands in column n).
airbnb_tally <- airbnb %>%
  filter(price >= mean(price)) %>%
  group_by(neighbourhood_group, room_type) %>%
  tally()

# Stacked bar chart of those counts, coloured by room type.
ggplot(airbnb_tally, aes(x = reorder(neighbourhood_group, desc(n)), y = n, fill = room_type)) +
  geom_col() +
  labs(x = NULL, y = 'Number of objects')

# Boxplot of price by room type on a log10 y-axis, with a dotted red line at
# the overall mean price for reference.
ggplot(airbnb, aes(x = room_type, y = price)) +
  geom_boxplot(aes(fill = room_type)) +
  scale_y_log10() +
  ggtitle("Boxplot of price by room type",
          subtitle = 'Entire homes and apartments have the highest avg price') +
  geom_hline(yintercept = mean(airbnb$price), col = 'red', linetype = 3)

# Summary of price distributions for the most expensive listings.
# top_n() ranks by the LAST column of the data frame when `wt` is omitted,
# so the weight is given explicitly to rank by price, matching the sort on
# the line above. Ties on price can return more than 10 rows.
airbnb %>%
  arrange(desc(price)) %>%
  top_n(10, wt = price) %>%
  select(-host_name, -name) %>%
  ggplot(aes(x = price, fill = neighbourhood_group)) +
  geom_histogram(bins = 50) +
  scale_x_log10() +
  # One panel per room type / neighbourhood group combination present.
  facet_wrap(~room_type + neighbourhood_group)

##
# Machine Learning

# Dataset splitting (7:3)
# Fix the RNG seed so the random split, and every result derived from it,
# can be reproduced from run to run.
set.seed(42)

# Row-number id used as the key that separates train rows from test rows.
airbnb <- airbnb %>% mutate(id = row_number())
# 70% of the rows go to the training set; listings with a non-positive price
# are dropped after sampling.
airbnb_train <- airbnb %>% sample_frac(.7) %>% filter(price > 0)
# Everything not sampled into the training set becomes the test set, with the
# same price filter applied.
airbnb_test <- anti_join(airbnb, airbnb_train, by = 'id') %>% filter(price > 0)

# Sanity check: train and test together must cover every positive-price row.
nrow(airbnb_train) + nrow(airbnb_test) == nrow(airbnb %>% filter(price > 0))

# Linear regression baseline: price explained by location, room type,
# minimum stay, availability and neighbourhood group, fitted through caret.
first_model <- train(
  price ~ latitude + longitude + room_type + minimum_nights + availability_365 + neighbourhood_group,
  method = "lm",
  data = airbnb_train
)
# The author observed a score of 0.098 here and judged the fit poor
# (presumably R-squared; confirm against the summary output).
first_model %>% summary()

# Diagnostic plots for the underlying fitted model object.
plot(first_model[["finalModel"]])






 

728x90
Comments