House Sales in King County, USA

The King County, USA data come from Kaggle; they cover houses sold between May 2014 and May 2015. https://www.kaggle.com/harlfoxem/housesalesprediction

DATA

##     bedrooms  bathrooms sqft_living   sqft_lot     floors  waterfront
## 1 -0.3987279 -1.4474301  -0.9798124 -0.2283160 -0.9154058 -0.08717061
## 2 -0.3987279  0.1756026   0.5336220 -0.1898810  0.9364841 -0.08717061
## 3 -1.4739253 -1.4474301  -1.4262210 -0.1232956 -0.9154058 -0.08717061
## 4  0.6764694  1.1494223  -0.1305470 -0.2440088 -0.9154058 -0.08717061
## 5 -0.3987279 -0.1490039  -0.4354115 -0.1696495 -0.9154058 -0.08717061
## 6  0.6764694  3.0970615   3.6367068  2.0961362 -0.9154058 -0.08717061
##         view  condition      grade sqft_above sqft_basement   yr_built
## 1 -0.3057524 -0.6291723 -0.5588228 -0.7346906    -0.6586658 -0.5448852
## 2 -0.3057524 -0.6291723 -0.5588228  0.4608302     0.2451357 -0.6810627
## 3 -0.3057524 -0.6291723 -1.4095545 -1.2298053    -0.6586658 -1.2938619
## 4 -0.3057524  2.4442374 -0.5588228 -0.8916782     1.3974827 -0.2044412
## 5 -0.3057524 -0.6291723  0.2919089 -0.1308922    -0.6586658  0.5445355
## 6 -0.3057524 -0.6291723  2.8441039  2.5378966     2.7983750  1.0211570
##   yr_renovated re.region sqft_living15 sqft_lot15   category
## 1   -0.2101235         E    -0.9433334 -0.2607094 Economical
## 2    4.7465678         D    -0.4326762 -0.1878634 Economical
## 3   -0.2101235         D     1.0701150 -0.1723713 Economical
## 4   -0.2101235         B    -0.9141530 -0.2845153 Economical
## 5   -0.2101235         D    -0.2721839 -0.1928443 Economical
## 6   -0.2101235         D     4.0465169  3.2654907 Economical
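The preview above already shows standardized (z-scored) predictors plus two derived variables, re.region and category; the preprocessing itself is not echoed in this report. The following is a minimal sketch of how such a table could be built, assuming the Kaggle file kc_house_data.csv, illustrative price-quantile cut points for category, and a zipcode-based regrouping for re.region — all of these are assumptions, not the steps actually used.

# Hedged reconstruction of the preprocessing (assumptions, not the original code)
library(dplyr)

data <- read.csv("kc_house_data.csv")   # assumed file name from Kaggle

# Assumed: four price classes cut at illustrative quantiles of price
data$category <- cut(data$price,
                     breaks = quantile(data$price, c(0, .25, .75, .97, 1)),
                     labels = c("Slum", "Economical", "Expensive", "Luxury"),
                     include.lowest = TRUE)

# data_2: z-scored predictors plus the derived factors
# (re.region is assumed to be a zipcode-based regrouping, built separately)
data_2 <- data
num.cols <- sapply(data_2, is.numeric)
num.cols[c("id", "price", "zipcode", "lat", "long")] <- FALSE  # leave these unscaled
data_2[num.cols] <- scale(data_2[num.cols])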
# Map with the price-category labels
library(leaflet)
m <- leaflet(data) %>% setView(lng=median(data$long), lat=median(data$lat), zoom = 9)
pal <- colorFactor(c("blue" ,"red","yellow","pink"),
                   domain = c("Economical","Luxury","Expensive","Slum"))
m %>% addTiles() %>% 
  addCircleMarkers(
    radius = ~ifelse(category == "Luxury", 3, 2),
    color = ~pal(category),
    stroke = FALSE, fillOpacity = 0.8
  )
## Assuming "long" and "lat" are longitude and latitude, respectively
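Neither map draws a legend, so the color coding has to be read off pal. leaflet's addLegend() can attach one; a sketch for the category map, reusing m, pal and data from the chunk above:

# Same markers as above, plus a legend mapping colors to categories
m %>% addTiles() %>% 
  addCircleMarkers(
    radius = ~ifelse(category == "Luxury", 3, 2),
    color = ~pal(category),
    stroke = FALSE, fillOpacity = 0.8
  ) %>%
  addLegend("bottomright", pal = pal, values = ~category, title = "category")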

re.region

# Map colored by `re.region`
library(leaflet)
m <- leaflet(data) %>% setView(lng=median(data$long), lat=median(data$lat), zoom = 9)
pal <- colorFactor(c("#A20055", "#FFFF00", "#00DD00", "#4400CC", "#CC00FF", "#FF8800"),
                   domain = c("A","B","C","D","E","F"))  # one color per region
m %>% addTiles() %>% 
  addCircleMarkers(
    radius = ~ifelse(data_2$re.region == "A", 2, 1),
    color = ~pal(data_2$re.region),
    stroke = FALSE, fillOpacity = 0.8
  )
## Assuming "long" and "lat" are longitude and latitude, respectively
re.region : F


train & test data

smp.size <- floor(0.8*nrow(data_2)) 
set.seed(516)                     
train.ind <- sample(seq_len(nrow(data_2)), smp.size)
train <- data_2[train.ind, ] # 80%
test <- data_2[-train.ind, ] # 20%
head(train)
##         bedrooms  bathrooms sqft_living    sqft_lot     floors  waterfront
## 10188 -0.3987279  0.5002092   0.2396455  0.30079378  0.9364841 -0.08717061
## 994   -0.3987279 -0.1490039  -0.6422838 -0.18000665 -0.9154058 -0.08717061
## 11173  2.8268641  3.0970615   1.5462076 -0.15941299  0.9364841 -0.08717061
## 4737  -0.3987279 -0.4736105  -0.8491562 -0.15226677 -0.9154058 -0.08717061
## 13498 -0.3987279  0.5002092  -0.2285392 -0.33005308  0.9364841 -0.08717061
## 13082 -0.3987279 -0.4736105  -0.6096198 -0.09710087 -0.9154058 -0.08717061
##             view  condition      grade  sqft_above sqft_basement
## 10188 -0.3057524 -0.6291723  1.1426405  0.61781776    -0.6586658
## 994   -0.3057524 -0.6291723 -0.5588228 -0.36033564    -0.6586658
## 11173 -0.3057524 -0.6291723 -0.5588228  2.06693389    -0.6586658
## 4737  -0.3057524  0.9075325  0.2919089 -0.58977903    -0.6586658
## 13498 -0.3057524 -0.6291723 -0.5588228  0.09855114    -0.6586658
## 13082 -0.3057524 -0.6291723  0.2919089 -0.32410773    -0.6586658
##         yr_built yr_renovated re.region sqft_living15 sqft_lot15
## 10188  0.8168906   -0.2101235         D     0.7199501  0.1421593
## 994    0.5785799   -0.2101235         E    -0.5785782 -0.1821500
## 11173  0.3062247   -0.2101235         D    -0.6369391 -0.1577947
## 4737  -0.2725300   -0.2101235         C    -0.9287432 -0.1453424
## 13498  1.1232902   -0.2101235         E    -0.3889056 -0.3998090
## 13082  0.4083579   -0.2101235         D     0.4719166 -0.1143215
##         category
## 10188 Economical
## 994         Slum
## 11173       Slum
## 4737   Expensive
## 13498       Slum
## 13082 Economical
head(test)
##      bedrooms  bathrooms sqft_living   sqft_lot      floors  waterfront
## 3  -1.4739253 -1.4474301  -1.4262210 -0.1232956 -0.91540583 -0.08717061
## 10 -0.3987279  0.5002092  -0.2067632 -0.2063463  0.93648411 -0.08717061
## 19 -1.4739253 -1.4474301  -0.9580363 -0.1269170 -0.91540583 -0.08717061
## 26 -0.3987279 -0.1490039  -0.4027475 -0.2513240  0.01053914 -0.08717061
## 39  0.6764694 -1.4474301  -0.9362603 -0.1697702 -0.91540583 -0.08717061
## 42  0.6764694  0.1756026   2.3301448  0.2191917 -0.91540583 -0.08717061
##          view  condition      grade  sqft_above sqft_basement    yr_built
## 3  -0.3057524 -0.6291723 -1.4095545 -1.22980532   -0.65866580 -1.29386186
## 10 -0.3057524 -0.6291723 -0.5588228  0.12270307   -0.65866580  1.08924580
## 19 -0.3057524  0.9075325 -0.5588228 -0.71053870   -0.65866580 -1.70239460
## 26 -0.3057524  2.4442374 -1.4095545 -0.09466435   -0.65866580 -1.02150670
## 39 -0.3057524 -2.1658772 -0.5588228 -1.08489371    0.08697046 -0.06826364
## 42 -0.3057524 -0.6291723  0.2919089  0.98009679    3.00173039  0.44240229
##    yr_renovated re.region sqft_living15 sqft_lot15   category
## 3    -0.2101235         D     1.0701150 -0.1723713 Economical
## 10   -0.2101235         E     0.5886382 -0.1903905       Slum
## 19   -0.2101235         E    -1.3518591 -0.2810359       Slum
## 26   -0.2101235         E    -1.3956297 -0.2953195       Slum
## 39   -0.2101235         E    -1.0162844 -0.1819669 Economical
## 42   -0.2101235         E     0.6178186  0.6536928 Economical
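The split is purely random rather than stratified, so it is worth confirming that the four classes — Luxury in particular, which is rare — appear in similar proportions in both sets:

# Class proportions in train vs. test; large gaps would suggest re-splitting
round(rbind(train = prop.table(table(train$category)),
            test  = prop.table(table(test$category))), 3)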

SVM

library(e1071)
svm.model <- svm(formula = factor(category) ~ .,  # the response must be a factor
            data = train)  
summary(svm.model) # shows the SVM's default parameter settings
## 
## Call:
## svm(formula = factor(category) ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.04761905 
## 
## Number of Support Vectors:  10848
## 
##  ( 4936 2306 3056 550 )
## 
## 
## Number of Classes:  4 
## 
## Levels: 
##  Economical Expensive Luxury Slum
train.pred <- predict(svm.model, train)
table(real=train$category, predict=train.pred)
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       6912       775      5  954
##   Expensive        1150      2591     21   11
##   Luxury             47       403    106    0
##   Slum              956        34      0 3325
#training error
1-sum(diag(table(real=train$category, predict=train.pred)))/sum(table(real=train$category, predict=train.pred))
## [1] 0.2519375
# i.e., the misclassification rate on the training data
test.pred <- predict(svm.model, test)
confus.matrix <- table(real=test$category, predict=test.pred)
confus.matrix
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       1693       224      1  238
##   Expensive         306       609      7    1
##   Luxury             13       109     30    0
##   Slum              264        14      0  814
#testing error
1-sum(diag(confus.matrix))/sum(confus.matrix)
## [1] 0.2722646
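The model above runs with e1071's defaults (cost = 1, gamma = 1/p). A cross-validated grid search over those two RBF hyperparameters might reduce the ~27% test error; a sketch with an illustrative (and, on ~17,000 rows, slow) grid:

# Grid search over cost and gamma via cross-validation (illustrative grid)
svm.tune <- tune.svm(factor(category) ~ ., data = train,
                     cost = c(0.1, 1, 10), gamma = c(0.01, 0.05, 0.1))
summary(svm.tune)            # best cost/gamma by CV error
svm.best <- svm.tune$best.model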

GBDT

library(gbm)
## Loaded gbm 2.1.5
gbm <- gbm(category ~ ., data = train, distribution = "multinomial", 
            n.trees = 1000, n.minobsinnode = 10, cv.folds = 5)

best.iter <- gbm.perf(gbm,method="cv")

print(best.iter)
## [1] 1000
#training error
gbm.predict <- as.data.frame(predict(gbm,train))
## Using 1000 trees...
head(gbm.predict)
##   Economical.1000 Expensive.1000 Luxury.1000 Slum.1000
## 1       1.3174820     -0.9287836   -5.005131 -2.320543
## 2       1.1466451     -3.2002124   -6.513454  2.272589
## 3       1.9762202     -3.1827462   -6.656051  3.006480
## 4       0.1911357      1.4040416   -2.516033 -4.891224
## 5       1.5081256     -3.9032253   -6.907769  3.428508
## 6       0.8802032     -0.5016033   -4.841015 -2.733155
# which.max gives the column index of the largest value; map 1-4 back to labels
p.gbm.predict <- as.data.frame(apply(gbm.predict, 1, which.max))
colnames(p.gbm.predict) <- "gbm.predict"
p.gbm.predict <- p.gbm.predict %>%
  mutate(gbm.predict = gsub("1","Economical",gbm.predict)) %>%
  mutate(gbm.predict = gsub("2","Expensive",gbm.predict)) %>%
  mutate(gbm.predict = gsub("3","Luxury",gbm.predict)) %>%
  mutate(gbm.predict = gsub("4","Slum",gbm.predict))
  
confus.matrix <- table(real = train$category, predict = p.gbm.predict$gbm.predict)
confus.matrix
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       7024       715     14  893
##   Expensive        1257      2439     66   11
##   Luxury             42       340    170    4
##   Slum              910        30      4 3371
error.rate <- 1-sum(diag(confus.matrix))/sum(confus.matrix)
error.rate
## [1] 0.247889
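The gsub() chain above works because which.max returns column positions in the order of the factor levels. An equivalent route is to ask predict.gbm for class probabilities with type = "response" and read the labels straight off the column names; a sketch:

# Probabilities per class, then the most probable label per row
prob <- predict(gbm, train, n.trees = best.iter, type = "response")
pred.label <- colnames(prob)[apply(prob[, , 1], 1, which.max)]
table(real = train$category, predict = pred.label)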
#testing error
gbm.predict <- as.data.frame(predict(gbm,test))
## Using 1000 trees...
# Same index-to-label mapping as for the training predictions
p.gbm.predict <- as.data.frame(apply(gbm.predict, 1, which.max))
colnames(p.gbm.predict) <- "gbm.predict"
p.gbm.predict <- p.gbm.predict %>%
  mutate(gbm.predict = gsub("1","Economical",gbm.predict)) %>%
  mutate(gbm.predict = gsub("2","Expensive",gbm.predict)) %>%
  mutate(gbm.predict = gsub("3","Luxury",gbm.predict)) %>%
  mutate(gbm.predict = gsub("4","Slum",gbm.predict))
  
confus.matrix <- table(real = test$category, predict = p.gbm.predict$gbm.predict)
confus.matrix
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       1718       188      8  242
##   Expensive         315       582     25    1
##   Luxury             15        87     47    3
##   Slum              258        15      1  818
error.rate <- 1-sum(diag(confus.matrix))/sum(confus.matrix)
error.rate
## [1] 0.2678695
summary(gbm, n.trees = best.iter) # relative importance of each explanatory variable

##                         var    rel.inf
## re.region         re.region 53.5978800
## sqft_living     sqft_living 12.8116832
## yr_built           yr_built  5.2109368
## grade                 grade  4.5566718
## sqft_living15 sqft_living15  3.4359664
## sqft_lot15       sqft_lot15  3.3855800
## view                   view  2.7951582
## sqft_above       sqft_above  2.7847761
## sqft_lot           sqft_lot  2.7205827
## sqft_basement sqft_basement  2.5049506
## waterfront       waterfront  1.5983205
## condition         condition  1.3503481
## bedrooms           bedrooms  0.9257865
## bathrooms         bathrooms  0.8880155
## yr_renovated   yr_renovated  0.8306173
## floors               floors  0.6027263

Adaboost

library(adabag)
## Loading required package: rpart
## Loading required package: caret
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
adaboost.m <- boosting(category ~ ., data = train)
#training error
train.adaboost.predict <-predict(adaboost.m,train)
train.confus.matrix <- table(real = train$category, predict = train.adaboost.predict$class)
train.confus.matrix
##             predict
## real         Economical Expensive Slum
##   Economical       6578       988 1080
##   Expensive        1382      2352   39
##   Luxury             89       463    4
##   Slum             1006        57 3252
train.error.rate <- train.adaboost.predict$error
train.error.rate
## [1] 0.2954309
#testing error
adaboost.predict <- predict(adaboost.m,test)
confus.matrix <- table(real = test$category, predict = adaboost.predict$class)
confus.matrix
##             predict
## real         Economical Expensive Slum
##   Economical       1621       257  278
##   Expensive         350       569    4
##   Luxury             26       124    2
##   Slum              272        23  797
error.rate <- adaboost.predict$error
error.rate
## [1] 0.3090446
#AdaBoost algorithm with different numbers of classifiers
error <- as.numeric()
for(i in 1:60){
  adaboost_fit <- boosting(category~., data=train, mfinal=i)
  adaboost_predict <- predict.boosting(adaboost_fit,newdata = test)
  error[i] <- adaboost_predict$error
}
plot(error,type = "l")
points(which(error == min(error)),min(error),col = "pink",pch=19,cex=1.2)
text(which(error == min(error))+5,min(error)+0.001,round(min(error),4),cex=0.7)
abline(h=error.rate,col="red")  # test error of the default model (mfinal = 100)
text(5,error.rate,"mfinal = 100",cex=0.7)
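Having located the best mfinal, one could refit at that value and also inspect adabag's variable importance for comparison with the GBM ranking above; a sketch, assuming error from the loop:

# Refit at the mfinal with the lowest test error, then check importance
best.mfinal <- which.min(error)
adaboost.best <- boosting(category ~ ., data = train, mfinal = best.mfinal)
sort(adaboost.best$importance, decreasing = TRUE)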

xgboost

library(Matrix)   # for sparse.model.matrix()
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
xgb.train <- sparse.model.matrix(category ~ .-1, data = train)
xgb.test <- sparse.model.matrix(category ~ .-1, data = test)
Y <- as.integer(data_2$category) - 1
param <- list("objective" = "multi:softprob",
             "eval_metric" = "mlogloss",
             "num_class" = 4)
cv.model <- xgb.cv(
  params = param, 
  data = xgb.train,
  label=Y[train.ind],
  nfold = 5,     # 5-fold CV
  nrounds=200,   # evaluate models of 1-200 boosting rounds
  # stop early if the test metric has not improved for 30 rounds --
  # once overfitting sets in, there is no point tuning further
  early_stopping_rounds = 30, 
  print_every_n = 20 # print results every 20 rounds
) 
## [1]  train-mlogloss:1.129162+0.001991    test-mlogloss:1.141337+0.003697 
## Multiple eval metrics are present. Will use test_mlogloss for early stopping.
## Will train until test_mlogloss hasn't improved in 30 rounds.
## 
## [21] train-mlogloss:0.492367+0.003904    test-mlogloss:0.624576+0.013018 
## [41] train-mlogloss:0.407712+0.003773    test-mlogloss:0.613407+0.013818 
## [61] train-mlogloss:0.353745+0.004061    test-mlogloss:0.614434+0.014775 
## Stopping. Best iteration:
## [48] train-mlogloss:0.387375+0.003866    test-mlogloss:0.612533+0.014233
tmp <- cv.model$evaluation_log

plot(x=1:nrow(tmp), y= tmp$train_mlogloss_mean
     , col='red', xlab="nround", ylab="error", main="Avg.Performance in CV") 
points(x=1:nrow(tmp), y= tmp$test_mlogloss_mean, col='blue') 
legend("topright", pch=1, col = c("red", "blue"), 
       legend = c("Train", "Validation") )

best.nrounds <- cv.model$best_iteration 
best.nrounds
## [1] 48
xgb.m <- xgboost(params = param, data = xgb.train, label = Y[train.ind], nrounds = best.nrounds)
## [1]  train-mlogloss:1.130233 
## [2]  train-mlogloss:0.982262 
## [3]  train-mlogloss:0.878376 
## [4]  train-mlogloss:0.805090 
## [5]  train-mlogloss:0.750401 
## [6]  train-mlogloss:0.709056 
## [7]  train-mlogloss:0.676003 
## [8]  train-mlogloss:0.649906 
## [9]  train-mlogloss:0.627689 
## [10] train-mlogloss:0.609096 
## [11] train-mlogloss:0.593382 
## [12] train-mlogloss:0.579202 
## [13] train-mlogloss:0.568102 
## [14] train-mlogloss:0.557642 
## [15] train-mlogloss:0.547872 
## [16] train-mlogloss:0.540220 
## [17] train-mlogloss:0.532291 
## [18] train-mlogloss:0.524282 
## [19] train-mlogloss:0.518107 
## [20] train-mlogloss:0.510698 
## [21] train-mlogloss:0.506133 
## [22] train-mlogloss:0.500622 
## [23] train-mlogloss:0.495493 
## [24] train-mlogloss:0.490939 
## [25] train-mlogloss:0.485078 
## [26] train-mlogloss:0.481138 
## [27] train-mlogloss:0.475833 
## [28] train-mlogloss:0.470954 
## [29] train-mlogloss:0.467318 
## [30] train-mlogloss:0.463604 
## [31] train-mlogloss:0.459022 
## [32] train-mlogloss:0.455061 
## [33] train-mlogloss:0.452483 
## [34] train-mlogloss:0.448760 
## [35] train-mlogloss:0.445700 
## [36] train-mlogloss:0.441747 
## [37] train-mlogloss:0.438776 
## [38] train-mlogloss:0.435880 
## [39] train-mlogloss:0.433537 
## [40] train-mlogloss:0.430380 
## [41] train-mlogloss:0.426177 
## [42] train-mlogloss:0.422699 
## [43] train-mlogloss:0.419195 
## [44] train-mlogloss:0.416899 
## [45] train-mlogloss:0.413884 
## [46] train-mlogloss:0.410645 
## [47] train-mlogloss:0.408554 
## [48] train-mlogloss:0.405254
# multi:softprob returns a flat vector of n*4 probabilities:
# reshape to an n x 4 matrix, then take the most probable class per row
xgb.pred.train <- predict(xgb.m,xgb.train)
xgb.pred.train <- t(matrix(xgb.pred.train,4,length(xgb.pred.train)/4))
xgb.pred.train <- levels(data_2$category)[max.col(xgb.pred.train)]
xgb.pred.train <- factor(xgb.pred.train,levels=levels(data_2$category))
confusion.matrix.train <- table(real = train$category,predict = xgb.pred.train)
confusion.matrix.train
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       7526       465      2  653
##   Expensive         725      3031     11    6
##   Luxury             23       128    405    0
##   Slum              582        25      0 3708
training.error <- 1-sum(diag(confusion.matrix.train))/sum(confusion.matrix.train)
training.error
## [1] 0.1515327
xgb.pred.test <- predict(xgb.m,xgb.test)
xgb.pred.test <- t(matrix(xgb.pred.test,4,length(xgb.pred.test)/4))
xgb.pred.test <- levels(data_2$category)[max.col(xgb.pred.test)]
xgb.pred.test <- factor(xgb.pred.test,levels=levels(data_2$category))
confusion.matrix.test <- table(real = test$category,predict = xgb.pred.test)
confusion.matrix.test
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       1723       199      8  226
##   Expensive         304       585     34    0
##   Luxury             12        81     58    1
##   Slum              268        16      1  807
testing.error <- 1-sum(diag(confusion.matrix.test))/sum(confusion.matrix.test)
testing.error 
## [1] 0.266019
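The matrix reshape above is needed because multi:softprob returns one flat vector of n × 4 probabilities. With "objective" = "multi:softmax" instead, predict() returns 0-based class indices directly and the reshape disappears; a sketch:

# multi:softmax returns class indices, so no reshape is needed
param.sm <- list("objective" = "multi:softmax",
                 "eval_metric" = "mlogloss",
                 "num_class" = 4)
xgb.sm <- xgboost(params = param.sm, data = xgb.train,
                  label = Y[train.ind], nrounds = best.nrounds, verbose = 0)
pred.class <- levels(data_2$category)[predict(xgb.sm, xgb.test) + 1]
table(real = test$category, predict = pred.class)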
imp <- xgb.importance(xgb.m$feature_names,model=xgb.m)
xgb.plot.importance(imp)

randomForest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
randomForest.m <- randomForest(category ~., data = train,
                               importance = TRUE,proximity=TRUE,
                               ntree = 100,norm.votes=TRUE)
plot(randomForest.m,col=c("pink","blue","red","green","yellow"),
     type = "l",lwd = 2)
legend(65,0.65,lty=c(1,2,2,2,2),col = c("pink","blue","red","green","yellow"), 
      legend = c("OOB",levels(train$category)),cex=0.6 )

tuneRF(train[,-17], train[,17])  # tune mtry (column 17 is the response, category)
## mtry = 4  OOB error = 26.69% 
## Searching left ...
## mtry = 2     OOB error = 27.24% 
## -0.02058505 0.05 
## Searching right ...
## mtry = 8     OOB error = 26.82% 
## -0.004983749 0.05

##       mtry  OOBError
## 2.OOB    2 0.2724118
## 4.OOB    4 0.2669173
## 8.OOB    8 0.2682475
rf.model <- randomForest(category ~.,
                        data = train,
                        ntree = 60,       
                        mtry = 4)
rf.predict <- predict(rf.model, train)
confusion.matrix.train <- table(real = train$category,predict = rf.predict)
confusion.matrix.train
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       8629         5      1   11
##   Expensive          12      3758      0    3
##   Luxury              4         4    548    0
##   Slum               30         4      0 4281
training.error <- 1-sum(diag(confusion.matrix.train))/sum(confusion.matrix.train)
training.error 
## [1] 0.004279931
rf.predict <- predict(rf.model, test)
confusion.matrix.test <- table(real = test$category,predict = rf.predict)
confusion.matrix.test
##             predict
## real         Economical Expensive Luxury Slum
##   Economical       1722       212      3  219
##   Expensive         300       606     16    1
##   Luxury             15        90     46    1
##   Slum              291        13      0  788
testing.error <- 1-sum(diag(confusion.matrix.test))/sum(confusion.matrix.test)
testing.error  
## [1] 0.2685635
varImpPlot(rf.model)

KNN data

train & test data

# Rebuild an unscaled copy of the raw predictors for KNN:
# drop id, date, price, lat/long and the other derived columns
data.conclusion <- data[,-c(1,2,3,18,19,22,23,24,25,26,27)]
colnames(data.conclusion)[14] <- "re.region"
data.conclusion[14] <- data_2[14]   # reuse the regrouped region from data_2
# Encode the region letters as numbers reflecting their assumed price order
data.conclusion <- data.conclusion %>%
  mutate(re.region = gsub("A", 3, re.region)) %>%
  mutate(re.region = gsub("B", 4, re.region)) %>%
  mutate(re.region = gsub("C", 5, re.region)) %>%
  mutate(re.region = gsub("D", 2, re.region)) %>%
  mutate(re.region = gsub("E", 1, re.region)) %>%
  mutate(re.region = gsub("F", 6, re.region))
data.conclusion$re.region <- as.numeric(data.conclusion$re.region)
data.conclusion[,-17] <- scale(data.conclusion[,-17])  # standardize predictors; column 17 is category
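The chained gsub() calls above encode the region letters on an assumed price ordering (E < D < A < B < C < F). A named lookup vector does the same mapping in a single pass, with no risk of a replacement string being matched by a later pattern:

# Equivalent ordinal encoding via a named lookup vector
region.code <- c(A = 3, B = 4, C = 5, D = 2, E = 1, F = 6)
data.conclusion$re.region <- unname(region.code[as.character(data_2$re.region)])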

train.knn <- data.conclusion[train.ind, ]
test.knn <- data.conclusion[-train.ind, ]

#k = 1 to 100
library(class)   # for knn()
error.rate <- NULL

for(i in 1:100){
  predicted.knn <- knn(train.knn[,-17], test.knn[,-17], train.knn[,17], k=i)
  error.rate[i] <- mean(test$category != predicted.knn)
}
print(error.rate)
##   [1] 0.3687254 0.3698820 0.3400416 0.3351839 0.3354152 0.3310201 0.3189914
##   [8] 0.3169096 0.3173722 0.3141337 0.3152903 0.3134397 0.3143650 0.3171409
##  [15] 0.3199167 0.3208420 0.3178348 0.3194541 0.3145963 0.3127458 0.3162156
##  [22] 0.3159843 0.3099699 0.3099699 0.3088133 0.3099699 0.3106639 0.3115892
##  [29] 0.3129771 0.3108952 0.3099699 0.3113579 0.3132084 0.3122831 0.3152903
##  [36] 0.3136711 0.3139024 0.3157529 0.3141337 0.3150590 0.3173722 0.3150590
##  [43] 0.3141337 0.3148277 0.3152903 0.3136711 0.3148277 0.3129771 0.3143650
##  [50] 0.3148277 0.3143650 0.3150590 0.3150590 0.3178348 0.3162156 0.3141337
##  [57] 0.3189914 0.3143650 0.3166782 0.3176035 0.3173722 0.3199167 0.3196854
##  [64] 0.3217673 0.3217673 0.3229239 0.3210733 0.3210733 0.3217673 0.3194541
##  [71] 0.3185288 0.3215360 0.3203794 0.3219986 0.3229239 0.3231552 0.3259311
##  [78] 0.3252371 0.3250058 0.3254684 0.3261624 0.3238492 0.3236179 0.3250058
##  [85] 0.3250058 0.3266250 0.3266250 0.3261624 0.3275503 0.3273190 0.3263937
##  [92] 0.3268563 0.3263937 0.3256997 0.3256997 0.3275503 0.3268563 0.3282443
##  [99] 0.3270877 0.3289382

conclusion plot

plot (x = 1:100, y = error.rate,
      xlab = "k of KNN",
      ylab = "error.rate",
      main = "conclusion plot",
      xlim = c(1,100), ylim = c(0,max(error.rate)), pch = 16,
      col = 2, cex = 0.5)
text(90,0.35,"knn",cex=0.8,col = "red")

points(which(error.rate == min(error.rate)),min(error.rate),col = "red",pch=1,cex=1.8)
text(which(error.rate == min(error.rate)),min(error.rate)+0.025,"knn.min.error ",cex=0.8,col = "red")

points(15,0.2519375,col = "pink",pch=19,cex=1.2)
points(15,0.2722646,col = "pink",pch=19,cex=1.2)
text(9,0.2519375-0.025,"svm.training.error",cex=0.8,col = "pink")
text(9,0.2722646+0.025,"svm.testing.error",cex=0.8,col = "pink")

points(30,0.247889,col = "blue",pch=19,cex=1.2)
points(30,0.2678695,col = "blue",pch=19,cex=1.2)
text(31,0.247889-0.025,"gbm.training.error",cex=0.8,col = "blue")
text(31,0.2678695+0.025,"gbm.testing.error",cex=0.8,col = "blue")

points(45,0.2954309,col = "orange",pch=19,cex=1.2)
points(45,0.3090446,col = "orange",pch=19,cex=1.2)
text(46,0.2954309-0.025,"ada.training.error",cex=0.8,col = "orange")
text(46,0.3090446+0.025,"ada.testing.error",cex=0.8,col = "orange")

points(60,0.3090446,col = "green",pch=19,cex=1.2)
points(60,0.266019,col = "green",pch=19,cex=1.2)
text(61,0.3090446+0.025,"ada.best.iter.training.error",cex=0.8,col = "green")
text(61,0.266019-0.025,"ada.best.iter.testing.error",cex=0.8,col = "green")

points(75,0.1515327,col = "black",pch=19,cex=1.2)
points(75,0.266019,col = "black",pch=19,cex=1.2)
text(76,0.1515327-0.025,"xgb.cv.training.error",cex=0.8,col = "black")
text(76,0.266019+0.025,"xgb.cv.testing.error",cex=0.8,col = "black")

points(90,0.004279931,col = "darkviolet",pch=19,cex=1.2)
points(90,0.2685635,col = "darkviolet",pch=19,cex=1.2)
text(91,0.004279931+0.025,"rf.best.iter.training.error",cex=0.8,col = "darkviolet")
text(91 ,0.2685635-0.025,"rf.best.iter.testing.error",cex=0.8,col = "darkviolet")
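The labeled points are hard to compare at a glance; the same numbers reported in the sections above can be collected into a single table (values copied from the outputs shown earlier, with the KNN minimum taken from error.rate):

# All reported training/testing errors in one table, sorted by test error
errors <- data.frame(
  model = c("svm", "gbm", "adaboost", "xgboost", "randomForest", "knn"),
  train = c(0.2519375, 0.247889, 0.2954309, 0.1515327, 0.004279931, NA),
  test  = c(0.2722646, 0.2678695, 0.3090446, 0.266019, 0.2685635,
            min(error.rate)))
errors[order(errors$test), ]    # xgboost has the lowest test error here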