RF vs. BTLM and BGPLLM

Modeling understory vegetation fraction three ways: random forests, a Bayesian GP, and a Bayesian treed linear model. The data set is so small that the coefficients of determination are not high, but all three capture the trend reasonably well. This came out of helping a friend with an analysis today, and I am posting it mostly as an introduction to the tgp library. As usual, RF fits well, but since it models the mean it tends to miss the higher values. It is not in the code below, but the relative errors show RF slightly underestimating while the other two slightly overestimate. I would rather overestimate here, but it is a bit much to redo everything now, so I am still mulling it over. Come to think of it, today was my first time running bgp. Earned my keep.

For reference: with RF, people often do cross-validation only. It comes more or less for free, and it is not unreasonable. Below I instead split the data by hand into learning and test sets, purely for illustration. A cross-validation that cuts the data automatically and repeatedly is more trustworthy than a single hand-made split like this one; that said, when the data come from two rather different places and times, I prefer a separate held-out test.
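
For what it's worth, a minimal k-fold CV sketch for the RF model below (the fold count and variable names are mine; it assumes the same slc.var data frame loaded next):

library(randomForest)

set.seed(1)  # reproducible fold assignment
k <- 5
folds <- sample(rep(1:k, length.out = nrow(slc.var)))
cv.pred <- rep(NA_real_, nrow(slc.var))
for (i in 1:k) {
    # refit on everything outside fold i, predict the held-out fold
    fit <- randomForest(under ~ ., data = slc.var[folds != i, ], ntree = 1000)
    cv.pred[folds == i] <- predict(fit, newdata = slc.var[folds == i, ])
}
sqrt(mean((cv.pred - slc.var$under)^2))  # cross-validated RMSE (arcsine scale)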


load("slc.var.RData")
str(slc.var)
## 'data.frame':    30 obs. of  13 variables:
##  $ pnr0051_avg: num  0.00789 0.00678 0.02334 0 0 ...
##  $ pnr0102_avg: num  0 0 0.00105 0 0 ...
##  $ pnr0205_avg: num  0.01535 0.01357 0.00899 0.0163 0 ...
##  $ pnr0510_avg: num  0.1342 0.0498 0.0909 0.1245 0.1408 ...
##  $ pnr1020_avg: num  0.2106 0.2821 0.2452 0.2032 0.0511 ...
##  $ pnr2030_avg: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ cvr_avg    : num  0.823 0.862 0.822 0.843 0.967 ...
##  $ grnd_avg   : num  0.307 0.307 0.247 0.271 0.262 ...
##  $ h10_avg    : num  2.78 4.23 3.17 2.37 5.78 ...
##  $ dtm_avg    : num  89.1 46.1 67.6 48 50.9 ...
##  $ skw_avg    : num  -0.1559 -0.2283 -0.0117 -0.2704 -0.2972 ...
##  $ scosa      : num  2.58e-02 -2.29e-02 -4.48e-03 4.75e-05 5.57e-05 ...
##  $ under      : num  0.22 0.452 0.665 0.37 0.21 ...

learning.samples <- 20

# Hand-made split: the first 20 of the 30 plots for learning, the last 10 held
# out for testing. Column 13 is the response `under` (understory fraction,
# apparently on an arcsine-square-root scale; hence the sin()^2 back-transforms
# further down).
slc.learning <- slc.var[1:learning.samples, ]
slc.learning.X <- slc.learning[, -13]
slc.learning.under <- data.frame(under = slc.learning[, 13])

slc.test <- slc.var[(learning.samples + 1):nrow(slc.var), ]
slc.test.X <- slc.test[, -13]
slc.test.under <- data.frame(under = slc.test[, 13])
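
# A quick sanity check (mine, not in the original run) that the hand-made
# split leaves comparable response ranges in both halves:
summary(slc.learning.under$under)
summary(slc.test.under$under)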

#### 1. Random Forests
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-7
## Type rfNews() to see new features/changes/bug fixes.
slc.rf <- randomForest(under ~ ., data = slc.learning, ntree = 1000, mtry = 5, 
    importance = T, na.action = na.fail, proximity = T, nodesize = 3, do.trace = 250)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##  250 |  0.08735   122.86 |
##  500 |  0.08584   120.74 |
##  750 |  0.08503   119.59 |
## 1000 |  0.08482   119.30 |
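# (%Var(y) here is 100 * OOB MSE / Var(y), so values above 100 mean the
# out-of-bag error already exceeds the variance of the response, i.e. the OOB
# R2 is negative; consistent with the test-set R2 further down.)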

# Predict on the held-out plots; sin()^2 back-transforms the (apparently
# arcsine-square-root transformed) response to a fraction.
slc.rf.pred <- sin(predict(slc.rf, newdata = slc.test.X, type = "response"))^2

# Quantile regression forests on the learning set: full predictive quantiles
# rather than just the mean.
require(quantregForest)
## Loading required package: quantregForest

slc.rf.q <- quantregForest(x = slc.learning.X, y = unlist(slc.learning.under), 
    ntree = 1000, mtry = 5)
plot(slc.rf.q)

[Figure: plot(slc.rf.q), out-of-bag quantile predictions from the quantile regression forest]
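
# The fitted quantile forest can also give predictive quantiles on the
# held-out plots. A minimal sketch; older quantregForest versions take
# `quantiles =`, newer ones `what =`:
slc.rf.qpred <- predict(slc.rf.q, newdata = slc.test.X, quantiles = c(0.05, 0.5, 0.95))
slc.rf.qpred  # per-plot 5/50/95% predictive quantiles (arcsine scale)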



#### 2. Bayesian treed linear model (btlm) and Bayesian GP LLM (bgpllm)
require(tgp)
## Loading required package: tgp

slc.btlm <- btlm(X = slc.learning.X, XX = slc.test.X, Z = slc.learning.under, 
    trace = T)
## 
## burn in:
## r=1000 d=[0]; n=20
## r=2000 d=[0]; n=20
## 
## Sampling @ nn=10 pred locs: [with traces]
## r=1000 d=[0]; mh=1 n=20
## r=2000 d=[0]; mh=1 n=20
## r=3000 d=[0]; mh=1 n=20
## r=4000 d=[0]; mh=1 n=20
## r=5000 d=[0]; mh=1 n=20
## Grow: 0%, 
## 
## Gathering traces
##   XX 100% done
##   hier-params done
##   parts done
##   posts done
##   Zp done
##   Zp.km done
##   Zp.ks2 done
##   ZZ done
##   ZZ.km done
##   ZZ.ks2 done
slc.btlm
## 
## This is a 'tgp' class object.  
## It is basically a list with the following entries:
## 
##  [1] X        n        d        Z        XX       nn       Xsplit  
##  [8] trace    BTE      R        linburn  g        dparams  itemps  
## [15] bimprov  Zp.mean  ZZ.mean  Zp.km    ZZ.km    Zp.vark  ZZ.vark 
## [22] Zp.q     ZZ.q     Zp.s2    ZZ.s2    Zp.ks2   ZZ.ks2   Zp.q1   
## [29] Zp.med   Zp.q2    ZZ.q1    ZZ.med   ZZ.q2    ess      gpcs    
## [36] response improv   parts    trees    posts    params   m0r1    
## 
## See ?btgp for an explanation of the individual entries.  
## See plot.tgp and tgp.trees for help with visualization.
## 
## The $trace field, if it exists, is of class 'tgptraces' 
## and has its own print statement
slc.btlm.pred <- sin(slc.btlm$ZZ.mean)^2  # posterior predictive mean at XX, back-transformed
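
# The print output above points at tgp's own visualization helpers; a minimal
# sketch (not run here):
plot(slc.btlm, proj = c(1))  # 1-d projection of the predictive surface
tgp.trees(slc.btlm)          # MAP tree(s) by tree height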

# Bayesian GP LLM (not treed)
slc.bgpllm <- bgpllm(X = slc.learning.X, XX = slc.test.X, Z = slc.learning.under, 
    trace = T)
## 
## burn in:
## r=1000 d=[0/0.869599 0/0.777034 0/1.23805 0/1.35708 0/1.79883 0/0.718572 0/1.16572 0/1.44239 0.018367 0.494185 0/1.24017 0.621521]; n=20
## 
## Sampling @ nn=10 pred locs: [with traces]
## r=1000 d=[0]; mh=1 n=20
## r=2000 d=[0.893157 0/1.38708 0/0.027845 0/0.788122 0.00301293 0/0.00609372 0.118247 1.25198 0/1.21043 0/0.38984 0/0.056766 0.00397402]; mh=1 n=20
## r=3000 d=[0.852548 0/0.746395 0.0265804 0/1.2483 0.00870976 0.00129379 0.0228211 0/1.1123 0/1.35974 0/1.3035 0.0304227 0.0739256]; mh=1 n=20
## 
## Gathering traces
##   XX 100% done
##   hier-params done
##   linarea done
##   posts done
##   Zp done
##   Zp.km done
##   Zp.ks2 done
##   ZZ done
##   ZZ.km done
##   ZZ.ks2 done
slc.bgpllm.pred <- sin(slc.bgpllm$ZZ.mean)^2  # posterior predictive mean at XX, back-transformed
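
# The tgp objects also carry 5% and 95% predictive quantiles (the ZZ.q1/ZZ.q2
# entries in the print output above); back-transformed, they give rough 90%
# intervals per test plot:
slc.bgpllm.lo <- sin(slc.bgpllm$ZZ.q1)^2
slc.bgpllm.hi <- sin(slc.bgpllm$ZZ.q2)^2
cbind(lo = slc.bgpllm.lo, mean = slc.bgpllm.pred, hi = slc.bgpllm.hi)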


##### Evaluation
slc.obs <- sin(slc.test.under)^2  # observed fractions, back-transformed

# Note: sqrt(mean(e)^2) equals |mean(e)|, the absolute mean bias, not an RMSE
# (a proper RMSE follows after this block).
slc.rf.bias <- sqrt(mean(slc.rf.pred - slc.obs)^2)
slc.rf.bias
## [1] 0.03931

slc.btlm.bias <- sqrt(mean(slc.btlm.pred - slc.obs)^2)
slc.btlm.bias
## [1] 0.1664

slc.bgpllm.bias <- sqrt(mean(slc.bgpllm.pred - slc.obs)^2)
slc.bgpllm.bias
## [1] 0.1671
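
# A proper RMSE for comparison (a minimal sketch; unlist() flattens the
# one-column data frame so the arithmetic also works in current R):
rmse <- function(pred, obs) sqrt(mean((unlist(pred) - unlist(obs))^2))
rmse(slc.rf.pred, slc.obs)
rmse(slc.btlm.pred, slc.obs)
rmse(slc.bgpllm.pred, slc.obs)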

### Coefficient of determination (R2). Note: negative values of R2 can occur
### when evaluating predictions on held-out data; in those cases the mean of
### the observations fits the data better than the model does under this
### goodness-of-fit criterion.
### Source: http://en.wikipedia.org/wiki/Coefficient_of_determination

R2 <- function(x.obs, x.sim, na.rm = T, digits = NULL) {

    # Flatten one-column data frames so length() and mean() behave as intended
    x.obs <- unlist(x.obs)
    x.sim <- unlist(x.sim)
    stopifnot(length(x.obs) == length(x.sim))
    x.obs.avg <- mean(x.obs, na.rm = na.rm)

    TSS <- sum((x.obs - x.obs.avg)^2, na.rm = na.rm)  # Total sum of squares
    RSS <- sum((x.obs - x.sim)^2, na.rm = na.rm)  # Residual sum of squares.

    R2 <- 1 - (RSS/TSS)
    if (is.null(digits)) {
        return(R2)
    } else {
        return(round(R2, digits = digits))
    }
}


R2(slc.obs, slc.rf.pred)
## [1] -0.2169
R2(slc.obs, slc.btlm.pred)
## [1] -6.12
R2(slc.obs, slc.bgpllm.pred)
## [1] -6.12

cor(slc.obs, slc.rf.pred)
##         [,1]
## under 0.1651
cor(slc.obs, slc.btlm.pred)
##         [,1]
## under 0.3534
cor(slc.obs, slc.bgpllm.pred)
##         [,1]
## under 0.3762


plot(x = slc.obs, y = slc.rf.pred, xlim = c(0, 1), asp = 1, ylim = c(0, 1),
    pch = 4, ylab = "Prediction", xlab = "Observation", col = "darkgreen",
    main = "SLC VAR - Test results (understory fraction prediction)")
abline(a = 0, b = 1)
points(x = slc.obs, y = slc.btlm.pred, col = "red", pch = 7)
points(x = slc.obs, y = slc.bgpllm.pred, col = "blue", pch = 17)
legend("topright", legend = c("Random Forests", "Bayesian Treed LM", "Bayesian Gaussian Process LLM"), 
    col = c("darkgreen", "red", "blue"), pch = c(4, 7, 17))

[Figure: observed vs. predicted understory fraction for the three models]



#### Variable importance
varImpPlot(slc.rf, sort = T)

[Figure: randomForest variable importance (varImpPlot)]
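
# The numbers behind the plot, as a table:
round(importance(slc.rf), 3)  # %IncMSE and IncNodePurity per predictor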