Mais conteúdo relacionado
データサイエンティストカジュアルトーク by wdkz
- 12. 2013/09/06
toy dataの取得用関数(参考)
サイバー系
12
library(RCurl)
library(XML)
# Retrieve PubMed IDs (pmids) matching a search term via the NCBI
# E-utilities "esearch" endpoint.
#
# @param term Query string in PubMed search syntax, already URL-safe
#   (spaces as "+"), e.g. "wada+kazuya[author]".
# @return Integer vector of PubMed IDs; integer(0) when nothing matches.
get.pmid <- function(term = "wada+kazuya[author]") {
  # The query URL must be built as a single line: a newline inside the
  # string literal (as in the slide transcript) would corrupt the request.
  url.str <- paste0(
    "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=",
    term
  )
  xml1 <- xmlTreeParse(getURL(url.str))
  pmids.list <- xml1[["doc"]][["eSearchResult"]][["IdList"]]
  # vapply + seq_along handles an empty <IdList> cleanly
  # (1:length(pmids.list) would iterate over c(1, 0)).
  vapply(
    seq_along(pmids.list),
    function(i) as.integer(xmlValue(pmids.list[[i]])),
    integer(1)
  )
}
# Fetch article abstracts for the given PubMed IDs via the NCBI
# E-utilities "efetch" endpoint.
#
# @param pmids Integer vector of PubMed IDs.
# @return Character vector of abstract texts, one per returned article
#   (NA where an abstract is missing).
get.pmsummary <- function(pmids = c(21799770, 21416533)) {
  # Single-line URL; multiple ids are comma-separated per the efetch API.
  url.str <- paste0(
    "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=",
    paste(pmids, collapse = ","),
    "&retmode=xml"
  )
  xml1 <- xmlTreeParse(getURL(url.str))
  pm.summaries <- xml1[["doc"]][["PubmedArticleSet"]]
  # Preallocate a character result (the original used logical NA).
  pmsummary <- rep(NA_character_, length(pm.summaries))
  for (i in seq_along(pm.summaries)) {
    pmsummary[i] <- xmlValue(
      pm.summaries[[i]][[1]][["Article"]][["Abstract"]][["AbstractText"]]
    )
  }
  pmsummary
}
pmsummary <- get.pmsummary(get.pmid("wada+kazuya[author]")) # fetch the abstracts of wdkz's papers
- 22. 2013/09/06
簡易実装例_model構築
サイバー系
22
# Train a "random forest on Hadoop" (rmr2): each mapper emits ndiv
# Poisson-bootstrap resamples of its data chunk, each reducer fits one
# randomForest model, and the fitted forests stay on HDFS.
#
# @param formula Model formula forwarded to randomForest().
# @param data    Training data.frame.
# @param ndiv    Number of bootstrap sub-samples (= number of forests).
# @param ...     Further arguments forwarded to randomForest().
# @return A list with element `model_output`: the HDFS handle of the
#   map-reduce job output holding the fitted forests. This is the shape
#   that variableImp() and predict.wadandomForest() read.
wadandomForest <- function(formula, data, ndiv = 10, ...) {
  data.hdfs <- to.dfs(data)  # push the training data onto HDFS

  # Mapper: emit ndiv Poisson-bootstrap resamples keyed by sample index.
  # lambda = 1/ndiv so that, across ndiv samples, each row is drawn
  # about once in expectation.
  map.fun <- function(k, v) {
    generate.rand <- function(i) {
      draws <- rpois(n = nrow(v), lambda = (1 / ndiv))
      indices <- rep(seq_len(nrow(v)), draws)
      keyval(i, v[indices, ])
    }
    c.keyval(lapply(seq_len(ndiv), generate.rand))
  }

  # Reducer: fit one randomForest per bootstrap sample; the model is
  # emitted as the job output (stays on HDFS).
  reduce.fun <- function(k, v) {
    rf_mdl <- randomForest::randomForest(formula = formula, data = v, ...)
    keyval(k, list(forest = rf_mdl))
  }

  mr_res <- mapreduce(input = data.hdfs, map = map.fun, reduce = reduce.fun)
  # Consumers access wf_mdl$model_output, so wrap the HDFS handle.
  list(model_output = mr_res)
}
- 23. 2013/09/06
簡易実装例_変数重要度
サイバー系
23
# Aggregate variable importance (MeanDecreaseGini) across all forests
# trained by wadandomForest().
#
# @param wf_mdl     Result of wadandomForest(); wf_mdl$model_output is
#   the HDFS handle of the fitted forests.
# @param var_output Unused; kept for interface compatibility.
# @return HDFS handle of a map-reduce job keyed by variable name, with
#   the mean importance across forests as value (read with from.dfs()).
variableImp <- function(wf_mdl, var_output = NULL) {
  # Mapper: emit one (variable name, MeanDecreaseGini) pair per
  # variable of every fitted forest.
  map.fun <- function(k, v) {
    mk.keyval <- function(i) {
      keyval(key = rownames(v[[i]]$importance),
             val = as.numeric(v[[i]]$importance))
    }
    c.keyval(lapply(seq_along(k), mk.keyval))
  }
  # Reducer: average the importance scores per variable.
  reduce.fun <- function(k, v) {
    keyval(k, mean(v))
  }
  # The slide transcript truncated this call ("reduce=reduce.f"); the
  # intended reducer is reduce.fun.
  mapreduce(input = wf_mdl$model_output, map = map.fun, reduce = reduce.fun)
}
# Bar plot of per-variable mean importance.
#
# @param vi_df data.frame with columns `key` (variable name) and `val`
#   (mean importance), e.g. as.data.frame(from.dfs(variableImp(...))).
# @return Invisibly, the printed ggplot object.
variableImp.plot <- function(vi_df = as.data.frame(varImportance)) {
  library(ggplot2)
  # theme()/element_text() replace opts()/theme_text(), which were
  # deprecated in ggplot2 0.9.1 and removed in 0.9.2.
  print(
    ggplot(vi_df, aes(x = key, y = val)) +
      geom_bar(stat = "identity") +
      theme(axis.text.x = element_text(angle = -90))
  )
}
- 24. 2013/09/06
簡易実装例_予測
サイバー系
24
# Score new data with the forest ensemble trained by wadandomForest().
# Each mapper repeatedly draws a random subset of its rows, scores it
# with one randomly chosen forest, and the reducer averages the class
# codes voted for each row.
#
# @param wf_mdl Result of wadandomForest() (wf_mdl$model_output on HDFS).
# @param data   data.frame of observations to score.
# @param ...    Further arguments forwarded to predict() on each forest.
# @return HDFS handle keyed by row index, with the mean predicted class
#   code as value; read it back with from.dfs().
predict.wadandomForest <- function(wf_mdl, data, ...) {
  data.hdfs <- to.dfs(data)
  # Mapper: 100 rounds of Poisson-thinned row subsets, each scored by
  # one forest sampled uniformly from the trained ensemble.
  map.fun <- function(k, v) {
    generate.rnd <- function(i) {
      draws <- rpois(n = nrow(v), lambda = 0.2)
      indices <- rep(seq_len(nrow(v)), draws)
      vv <- v[unique(indices), ]
      rf_mdl <- rf_mdls$val[[sample(length(rf_mdls$key), 1)]]
      # The slide showed a literal Unicode ellipsis here; "..." is the
      # valid R dots-forwarding syntax.
      vv_prd <- predict(rf_mdl, vv, ...)
      keyval(as.integer(names(vv_prd)), as.integer(vv_prd))
    }
    library(randomForest)  # needed on each task node for predict()
    rf_mdls <- from.dfs(wf_mdl$model_output)
    c.keyval(lapply(1:100, generate.rnd))
  }
  # Reducer: average the class codes voted for each row key.
  reduce.fun <- function(k, v) {
    keyval(k, mean(v))
  }
  mapreduce(input = data.hdfs, map = map.fun, reduce = reduce.fun)
}
- 25. 2013/09/06
randomForest on Hadoop実行結果例
サイバー系
25
★randomForest on Map-Reduce概要_予測
       1     2
 1 45576  1664
 2  3893  5824
# --- Usage example ----------------------------------------------------
# Train the model ensemble; the fitted forests stay on HDFS.
wf_mdl <- wadandomForest(formula = label ~ ., data = train.data)

# Variable importance: aggregate on the cluster, then fetch and plot.
wf_varImp <- variableImp(wf_mdl)
varImportance <- from.dfs(wf_varImp)
variableImp.plot(as.data.frame(varImportance))

# Predict on the held-out data and fetch the averaged votes.
wf_pred <- predict.wadandomForest(wf_mdl, test.data)
wf_result <- from.dfs(wf_pred)

# Confusion matrix: threshold the mean class code at 1.5.
# NOTE(review): the slide referenced `gf_test$label`, which is defined
# nowhere in the deck; `test.data$label` matches the data actually
# scored above -- confirm against the original source.
table(as.integer(test.data$label), ifelse(wf_result$val >= 1.5, 2, 1))