SlideShare a Scribd company logo
1 of 20
Download to read offline
R_Recap_02
Felix Mueller
2/27/2017
Prepare your R session
setwd("~/Google_Drive/github/R/R_Recap/session_02")
library(dplyr)
library(ggplot2)
How to apply models in general
Understanding the problem
Understanding the data
Preparing the data
Split in Train & Test
Predicting variable selection
Resampling
Model
Tuning of the model
Evaluation
=> ITERATIVE PROCESS!
# Iris dataset {.smaller}
Loading the Iris dataset
# Quick structural overview of the built-in iris data set.
glimpse(iris)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6,
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4,
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4,
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3,
## $ Species <fctr> setosa, setosa, setosa, setosa, se
# Work on a copy and centre/scale every column except the fifth (Species).
df <- iris
df[, -5] <- scale(df[, -5])
# Sanity check: report whether any cell of the data frame is NA.
print(any(is.na(df)))
## [1] FALSE
Function: Split data
#' Randomly split a data frame into a training and a test set.
#'
#' @param dataframe Data frame whose rows are partitioned.
#' @param seed RNG seed set before sampling; `NULL` leaves the RNG state
#'   untouched.
#' @param percentage Fraction of rows (between 0 and 1) assigned to the
#'   training set; the resulting row count is rounded with `round()`.
#' @return A list with elements `trainset` and `testset`. Every row of
#'   `dataframe` ends up in exactly one of them, and both are always data
#'   frames (also for single-column input).
splitdf <- function(dataframe, seed=1,
                    percentage=0.8) {
  stopifnot(
    is.numeric(percentage), length(percentage) == 1,
    !is.na(percentage), percentage >= 0, percentage <= 1
  )
  if (!is.null(seed)) set.seed(seed)
  # seq_len() stays empty for zero-row input, whereas 1:nrow() would give c(1, 0).
  index <- seq_len(nrow(dataframe))
  numTrainingSamples <- round(length(index) * percentage)
  # Draws the same indices as sample(index, n) for a given seed.
  trainindex <- sample.int(length(index), numTrainingSamples)
  # Select the test rows positively: a negative index built from an empty
  # training sample (`-integer(0)`) would select no rows instead of all rows.
  testindex <- setdiff(index, trainindex)
  list(
    trainset = dataframe[trainindex, , drop = FALSE],
    testset = dataframe[testindex, , drop = FALSE]
  )
}
resampling the iris data
# Drop the setosa rows so that two species remain.
df <- filter(df, Species != "setosa")
# Re-declare the factor levels: "virginica" first, with the now-unused
# "setosa" level folded into "versicolor".
levels(df$Species) <- list(
  "virginica" = "virginica",
  "versicolor" = c("versicolor", "setosa")
)
# Train/test split with a fixed seed.
split <- splitdf(df, seed = 1)
train <- split$trainset
test <- split$testset
library(ROSE)
print(sum(train$Species == "versicolor"))
## [1] 46
# Under-sample the training data with ROSE, requesting p = 0.7.
train <- ovun.sample(Species ~ ., data = train,
                     method = "under", p = 0.7)$data
print(sum(train$Species == "versicolor"))
## [1] 14
Building and tuning a tree
library(tree)
# Seed the RNG before the randomised cross-validation below.
set.seed(37)
# Classification tree on all predictors. The fitted object reuses the name
# `tree`; later function calls still resolve to the package function.
tree <- tree(formula = Species ~ ., data = train)
# Cross-validate the pruning sequence using the misclassification criterion.
tree_cv <- cv.tree(tree, method = "misclass")
print(tree_cv$size)
## [1] 3 2 1
print(tree_cv$dev)
## [1] 7 7 15
# Prune back to the size-2 subtree.
tree_pruned <- prune.tree(tree, best = 2)
Test with final test set
# Class probabilities of the unpruned tree for the held-out rows.
predictions <- predict(tree, newdata = test)
# Select the probability column by class name instead of by position, so the
# labelling does not silently flip if the factor-level order changes.
predictions <- as.factor(if_else(
  predictions[, "virginica"] > 0.5, "virginica", "versicolor"))
# Rows: predicted label, columns: observed Species.
print(table(predictions, test$Species))
##
## predictions virginica versicolor
## versicolor 1 4
## virginica 15 0
pruned prediction
# Class probabilities of the pruned tree for the held-out rows.
predictions_pr <- predict(tree_pruned, newdata = test)
# Select the probability column by class name instead of by position, so the
# labelling does not silently flip if the factor-level order changes.
predictions_pr <- as.factor(
  if_else(predictions_pr[, "virginica"] > 0.5,
          "virginica", "versicolor"))
# Rows: predicted label, columns: observed Species.
print(table(predictions_pr, test$Species))
##
## predictions_pr virginica versicolor
## versicolor 1 4
## virginica 15 0
Marketing Intelligence
# not in validation set --> exclude
df <- dplyr::select(df, -iab_4)
# Columns to be converted to factors.
# NOTE(review): `names` shadows base::names() as a variable; the name is kept
# because the filtering step further down refers to it.
names <- c(
  "device", "os", "os_version", "browser",
  "browser_version", "prov", "day", "target"
)
df[, names] <- lapply(df[, names], factor)
Filtering
# Flag the numeric columns. vapply() always yields a logical vector (sapply()
# returns a list for zero-column input), and is.numeric() is already TRUE for
# integer columns, so the separate is.integer() test was redundant.
cols <- vapply(df, is.numeric, logical(1))
# Spearman correlation of each numeric column with the integer codes of the
# factor `target`. drop = FALSE keeps a one-column selection as a data frame
# so the result still carries row names.
cor_matrix <- cor(df[, cols, drop = FALSE], as.numeric(df$target),
                  method = "spearman")
# REALLY low threshold!
# which() discards NA correlations, matching what subset() did before.
names_num <- rownames(cor_matrix)[which(abs(cor_matrix[, 1]) >= 0.01)]
df <- df %>% dplyr::select(one_of(names_num, names))
# Random Forest cannot handle NAs
df$prov <- NULL
Resample
# Split with 90% of the rows going to the training set.
splits <- splitdf(df, percentage = 0.9)
train <- splits$trainset
test <- splits$testset
# Under-sample the training data with ROSE, requesting p = 0.5.
train <- ovun.sample(formula = target ~ ., data = train,
                     method = "under", p = 0.5)$data
# The full data and the split list are no longer needed.
rm(df, splits)
Random forest
library(randomForest)
# Fit on all predictors and keep the importance measures for later inspection.
model <- randomForest(target ~ ., data = train, importance = TRUE)
# Per-class probabilities on the held-out rows.
probs <- predict(model, newdata = test, type = "prob")
# Pair the observed target with the second probability column.
# NOTE(review): presumably the second column is the positive class -- confirm
# against levels(train$target).
predictions <- data.frame(target = test$target, pred = probs[, 2])
Threshold detection
library(pROC)
#' Pick the "best" classification threshold from a ROC curve.
#'
#' @param predictions Data frame with the columns `target` (observed class)
#'   and `pred` (predicted score).
#' @return A single numeric threshold, returned visibly.
getThreshold <- function(predictions){
  myROC <- roc(formula = target ~ pred, predictions)
  optimalThreshold <- coords(myROC, "best"
                             , ret = "threshold")
  # coords() hands back a plain numeric in older pROC releases and a data
  # frame in newer ones, and "best" may report several tied thresholds.
  # Flatten and keep the first so callers can compare scores against a scalar.
  as.numeric(unlist(optimalThreshold))[1]
}
threshold <- getThreshold(predictions)
# Cross-tabulate the observed target (rows) against whether the score exceeds
# the threshold (columns).
tbl <- table(predictions$target, predictions$pred > threshold)
Compare results
print(tbl)
##
## FALSE TRUE
## 0 133536 55952
## 1 123 235
# Rows of tbl are the observed target, columns are `pred > threshold`, so for
# the layout printed above: tbl[2,2] = TP, tbl[1,2] = FP, tbl[2,1] = FN and
# tbl[1,1] = TN.
# NOTE(review): the positional indexing assumes both classes and both
# outcomes occur in tbl -- confirm for other data.
auc <- auc(predictions$target, predictions$pred)
accuracy <- (tbl[1,1] + tbl[2,2]) / sum(tbl)
# F1 = 2*TP / (2*TP + FP + FN). The earlier formula used tbl[1,1] (the true
# negatives) in place of TP, which reported the F1 of the negative class.
F1 <- (2*tbl[2,2]) / (2*tbl[2,2] + tbl[2,1] + tbl[1,2])
# cat() already writes to the console and returns NULL; wrapping it in
# print() only appended a stray "NULL" to the output.
cat("Accuracy:",accuracy,"F1:",F1,"AUC:",auc,"\n")
## Accuracy: 0.704629 F1: 0.008311964 AUC: 0.7253162
## (F1 recomputed from the table above: 470 / 56545)
Variable importance
# high values wanted
# Show the first rows of the third and fourth importance columns.
importance(model)[, 3:4] %>%
  head() %>%
  print()
## MeanDecreaseAccuracy MeanDecreaseGini
## freq 17.55651 213.18706
## adnexus 22.43981 112.44327
## bsw_rubicon 24.14906 150.20596
## bsw_google 32.64226 152.25962
## bsw_adconductor 13.30819 77.42707
## bsw_pubmatic 13.85020 59.55289
VarImpPlot
iab_19
browser_version
domain_2
bsw_google
os_version
26 34
MeanDecreaseAccuracy
domain_1
browser_version
freq
os_version
day
0 200
MeanDecreaseGini
model
K-Fold CV
set.seed(1)
# Number of folds; the fold labels are shuffled across the training rows.
n_folds <- 10
folds <- sample(rep(seq_len(n_folds), length.out = nrow(train)))
# Preallocate instead of growing the vector inside the loop.
cv_error <- numeric(n_folds)
sequence <- seq_len(n_folds)
for (k in sequence) {
  held_out <- folds == k
  # NOTE(review): this overwrites the `model` fitted on the full training set.
  model <- randomForest(target ~ ., data = train[!held_out, ])
  pred <- predict(model, train[held_out, ], type = "class")
  # Misclassification rate on the held-out fold. The previous `==` stored the
  # accuracy under the name cv_error.
  cv_error[k] <- mean(train[held_out, ]$target != pred)
}
ggplot2
# Per-fold CV error: black points joined by a blue line, the lowest-error fold
# highlighted in red, and a dashed red line at the highest error.
# ggplot() replaces the deprecated qplot(); which.min() yields exactly one
# index, whereas which(cv_error == min(cv_error)) could return several on ties.
q1 <- ggplot(data.frame(sequence = sequence, cv_error = cv_error),
             aes(x = sequence, y = cv_error)) +
  geom_point() +
  geom_line(colour = "blue") +
  annotate("point", x = sequence[which.min(cv_error)],
           y = min(cv_error), colour = "red") +
  geom_hline(yintercept = max(cv_error),
             colour = "red", linetype = 2) +
  labs(x = "Iteration", y = "CV Error", title = "CV Error per iteration")
ggplot2 output
0.64
0.65
0.66
0.67
0.68
2.5 5.0 7.5 10.0
Iteration
CVError
CV Error per iteration
Other ideas to play with
1. mtry (default = sqrt(p) for classification): number of variables randomly sampled as candidates at each split
2. ntree: number of trees

More Related Content

What's hot

Chap 3php array part 2
Chap 3php array part 2Chap 3php array part 2
Chap 3php array part 2monikadeshmane
 
Feature flagsareflawed
Feature flagsareflawedFeature flagsareflawed
Feature flagsareflawedStephen Young
 
Feature Flags Are Flawed: Let's Make Them Better
Feature Flags Are Flawed: Let's Make Them BetterFeature Flags Are Flawed: Let's Make Them Better
Feature Flags Are Flawed: Let's Make Them BetterStephen Young
 
My sql presentation
My sql presentationMy sql presentation
My sql presentationNikhil Jain
 
Internationalizing CakePHP Applications
Internationalizing CakePHP ApplicationsInternationalizing CakePHP Applications
Internationalizing CakePHP ApplicationsPierre MARTIN
 
Extending Moose
Extending MooseExtending Moose
Extending Moosesartak
 
Professional-grade software design
Professional-grade software designProfessional-grade software design
Professional-grade software designBrian Fenton
 
Advance Techniques In Php
Advance Techniques In PhpAdvance Techniques In Php
Advance Techniques In PhpKumar S
 
[1062BPY12001] Data analysis with R / week 4
[1062BPY12001] Data analysis with R / week 4[1062BPY12001] Data analysis with R / week 4
[1062BPY12001] Data analysis with R / week 4Kevin Chun-Hsien Hsu
 
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...Matt Harrison
 
Class 4 - PHP Arrays
Class 4 - PHP ArraysClass 4 - PHP Arrays
Class 4 - PHP ArraysAhmed Swilam
 
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...Matt Harrison
 
Dependency injection in PHP 5.3/5.4
Dependency injection in PHP 5.3/5.4Dependency injection in PHP 5.3/5.4
Dependency injection in PHP 5.3/5.4Fabien Potencier
 
Symfony2 and Doctrine2 Integration
Symfony2 and Doctrine2 IntegrationSymfony2 and Doctrine2 Integration
Symfony2 and Doctrine2 IntegrationJonathan Wage
 
Hidden treasures of Ruby
Hidden treasures of RubyHidden treasures of Ruby
Hidden treasures of RubyTom Crinson
 

What's hot (20)

Chap 3php array part 2
Chap 3php array part 2Chap 3php array part 2
Chap 3php array part 2
 
Feature flagsareflawed
Feature flagsareflawedFeature flagsareflawed
Feature flagsareflawed
 
Feature Flags Are Flawed: Let's Make Them Better
Feature Flags Are Flawed: Let's Make Them BetterFeature Flags Are Flawed: Let's Make Them Better
Feature Flags Are Flawed: Let's Make Them Better
 
CakeFest 2013 keynote
CakeFest 2013 keynoteCakeFest 2013 keynote
CakeFest 2013 keynote
 
Drupal7 dbtng
Drupal7  dbtngDrupal7  dbtng
Drupal7 dbtng
 
My sql presentation
My sql presentationMy sql presentation
My sql presentation
 
Internationalizing CakePHP Applications
Internationalizing CakePHP ApplicationsInternationalizing CakePHP Applications
Internationalizing CakePHP Applications
 
Dsl
DslDsl
Dsl
 
Extending Moose
Extending MooseExtending Moose
Extending Moose
 
Professional-grade software design
Professional-grade software designProfessional-grade software design
Professional-grade software design
 
Advance Techniques In Php
Advance Techniques In PhpAdvance Techniques In Php
Advance Techniques In Php
 
Agile database access with CakePHP 3
Agile database access with CakePHP 3Agile database access with CakePHP 3
Agile database access with CakePHP 3
 
[1062BPY12001] Data analysis with R / week 4
[1062BPY12001] Data analysis with R / week 4[1062BPY12001] Data analysis with R / week 4
[1062BPY12001] Data analysis with R / week 4
 
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...
Analysis of Fatal Utah Avalanches with Python. From Scraping, Analysis, to In...
 
Class 4 - PHP Arrays
Class 4 - PHP ArraysClass 4 - PHP Arrays
Class 4 - PHP Arrays
 
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...
How to Become a Tree Hugger: Random Forests and Predictive Modeling for Devel...
 
Dependency injection in PHP 5.3/5.4
Dependency injection in PHP 5.3/5.4Dependency injection in PHP 5.3/5.4
Dependency injection in PHP 5.3/5.4
 
Symfony2 and Doctrine2 Integration
Symfony2 and Doctrine2 IntegrationSymfony2 and Doctrine2 Integration
Symfony2 and Doctrine2 Integration
 
Hidden treasures of Ruby
Hidden treasures of RubyHidden treasures of Ruby
Hidden treasures of Ruby
 
Neatly folding-a-tree
Neatly folding-a-treeNeatly folding-a-tree
Neatly folding-a-tree
 

Viewers also liked

〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA
〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA
〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizANohelis RUIZ ARVELO Networking Strategist
 
Classement après journée24 11 12mars
Classement après journée24 11 12marsClassement après journée24 11 12mars
Classement après journée24 11 12marscristeljoiris
 
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XX
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XXTRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XX
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XXGemma Ajenjo Rodriguez
 

Viewers also liked (6)

Pasión al fútbol
Pasión al fútbol Pasión al fútbol
Pasión al fútbol
 
Planeación de mercadotecnia y segmentacion
Planeación de mercadotecnia  y segmentacionPlaneación de mercadotecnia  y segmentacion
Planeación de mercadotecnia y segmentacion
 
〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA
〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA
〝El Networking Relacional desplaza al Networking Tradicional〞 @NohelisRuizA
 
Classement après journée24 11 12mars
Classement après journée24 11 12marsClassement après journée24 11 12mars
Classement après journée24 11 12mars
 
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XX
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XXTRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XX
TRANSFORMACIONS ECONÒMIQUES I SOCIALS AL PRIMER TERÇ DEL SEGLE XX
 
Publicación2
Publicación2Publicación2
Publicación2
 

Similar to Session 02

An Introduction to Data Mining with R
An Introduction to Data Mining with RAn Introduction to Data Mining with R
An Introduction to Data Mining with RYanchang Zhao
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfannikasarees
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxcarliotwaycave
 
Bioinformatics p5-bioperl v2013-wim_vancriekinge
Bioinformatics p5-bioperl v2013-wim_vancriekingeBioinformatics p5-bioperl v2013-wim_vancriekinge
Bioinformatics p5-bioperl v2013-wim_vancriekingeProf. Wim Van Criekinge
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsDataconomy Media
 
Drupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary EditionDrupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary Editionddiers
 
Unit 4_Working with Graphs _python (2).pptx
Unit 4_Working with Graphs _python (2).pptxUnit 4_Working with Graphs _python (2).pptx
Unit 4_Working with Graphs _python (2).pptxprakashvs7
 
Data science with R - Clustering and Classification
Data science with R - Clustering and ClassificationData science with R - Clustering and Classification
Data science with R - Clustering and ClassificationBrigitte Mueller
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01Raman Kannan
 
Classification examp
Classification exampClassification examp
Classification exampRyan Hong
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptxNimrita Koul
 

Similar to Session 02 (20)

An Introduction to Data Mining with R
An Introduction to Data Mining with RAn Introduction to Data Mining with R
An Introduction to Data Mining with R
 
R is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdfR is a very flexible and powerful programming language, as well as a.pdf
R is a very flexible and powerful programming language, as well as a.pdf
 
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docxINFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
INFORMATIVE ESSAYThe purpose of the Informative Essay assignme.docx
 
dplyr-tutorial.pdf
dplyr-tutorial.pdfdplyr-tutorial.pdf
dplyr-tutorial.pdf
 
Bioinformatica p6-bioperl
Bioinformatica p6-bioperlBioinformatica p6-bioperl
Bioinformatica p6-bioperl
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
Bioinformatics p5-bioperl v2013-wim_vancriekinge
Bioinformatics p5-bioperl v2013-wim_vancriekingeBioinformatics p5-bioperl v2013-wim_vancriekinge
Bioinformatics p5-bioperl v2013-wim_vancriekinge
 
R code for data manipulation
R code for data manipulationR code for data manipulation
R code for data manipulation
 
R code for data manipulation
R code for data manipulationR code for data manipulation
R code for data manipulation
 
R seminar dplyr package
R seminar dplyr packageR seminar dplyr package
R seminar dplyr package
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forests
 
Drupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary EditionDrupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary Edition
 
cluster(python)
cluster(python)cluster(python)
cluster(python)
 
Unit 4_Working with Graphs _python (2).pptx
Unit 4_Working with Graphs _python (2).pptxUnit 4_Working with Graphs _python (2).pptx
Unit 4_Working with Graphs _python (2).pptx
 
Decision Tree.pptx
Decision Tree.pptxDecision Tree.pptx
Decision Tree.pptx
 
R and data mining
R and data miningR and data mining
R and data mining
 
Data science with R - Clustering and Classification
Data science with R - Clustering and ClassificationData science with R - Clustering and Classification
Data science with R - Clustering and Classification
 
M12 random forest-part01
M12 random forest-part01M12 random forest-part01
M12 random forest-part01
 
Classification examp
Classification exampClassification examp
Classification examp
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptx
 

Recently uploaded

FESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfFESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfMarinCaroMartnezBerg
 
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...amitlee9823
 
Capstone Project on IBM Data Analytics Program
Capstone Project on IBM Data Analytics ProgramCapstone Project on IBM Data Analytics Program
Capstone Project on IBM Data Analytics ProgramMoniSankarHazra
 
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...SUHANI PANDEY
 
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAl Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAroojKhan71
 
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...only4webmaster01
 
April 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's AnalysisApril 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's Analysismanisha194592
 
Accredited-Transport-Cooperatives-Jan-2021-Web.pdf
Accredited-Transport-Cooperatives-Jan-2021-Web.pdfAccredited-Transport-Cooperatives-Jan-2021-Web.pdf
Accredited-Transport-Cooperatives-Jan-2021-Web.pdfadriantubila
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxolyaivanovalion
 
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night StandCall Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Standamitlee9823
 
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men 🔝malwa🔝 Escorts Ser...
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men  🔝malwa🔝   Escorts Ser...➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men  🔝malwa🔝   Escorts Ser...
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men 🔝malwa🔝 Escorts Ser...amitlee9823
 
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...ZurliaSoop
 
Probability Grade 10 Third Quarter Lessons
Probability Grade 10 Third Quarter LessonsProbability Grade 10 Third Quarter Lessons
Probability Grade 10 Third Quarter LessonsJoseMangaJr1
 
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night StandCall Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Standamitlee9823
 
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...amitlee9823
 
BigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxBigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxolyaivanovalion
 

Recently uploaded (20)

FESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdfFESE Capital Markets Fact Sheet 2024 Q1.pdf
FESE Capital Markets Fact Sheet 2024 Q1.pdf
 
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 7737669865 👗 Top Class Call Girl Service B...
 
Capstone Project on IBM Data Analytics Program
Capstone Project on IBM Data Analytics ProgramCapstone Project on IBM Data Analytics Program
Capstone Project on IBM Data Analytics Program
 
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
 
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al BarshaAl Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
Al Barsha Escorts $#$ O565212860 $#$ Escort Service In Al Barsha
 
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...
Call Girls Indiranagar Just Call 👗 9155563397 👗 Top Class Call Girl Service B...
 
April 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's AnalysisApril 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's Analysis
 
Predicting Loan Approval: A Data Science Project
Predicting Loan Approval: A Data Science ProjectPredicting Loan Approval: A Data Science Project
Predicting Loan Approval: A Data Science Project
 
Accredited-Transport-Cooperatives-Jan-2021-Web.pdf
Accredited-Transport-Cooperatives-Jan-2021-Web.pdfAccredited-Transport-Cooperatives-Jan-2021-Web.pdf
Accredited-Transport-Cooperatives-Jan-2021-Web.pdf
 
CebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptxCebaBaby dropshipping via API with DroFX.pptx
CebaBaby dropshipping via API with DroFX.pptx
 
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
 
(NEHA) Call Girls Katra Call Now 8617697112 Katra Escorts 24x7
(NEHA) Call Girls Katra Call Now 8617697112 Katra Escorts 24x7(NEHA) Call Girls Katra Call Now 8617697112 Katra Escorts 24x7
(NEHA) Call Girls Katra Call Now 8617697112 Katra Escorts 24x7
 
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night StandCall Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Bellandur ☎ 7737669865 🥵 Book Your One night Stand
 
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men 🔝malwa🔝 Escorts Ser...
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men  🔝malwa🔝   Escorts Ser...➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men  🔝malwa🔝   Escorts Ser...
➥🔝 7737669865 🔝▻ malwa Call-girls in Women Seeking Men 🔝malwa🔝 Escorts Ser...
 
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...
Jual Obat Aborsi Surabaya ( Asli No.1 ) 085657271886 Obat Penggugur Kandungan...
 
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get CytotecAbortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
 
Probability Grade 10 Third Quarter Lessons
Probability Grade 10 Third Quarter LessonsProbability Grade 10 Third Quarter Lessons
Probability Grade 10 Third Quarter Lessons
 
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night StandCall Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Stand
Call Girls In Doddaballapur Road ☎ 7737669865 🥵 Book Your One night Stand
 
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...
Escorts Service Kumaraswamy Layout ☎ 7737669865☎ Book Your One night Stand (B...
 
BigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptxBigBuy dropshipping via API with DroFx.pptx
BigBuy dropshipping via API with DroFx.pptx
 

Session 02

  • 2. Prepare your R session setwd("~/Google_Drive/github/R/R_Recap/session_02") library(dplyr) library(ggplot2) How to apply models in general Understanding the problem Understanding the data Preparing the data Split in Train & Test Predicting variable selection Resampling Model Tuning of the model Evaluation => ITERATIVE PROCESS! # Iris dataset {.smaller}
  • 3. Loading the Iris dataset glimpse(iris) ## Observations: 150 ## Variables: 5 ## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, ## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, ## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, ## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, ## $ Species <fctr> setosa, setosa, setosa, setosa, se df = iris df[,-5] = scale(df[,-5]) print(any(is.na(df))) ## [1] FALSE
  • 4. Function: Split data splitdf <- function(dataframe, seed=1, percentage=0.8) { if (!is.null(seed)) set.seed(seed) index <- 1:nrow(dataframe) numTrainingSamples <- round(length(index) * percentage) trainindex <- sample(index, numTrainingSamples) trainset <- dataframe[trainindex, ] testset <- dataframe[-trainindex, ] list(trainset=trainset,testset=testset)}
  • 5. resampling the iris data df = df %>% filter(Species!= "setosa") levels(df$Species) = list("virginica"="virginica", "versicolor"=c("versicolor","setosa")) split <- splitdf(df,seed=1) train <- split$trainset test <- split$testset library(ROSE) print(sum(train$Species == "versicolor")) ## [1] 46 train <- ovun.sample(Species~.,data=train, method="under",p=0.7)$data print(sum(train$Species == "versicolor")) ## [1] 14
  • 6. Building and tuning a tree library(tree) set.seed(37) tree = tree(Species ~.,train) tree_cv = cv.tree(tree,method="misclass") print(tree_cv$size) ## [1] 3 2 1 print(tree_cv$dev) ## [1] 7 7 15 tree_pruned = prune.tree(tree,best=2)
  • 7. Test with final test set predictions = predict(tree,newdata=test) predictions = as.factor(if_else( predictions[,2]>0.5,"virginica","versicolor")) print(table(predictions,test$Species)) ## ## predictions virginica versicolor ## versicolor 1 4 ## virginica 15 0
  • 8. pruned prediction predictions_pr = predict(tree_pruned,newdata=test) predictions_pr = as.factor( if_else(predictions_pr[,2]>0.5, "virginica","versicolor")) print(table(predictions_pr,test$Species)) ## ## predictions_pr virginica versicolor ## versicolor 1 4 ## virginica 15 0
  • 9. Marketing Intelligence #not in validation set --> exclude df = df %>% dplyr::select(-iab_4) names <- c('device','os','os_version','browser' ,'browser_version','prov','day','target') df[,names] <- lapply(df[,names], factor)
  • 10. Filtering cols = sapply(df, function(x){is.integer(x)|is.numeric(x)}) cor_matrix = cor(df[,cols],as.numeric(df$target) ,method="spearman") #REALLY low threshold! names_num <- rownames(subset(cor_matrix ,abs(cor_matrix) >= 0.01)) df <- df %>% dplyr::select(one_of(names_num,names)) #Random Forest cannot handly NAs df$prov <- NULL
  • 11. Resample splits = splitdf(df,percentage=0.9) test = splits$testset train = splits$trainset train <- ovun.sample(formula=target~.,data=train ,method="under", p=0.5)$data rm(df) rm(splits)
  • 12. Random forest library(randomForest) model <- randomForest(target~., data=train, importance=TRUE) probs <- predict(model, type="prob", newdata = test) predictions <- data.frame(target=test$target , pred=probs[,2])
  • 13. Threshold detection library(pROC) getThreshold <- function(predictions){ myROC <- roc(formula = target ~ pred, predictions) optimalThreshold <- coords(myROC, "best" , ret = "threshold") } threshold <- getThreshold(predictions) tbl = table(predictions$target, predictions$pred > threshold)
  • 14. Compare results print(tbl) ## ## FALSE TRUE ## 0 133536 55952 ## 1 123 235 auc = auc(predictions$target, predictions$pred) accuracy <- (tbl[1,1] + tbl[2,2]) / sum(tbl) F1 <- (2*(tbl[1,1]))/((2*(tbl[1,1]))+tbl[2,1]+tbl[1,2]) print(cat("Accuracy:",accuracy,"F1:",F1,"AUC:",auc,"")) ## Accuracy: 0.704629 F1: 0.8264722 AUC: 0.7253162 NULL
  • 15. Variable importance #high values wanted print(head(importance(model)[,3:4])) ## MeanDecreaseAccuracy MeanDecreaseGini ## freq 17.55651 213.18706 ## adnexus 22.43981 112.44327 ## bsw_rubicon 24.14906 150.20596 ## bsw_google 32.64226 152.25962 ## bsw_adconductor 13.30819 77.42707 ## bsw_pubmatic 13.85020 59.55289
  • 17. K-Fold CV set.seed(1) folds <- sample(rep(1:10,length=nrow(train))) cv_error <- vector() sequence = seq(1,10) for(k in sequence){ model <- randomForest(target~.,data=train[folds!=k,]) pred <- predict(model,train[folds==k,],type="class") cv_error[k] <- mean(train[folds==k,]$target==pred) }
  • 18. ggplot2 q1 = qplot(y=cv_error,x=sequence,xlab="Iteration", ylab="CV Error",main="CV Error per iteration")+ geom_line(colour=I("blue"))+ geom_point(aes(x = which(cv_error==min(cv_error)), y = min(cv_error)), color = I("red"))+ geom_hline(aes(yintercept=max(cv_error)), color = I("red"),linetype = 2)
  • 19. ggplot2 output 0.64 0.65 0.66 0.67 0.68 2.5 5.0 7.5 10.0 Iteration CVError CV Error per iteration
  • 20. Other ideas to play with 1. mtry (default = sqrt(p)): number of variables per tree 2. ntree: number of trees