SlideShare uma empresa Scribd logo
1 de 29
Baixar para ler offline
R + Hadoop = Big Data Analytics
6
7
8
9
library(rmr)

mapreduce(…)
lapply(data, function)

mapreduce(big.data, map = function)
Expose MR   Hide MR
                             Hive, Pig




  Rmr, Rhipe,               Cascalog,
Rmr
Dumbo, Pydoop,              Scalding,
    Hadoopy                  Scrunch




       Java,                Cascading,
        C++                  Crunch
mapreduce(input, output, map, reduce)
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
small.ints = 1:1000
lapply(small.ints, function(x) x^2)

small.ints = to.dfs(1:1000)
mapreduce(input = small.ints,
        map = function(k,v) keyval(v, v^2))

groups = rbinom(32, n = 50, prob = 0.4)
tapply(groups, groups, length)

groups = to.dfs(groups)
mapreduce(input = groups,
     map = function(k, v) keyval(v,1),
     reduce = function(k,vv)
               keyval(k, length(vv)))
condition = function(x) x > 10


out = mapreduce(
     input = input,
     map = function(k,v)
          if (condition(v)) keyval(k,v))
kmeans =
 function(points, ncenters, iterations = 10,    distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) {  newCenters =
kmeans.iter(points, distfun, centers = newCenters)} newCenters}




kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs(
   mapreduce(
    input = points,      map = if (is.null(centers)) {        function(k,v) keyval(sample(1:ncenters,1),v)}           else
{         function(k,v) {             distances = apply(centers, 1, function(c) distfun(c,v))
keyval(centers[which.min(distances),], v)}},      reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2,
mean))),    to.data.frame = T)}
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k=4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
   last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
   initial_centroids = initial_centroids + str(last_centroids[i])
   if i!=k-1:
       initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
            DEFINE find_centroid FindCentroid('$centroids');
            raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
            centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
            grouped = group centroided by centroid;
            result = foreach grouped generate group, AVG(centroided.gpa);
            store result into 'output';
          """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
   Q = P.bind({'centroids':initial_centroids})
   results = Q.runSingle()
if results.isSuccessful() == "FAILED":
      raise "Pig job failed"
  iter = results.result("result").iterator()
  centroids = [None] * k
  distance_move = 0
  # get new centroid of this iteration, caculate the moving distance with last iteration
  for i in range(k):
      tuple = iter.next()
      centroids[i] = float(str(tuple.get(1)))
      distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
  distance_move = distance_move / k;
  Pig.fs("rmr output")
  print("iteration " + str(iter_num))
  print("average distance moved: " + str(distance_move))
  if distance_move<tolerance:
      sys.stdout.write("k-means converged at centroids: [")
      sys.stdout.write(",".join(str(v) for v in centroids))
      sys.stdout.write("]n")
      converged = True
      break
  last_centroids = centroids[:]
  initial_centroids = ""
  for i in range(k):
      initial_centroids = initial_centroids + str(last_centroids[i])
      if i!=k-1:
          initial_centroids = initial_centroids + ":"
  iter_num += 1

if not converged:
    print("not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
  double[] centroids;
  public FindCentroid(String initialCentroid) {
     String[] centroidStrings = initialCentroid.split(":");
     centroids = new double[centroidStrings.length];
     for (int i=0;i<centroidStrings.length;i++)
        centroids[i] = Double.parseDouble(centroidStrings[i]);
  }
  @Override
  public Double exec(Tuple input) throws IOException {
     double min_distance = Double.MAX_VALUE;
     double closest_centroid = 0;
     for (double centroid : centroids) {
        double distance = Math.abs(centroid - (Double)input.get(0));
        if (distance < min_distance) {
            min_distance = distance;
            closest_centroid = centroid;
        }
     }
     return closest_centroid;
  }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
  left.input, right.input, input,
  output,
  outer,
  map.left, map.right,
  reduce, reduce.all)
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
  …
  result = mapreduce(input = input,
              output = output)
  …
  result}
input.format, output.format, format
reduce.on.data.frame, to.data.frame
local, hadoop backends
profiling
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework
R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

Mais conteúdo relacionado

Mais procurados

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on rAbhik Seal
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceTobias Pfeiffer
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processingTim Essam
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptxNimrita Koul
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformationTim Essam
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformationLaura Hughes
 
Heaps
HeapsHeaps
HeapsIIUM
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat SheetLaura Hughes
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final TaglessJohn De Goes
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Sciencehenrygarner
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Codestasimus
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsDataconomy Media
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscexjeffz
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in ScalaBeScala
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.jsNaoyuki Totani
 

Mais procurados (20)

Data manipulation on r
Data manipulation on rData manipulation on r
Data manipulation on r
 
How fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practiceHow fast ist it really? Benchmarking in practice
How fast ist it really? Benchmarking in practice
 
Stata cheat sheet: data processing
Stata cheat sheet: data processingStata cheat sheet: data processing
Stata cheat sheet: data processing
 
Hands on data science with r.pptx
Hands  on data science with r.pptxHands  on data science with r.pptx
Hands on data science with r.pptx
 
Stata cheat sheet: data transformation
Stata  cheat sheet: data transformationStata  cheat sheet: data transformation
Stata cheat sheet: data transformation
 
Stata cheatsheet transformation
Stata cheatsheet transformationStata cheatsheet transformation
Stata cheatsheet transformation
 
ScalaMeter 2012
ScalaMeter 2012ScalaMeter 2012
ScalaMeter 2012
 
Heaps
HeapsHeaps
Heaps
 
Hammurabi
HammurabiHammurabi
Hammurabi
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat Sheet
 
The Death of Final Tagless
The Death of Final TaglessThe Death of Final Tagless
The Death of Final Tagless
 
Academy PRO: ES2015
Academy PRO: ES2015Academy PRO: ES2015
Academy PRO: ES2015
 
Slides
SlidesSlides
Slides
 
Clojure for Data Science
Clojure for Data ScienceClojure for Data Science
Clojure for Data Science
 
Groovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony CodeGroovy vs Boilerplate and Ceremony Code
Groovy vs Boilerplate and Ceremony Code
 
Cloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forestsCloudera - A Taste of random decision forests
Cloudera - A Taste of random decision forests
 
深入浅出Jscex
深入浅出Jscex深入浅出Jscex
深入浅出Jscex
 
Probabilistic Programming in Scala
Probabilistic Programming in ScalaProbabilistic Programming in Scala
Probabilistic Programming in Scala
 
Rug hogan-10-03-2012
Rug hogan-10-03-2012Rug hogan-10-03-2012
Rug hogan-10-03-2012
 
Coffee Scriptでenchant.js
Coffee Scriptでenchant.jsCoffee Scriptでenchant.js
Coffee Scriptでenchant.js
 

Destaque

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationmoldovaictsummit2016
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...CA Technologies
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014TA Telecom
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopAvkash Chauhan
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for HadoopWilly Marroquin (WillyDevNET)
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloudluisw19
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoopEd Kohlwey
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchMichelle Kolbe
 
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from RJeffrey Breen
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...CA Technologies
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User GuideAndrey Karpov
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanApigee | Google Cloud
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with RJeffrey Breen
 

Destaque (20)

Entreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformationEntreprise mobility approach within digital transformation
Entreprise mobility approach within digital transformation
 
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
Inspire Your Enterprise and Become a Digital Change Agent Through Agile Archi...
 
Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014Startup Success = Big Data + Analytics | Cairo innovates 2014
Startup Success = Big Data + Analytics | Cairo innovates 2014
 
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management PlatformWSO2Con EU 2016: Understanding the WSO2 API Management Platform
WSO2Con EU 2016: Understanding the WSO2 API Management Platform
 
Introduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache HadoopIntroduction to Big Data Analytics on Apache Hadoop
Introduction to Big Data Analytics on Apache Hadoop
 
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
WSO2 - Forrester Guest Webinar: API Management is not Enough: You Need an API...
 
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
Accelerating R analytics with Spark and  Microsoft R Server  for HadoopAccelerating R analytics with Spark and  Microsoft R Server  for Hadoop
Accelerating R analytics with Spark and Microsoft R Server for Hadoop
 
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle CloudUKOUG - Implementing Enterprise API Management in the Oracle Cloud
UKOUG - Implementing Enterprise API Management in the Oracle Cloud
 
WSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and RoadmapWSO2 API Platform: Vision and Roadmap
WSO2 API Platform: Vision and Roadmap
 
API strategy with IBM API connect
API strategy with IBM API connectAPI strategy with IBM API connect
API strategy with IBM API connect
 
Data Hacking with RHadoop
Data Hacking with RHadoopData Hacking with RHadoop
Data Hacking with RHadoop
 
OBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage MatchOBIEE Answers Vs Data Visualization: A Cage Match
OBIEE Answers Vs Data Visualization: A Cage Match
 
Excel/R
Excel/RExcel/R
Excel/R
 
Accessing Databases from R
Accessing Databases from RAccessing Databases from R
Accessing Databases from R
 
Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)Applications of R (DataWeek 2014)
Applications of R (DataWeek 2014)
 
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
Case Study: Zain Kuwait Accelerates Digital Transformation in Telco with APIs...
 
Hp distributed R User Guide
Hp distributed R User GuideHp distributed R User Guide
Hp distributed R User Guide
 
The Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business PlanThe Business of APIs: Your 100-Day API Business Plan
The Business of APIs: Your 100-Day API Business Plan
 
R crash course
R crash courseR crash course
R crash course
 
Tapping the Data Deluge with R
Tapping the Data Deluge with RTapping the Data Deluge with R
Tapping the Data Deluge with R
 

Semelhante a R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

Monadologie
MonadologieMonadologie
Monadologieleague
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileKushagraChadha1
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For GoogleEleanor McHugh
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語ikdysfm
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdfkesav24
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIDr. Volkan OBAN
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Evgeny Borisov
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1Ke Wei Louis
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en PythonJavier Abadía
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions Dr. Volkan OBAN
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changeshayato
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programmingLukasz Dynowski
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017Szymon Matejczyk
 

Semelhante a R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework (20)

RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
 
Jan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoopJan 2012 HUG: RHadoop
Jan 2012 HUG: RHadoop
 
Monadologie
MonadologieMonadologie
Monadologie
 
CLUSTERGRAM
CLUSTERGRAMCLUSTERGRAM
CLUSTERGRAM
 
Algorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical FileAlgorithm Design and Analysis - Practical File
Algorithm Design and Analysis - Practical File
 
Go: It's Not Just For Google
Go: It's Not Just For GoogleGo: It's Not Just For Google
Go: It's Not Just For Google
 
Oh Composable World!
Oh Composable World!Oh Composable World!
Oh Composable World!
 
Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語Haskellで学ぶ関数型言語
Haskellで学ぶ関数型言語
 
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
Implement the following sorting algorithms  Bubble Sort  Insertion S.pdfImplement the following sorting algorithms  Bubble Sort  Insertion S.pdf
Implement the following sorting algorithms Bubble Sort Insertion S.pdf
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Advanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part IIAdvanced Data Visualization Examples with R-Part II
Advanced Data Visualization Examples with R-Part II
 
Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2Groovy puzzlers jug-moscow-part 2
Groovy puzzlers jug-moscow-part 2
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
 
Retos de Programación en Python
Retos de Programación en PythonRetos de Programación en Python
Retos de Programación en Python
 
ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions ggtimeseries-->ggplot2 extensions
ggtimeseries-->ggplot2 extensions
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
Python 101 language features and functional programming
Python 101 language features and functional programmingPython 101 language features and functional programming
Python 101 language features and functional programming
 
Scala by Luc Duponcheel
Scala by Luc DuponcheelScala by Luc Duponcheel
Scala by Luc Duponcheel
 
How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017How to extend map? Or why we need collections redesign? - Scalar 2017
How to extend map? Or why we need collections redesign? - Scalar 2017
 
Script jantung copy
Script jantung   copyScript jantung   copy
Script jantung copy
 

Mais de Revolution Analytics

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudRevolution Analytics
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureRevolution Analytics
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudRevolution Analytics
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondRevolution Analytics
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source CommunitiesRevolution Analytics
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with RRevolution Analytics
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceRevolution Analytics
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudRevolution Analytics
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorRevolution Analytics
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalRevolution Analytics
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint packageRevolution Analytics
 

Mais de Revolution Analytics (20)

Speeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the CloudSpeeding up R with Parallel Programming in the Cloud
Speeding up R with Parallel Programming in the Cloud
 
Migrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to AzureMigrating Existing Open Source Machine Learning to Azure
Migrating Existing Open Source Machine Learning to Azure
 
R in Minecraft
R in Minecraft R in Minecraft
R in Minecraft
 
The case for R for AI developers
The case for R for AI developersThe case for R for AI developers
The case for R for AI developers
 
Speed up R with parallel programming in the Cloud
Speed up R with parallel programming in the CloudSpeed up R with parallel programming in the Cloud
Speed up R with parallel programming in the Cloud
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R Then and Now
R Then and NowR Then and Now
R Then and Now
 
Predicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per SecondPredicting Loan Delinquency at One Million Transactions per Second
Predicting Loan Delinquency at One Million Transactions per Second
 
Reproducible Data Science with R
Reproducible Data Science with RReproducible Data Science with R
Reproducible Data Science with R
 
The Value of Open Source Communities
The Value of Open Source CommunitiesThe Value of Open Source Communities
The Value of Open Source Communities
 
The R Ecosystem
The R EcosystemThe R Ecosystem
The R Ecosystem
 
R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)R at Microsoft (useR! 2016)
R at Microsoft (useR! 2016)
 
Building a scalable data science platform with R
Building a scalable data science platform with RBuilding a scalable data science platform with R
Building a scalable data science platform with R
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 
The Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data ScienceThe Business Economics and Opportunity of Open Source Data Science
The Business Economics and Opportunity of Open Source Data Science
 
Taking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the CloudTaking R Analytics to SQL and the Cloud
Taking R Analytics to SQL and the Cloud
 
The Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductorThe Network structure of R packages on CRAN & BioConductor
The Network structure of R packages on CRAN & BioConductor
 
The network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 finalThe network structure of cran 2015 07-02 final
The network structure of cran 2015 07-02 final
 
Simple Reproducibility with the checkpoint package
Simple Reproducibilitywith the checkpoint packageSimple Reproducibilitywith the checkpoint package
Simple Reproducibility with the checkpoint package
 
R at Microsoft
R at MicrosoftR at Microsoft
R at Microsoft
 

Último

Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfAlex Barbosa Coqueiro
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfSeasiaInfotech2
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Mattias Andersson
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024Lorenzo Miniero
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clashcharlottematthew16
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Manik S Magar
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):comworks
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Mark Simos
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticscarlostorres15106
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr BaganFwdays
 
Artificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxArtificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxhariprasad279825
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsSergiu Bodiu
 
What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024Stephanie Beckett
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyAlfredo García Lavilla
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr LapshynFwdays
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embeddingZilliz
 

Último (20)

Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdf
 
The Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdfThe Future of Software Development - Devin AI Innovative Approach.pdf
The Future of Software Development - Devin AI Innovative Approach.pdf
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024SIP trunking in Janus @ Kamailio World 2024
SIP trunking in Janus @ Kamailio World 2024
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
Powerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time ClashPowerpoint exploring the locations used in television show Time Clash
Powerpoint exploring the locations used in television show Time Clash
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):CloudStudio User manual (basic edition):
CloudStudio User manual (basic edition):
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
 
"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan"ML in Production",Oleksandr Bagan
"ML in Production",Oleksandr Bagan
 
Artificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptxArtificial intelligence in cctv survelliance.pptx
Artificial intelligence in cctv survelliance.pptx
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platforms
 
What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easy
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embedding
 

R + Hadoop = Big Data Analytics. How Revolution Analytics' RHadoop Project Allows All Developers to Leverage the MapReduce Framework

  • 1. R + Hadoop = Big Data Analytics
  • 2.
  • 3.
  • 4.
  • 5.
  • 6. 6
  • 7. 7
  • 8. 8
  • 9. 9
  • 10.
  • 11.
  • 12.
  • 15. Expose MR Hide MR Hive, Pig Rmr, Rhipe, Cascalog, Rmr Dumbo, Pydoop, Scalding, Hadoopy Scrunch Java, Cascading, C++ Crunch
  • 18. small.ints = 1:1000 lapply(small.ints, function(x) x^2) small.ints = to.dfs(1:1000) mapreduce(input = small.ints, map = function(k,v) keyval(v, v^2)) groups = rbinom(32, n = 50, prob = 0.4) tapply(groups, groups, length) groups = to.dfs(groups) mapreduce(input = groups, map = function(k, v) keyval(v,1), reduce = function(k,vv) keyval(k, length(vv)))
  • 19. condition = function(x) x > 10 out = mapreduce( input = input, map = function(k,v) if (condition(v)) keyval(k,v))
  • 20. kmeans = function(points, ncenters, iterations = 10, distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) { newCenters = kmeans.iter(points, distfun, ncenters = ncenters) for(i in 1:iterations) { newCenters = kmeans.iter(points, distfun, centers = newCenters)} newCenters} kmeans.iter = function(points, distfun, ncenters = dim(centers)[1], centers = NULL) { from.dfs( mapreduce( input = points, map = if (is.null(centers)) { function(k,v) keyval(sample(1:ncenters,1),v)} else { function(k,v) { distances = apply(centers, 1, function(c) distfun(c,v)) keyval(centers[which.min(distances),], v)}}, reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))), to.data.frame = T)}
  • 21. #!/usr/bin/python import sys from math import fabs from org.apache.pig.scripting import Pig filename = "student.txt" k=4 tolerance = 0.01 MAX_SCORE = 4 MIN_SCORE = 0 MAX_ITERATION = 100 # initial centroid, equally divide the space initial_centroids = "" last_centroids = [None] * k for i in range(k): last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE) initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" P = Pig.compile("""register udf.jar DEFINE find_centroid FindCentroid('$centroids'); raw = load 'student.txt' as (name:chararray, age:int, gpa:double); centroided = foreach raw generate gpa, find_centroid(gpa) as centroid; grouped = group centroided by centroid; result = foreach grouped generate group, AVG(centroided.gpa); store result into 'output'; """) converged = False iter_num = 0 while iter_num<MAX_ITERATION: Q = P.bind({'centroids':initial_centroids}) results = Q.runSingle()
  • 22. if results.isSuccessful() == "FAILED": raise "Pig job failed" iter = results.result("result").iterator() centroids = [None] * k distance_move = 0 # get new centroid of this iteration, caculate the moving distance with last iteration for i in range(k): tuple = iter.next() centroids[i] = float(str(tuple.get(1))) distance_move = distance_move + fabs(last_centroids[i]-centroids[i]) distance_move = distance_move / k; Pig.fs("rmr output") print("iteration " + str(iter_num)) print("average distance moved: " + str(distance_move)) if distance_move<tolerance: sys.stdout.write("k-means converged at centroids: [") sys.stdout.write(",".join(str(v) for v in centroids)) sys.stdout.write("]n") converged = True break last_centroids = centroids[:] initial_centroids = "" for i in range(k): initial_centroids = initial_centroids + str(last_centroids[i]) if i!=k-1: initial_centroids = initial_centroids + ":" iter_num += 1 if not converged: print("not converge after " + str(iter_num) + " iterations") sys.stdout.write("last centroids: [") sys.stdout.write(",".join(str(v) for v in last_centroids)) sys.stdout.write("]n")
  • 23. import java.io.IOException; import org.apache.pig.EvalFunc; import org.apache.pig.data.Tuple; public class FindCentroid extends EvalFunc<Double> { double[] centroids; public FindCentroid(String initialCentroid) { String[] centroidStrings = initialCentroid.split(":"); centroids = new double[centroidStrings.length]; for (int i=0;i<centroidStrings.length;i++) centroids[i] = Double.parseDouble(centroidStrings[i]); } @Override public Double exec(Tuple input) throws IOException { double min_distance = Double.MAX_VALUE; double closest_centroid = 0; for (double centroid : centroids) { double distance = Math.abs(centroid - (Double)input.get(0)); if (distance < min_distance) { min_distance = distance; closest_centroid = centroid; } } return closest_centroid; } }
  • 24. mapreduce(mapreduce(… mapreduce(input = c(input1, input2), …) equijoin = function( left.input, right.input, input, output, outer, map.left, map.right, reduce, reduce.all)
  • 25. out1 = mapreduce(…) mapreduce(input = out1, <xyz>) mapreduce(input = out1, <abc>) abstract.job = function(input, output, …) { … result = mapreduce(input = input, output = output) … result}
  • 26. input.format, output.format, format reduce.on.data.frame, to.data.frame local, hadoop backends profiling