SlideShare uma empresa Scribd logo
1 de 23
Introduction to Scalding and Monoids
  

Hugo	
  Gävert	
  
@hgavert	
  
	
  
Map	
  Reduce	
  
•  Programming	
  model	
  for	
  processing	
  large	
  data	
  sets	
  
with	
  a	
  parallel,	
  distributed	
  algorithm	
  on	
  a	
  cluster.	
  
•  Inspired by map and reduce functions commonly found in functional programming languages
  
•  map() performs translations and filtering on given values
  
•  reduce() performs summary operation on given values
  
How	
  does	
  it	
  work?	
  

Found	
  this	
  from	
  the	
  Internet,	
  forgot	
  from	
  where	
  
The	
  scene	
  
•  Hadoop – open source implementation of Google's MapReduce and Google File System papers
  
•  Java…	
  
•  Higher	
  level	
frameworks/platforms
  
–  Hive	
  ≈	
  SQL	
  
–  Pig	
  	
  	
  (procedural	
  ≈	
  “more	
  programming	
  than	
  SQL”)	
  
–  Cascading	
  –	
  Java	
  MR	
application
  framework	
  for	
  enterprise	
  data	
  flows	
  
•  If	
  you	
  must	
  do	
  Java,	
  do	
  this!	
  

–  Scalding	
  	
  -­‐	
  Scala	
  DSL	
  for	
  Cascading,	
  easy	
  to	
  pick	
  up	
  yet	
  very	
  
powerful	
  
–  Cascalog	
  –	
  Clojure	
  DSL	
  for	
  Cascading,	
declarative,
  logic	
  
programming	
  
The	
  scene	
  (*)	
  

*	
  Borrowed	
  from	
  excellent	
presentation
  by	
  Vitaly	
  Gordon	
  and	
  Christopher	
  Severs	
  	
  
“Hadoop	
  is	
  a	
  distributed	
  system	
  
for	
counting
  words”	
  
package	
  org.myorg;	
  
	
  	
  	
  
import	
  java.io.IOException;	
  
import	
  java.util.*;	
  
	
  	
  
import	
  org.apache.hadoop.fs.Path;	
  
import	
  org.apache.hadoop.conf.*;	
  
import	
  org.apache.hadoop.io.*;	
  
import	
  org.apache.hadoop.mapred.*;	
  
import	
  org.apache.hadoop.util.*;	
  
	
  	
  
public	
  class	
  WordCount	
  {	
  
	
  	
  
	
  	
  	
  	
  public	
  static	
  class	
  Map	
  extends	
  MapReduceBase	
  implements	
  
Mapper<LongWritable,	
  Text,	
  Text,	
  IntWritable>	
  {	
  
	
  	
  	
  	
  	
  	
  private	
  final	
  static	
  IntWritable	
  one	
  =	
  new	
  
IntWritable(1);	
  
	
  	
  	
  	
  	
  	
  private	
  Text	
  word	
  =	
  new	
  Text();	
  
	
  	
  
	
  	
  	
  	
  	
  	
  public	
  void	
  map(LongWritable	
  key,	
  Text	
  value,	
  
OutputCollector<Text,	
  IntWritable>	
  output,	
  Reporter	
  reporter)	
  
throws	
  IOException	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  String	
  line	
  =	
  value.toString();	
  
	
  	
  	
  	
  	
  	
  	
  	
  StringTokenizer	
  tokenizer	
  =	
  new	
  StringTokenizer(line);	
  
	
  	
  	
  	
  	
  	
  	
  	
  while	
  (tokenizer.hasMoreTokens())	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  word.set(tokenizer.nextToken());	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  output.collect(word,	
  one);	
  
	
  	
  	
  	
  	
  	
  	
  	
  }	
  
	
  	
  	
  	
  	
  	
  }	
  
	
  	
  	
  	
  }	
  
	
  	
  

	
  	
  
	
  	
  	
  	
  public	
  static	
  class	
  Reduce	
  extends	
  MapReduceBase	
  implements	
  
Reducer<Text,	
  IntWritable,	
  Text,	
  IntWritable>	
  {	
  
	
  	
  	
  	
  	
  	
  public	
  void	
  reduce(Text	
  key,	
  Iterator<IntWritable>	
  values,	
  
OutputCollector<Text,	
  IntWritable>	
  output,	
  Reporter	
  reporter)	
  
throws	
  IOException	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  int	
  sum	
  =	
  0;	
  
	
  	
  	
  	
  	
  	
  	
  	
  while	
  (values.hasNext())	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  sum	
  +=	
  values.next().get();	
  
	
  	
  	
  	
  	
  	
  	
  	
  }	
  
	
  	
  	
  	
  	
  	
  	
  	
  output.collect(key,	
  new	
  IntWritable(sum));	
  
	
  	
  	
  	
  	
  	
  }	
  
	
  	
  	
  	
  }	
  
	
  	
  
	
  	
  	
  	
  public	
  static	
  void	
  main(String[]	
  args)	
  throws	
  Exception	
  {	
  
	
  	
  	
  	
  	
  	
  JobConf	
  conf	
  =	
  new	
  JobConf(WordCount.class);	
  
	
  	
  	
  	
  	
  	
  conf.setJobName("wordcount");	
  
	
  	
  
	
  	
  	
  	
  	
  	
  conf.setOutputKeyClass(Text.class);	
  
	
  	
  	
  	
  	
  	
  conf.setOutputValueClass(IntWritable.class);	
  
	
  	
  
	
  	
  	
  	
  	
  	
  conf.setMapperClass(Map.class);	
  
	
  	
  	
  	
  	
  	
  conf.setCombinerClass(Reduce.class);	
  
	
  	
  	
  	
  	
  	
  conf.setReducerClass(Reduce.class);	
  
	
  	
  
	
  	
  	
  	
  	
  	
  conf.setInputFormat(TextInputFormat.class);	
  
	
  	
  	
  	
  	
  	
  conf.setOutputFormat(TextOutputFormat.class);	
  
	
  	
  
	
  	
  	
  	
  	
  	
  FileInputFormat.setInputPaths(conf,	
  new	
  Path(args[0]));	
  
	
  	
  	
  	
  	
  	
  FileOutputFormat.setOutputPath(conf,	
  new	
  Path(args[1]));	
  
	
  	
  
	
  	
  	
  	
  	
  	
  JobClient.runJob(conf);	
  
	
  	
  	
  	
  }	
  
}	
  
	
  
What	
  do	
  we	
  actually	
  want	
  to	
  do?	
  
Documents	
  
(lines)	
  

Tokenize	
  

GroupBy	
  
(token)	
  

Count	
  

Word	
  
count	
  
Word	
  Count	
  in	
  Scalding	
  
•  asd	
  

package	
  com.sanoma.cda.examples	
  
import	
  com.twitter.scalding._	
  
	
  	
  
class	
  WordCount1(args:	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  TextLine(args("input"))	
  
	
  	
  	
  	
  .flatMap('line	
  -­‐>	
  'word)	
  {	
  line:	
  String	
  =>	
line.split("\\s+")
  }	
  
	
  	
  	
  	
  .groupBy('word)	
  {	
  _.size	
  }	
  
	
  	
  	
  	
  .write(Tsv(args("output")))	
  
}	
  
There	
  is	
  scald.rb	
  to	
  get	
  you	
  started	
  (get	
  it	
  from	
  Github	
  project)	
  
	
  
Building	
  and	
  running	
  a	
  fat	
  jar	
  (for	
  local,	
  include	
  hadoop,	
  for	
  cluster	
  mark	
  it	
  “provided”):	
  
> sbt assembly
> java -jar target/scala-2.10/scalding_talk-assembly-0.1.jar
com.sanoma.cda.examples.WordCount1 --local
--input data/11.txt.utf-8 --output wc.txt
> hadoop jar job-jars/scalding_talk-assembly-0.1.jar
--Dmapred.reduce.tasks=70 com.sanoma.cda.examples.WordCount1 --hdfs
--input /data/AliceInWonderland --output /user/Alice_wc
	
  

the
and
to
a
of
she
said
in
it
was
you
I
as
that
Alice
…	
  
Alice,
Alice.
Alice;
Alice's
Alice:
(Alice
Alice!
Alice,)

	
  1664	
  
	
  1172	
  
	
  780	
  
	
  773	
  
	
  662	
  
	
  596	
  
	
  484	
  
	
  416	
  
	
  401	
  
	
  356	
  
	
  329	
  
	
  301	
  
	
  260	
  
	
  246	
  
	
  226	
  
	
  221	
  
	
  76	
  
	
  54	
  
	
  16	
  
	
  11	
  
	
  7	
  
	
  4	
  
	
  3	
  
	
  2	
  
Word	
  Count	
  in	
  Scalding	
  
•  asd	
  

package	
  com.sanoma.cda.examples	
  
import	
  com.twitter.scalding._	
  
	
  	
  
class	
  WordCount2(args:	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  TextLine(args("input"))	
  
	
  	
  	
  	
  .flatMap('line	
  -­‐>	
  'word)	
  {	
  line:	
  String	
  =>	
  tokenize(line)	
  }	
  
	
  	
  	
  	
  .filter('word)	
  {	
  word:	
  String	
  =>	
  word	
  !=	
  ""	
  }	
  
	
  	
  	
  	
  .groupBy('word)	
  {	
  _.size	
  }	
  
	
  
	
  	
  	
  	
  .groupAll{	
  _.sortBy(('size,	
  'word)).reverse	
  }	
  //	
  this	
  is	
  just	
  for	
  easy	
  results	
  
	
  	
  	
  	
  .write(Tsv(args("output")))	
  
	
  	
  
	
  	
  def	
  tokenize(text:	
  String):	
  Array[String]	
  =	
  {	
  
	
  	
  	
  	
text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")
  
	
  	
  }	
  
}	
  

the
and
to
a
of
it
she
said
you
in
i
alice
was
that
as
her
with
at
on
all

	
  1804	
  
	
  912	
  
	
  801	
  
	
  684	
  
	
  625	
  
	
  541	
  
	
  538	
  
	
  462	
  
	
  429	
  
	
  428	
  
	
  400	
  
	
  385	
  
	
  358	
  
	
  291	
  
	
  272	
  
	
  248	
  
	
  228	
  
	
  224	
  
	
  204	
  
	
  197	
  
Word	
  count	
  in	
  Scalding	
  
Almost 1-to-1 relation
  between	
  the	
  
process	
  and	
  the	
  Scalding	
  code!	
  
	
  
UDFs	
  directly	
  in	
  Scala	
  
And	
  Java	
  libraries	
  can	
  be	
  used	
  

Documents	
  
(lines)	
  

Tokenize	
  

package	
  com.sanoma.cda.examples	
  
import	
  com.twitter.scalding._	
  
	
  	
  
class	
  WordCount2(args:	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  TextLine(args("input"))	
  
	
  	
  	
  	
  .flatMap('line	
  -­‐>	
  'word)	
  {	
  tokenize	
  }	
  
	
  	
  	
  	
  .groupBy('word)	
  {	
  _.size	
  }	
  
	
  	
  	
  	
  .write(Tsv(args("output")))	
  
	
  	
  
	
  	
  def	
  tokenize(text:	
  String):	
  Array[String]	
  =	
  {	
  
	
  	
  	
  	
text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")
  
	
  	
  }	
  
}	
  
	
  

GroupBy	
  
(token)	
  

Count	
  

Word	
  
count	
  
About	
  Scalding	
  
•  Started at Twitter – years of production use
  
•  Well tested and optimized by different teams, including Twitter, Concurrent Inc., Etsy, …
  
•  Has	
  very	
  fast	
  local	
  mode	
  (no	
  need	
  to	
  install	
  
Hadoop	
  locally)	
  
•  Flow	
  planner	
  is	
  designed	
  to	
  be	
  portable	
→
  in	
  
future,	
  the	
  same	
  jobs	
  might	
  run	
  on	
  Storm	
  cluster	
  
for	
  example	
  
•  Scala…	
  very	
  nice	
  programming	
  language	
  –	
  YMMV	
  
–  Functional
  &	
  object	
  oriented,	
  has	
  REPL	
  
Scalding	
Functions
  
•  3	
  APIs:	
  

–  Fields-­‐based	
  API	
  –	
  easy	
  to	
  start	
  from	
  here	
  
–  Type-­‐safe	
  API	
  
–  Matrix	
  API	
  

•  Field-­‐based	
  API	
  

–  Map-­‐like	
functions
  

•  map,	
  flatMap,	
  project,	
  insert,	
  filter,	
  limit…	
  

–  Grouping/reducing	
functions
  

•  groupBy,	
  groupAll	
  
•  .size,	
  .sum,	
  .average,	
  .sizeAveStdev,	
  .toList,	
  .max,	
  
sortBy,	
  .reduce,	
.foldLeft,
  .pivot,	
  …	
  

–  Join	
Operations
  

•  joinWithSmaller,	
  joinWithLarger,	
  joinWithTiny,	
  crossWithTiny	
  
•  InnerJoin,	
LeftJoin,
  RightJoin,	
  OuterJoin	
  
Scalding	
  matrix	
  API	
  
package	
  com.twitter.scalding.examples	
  
import	
  com.twitter.scalding._	
  
import	
  com.twitter.scalding.mathematics.Matrix	
  
	
  	
  
/**	
  
*	
  Loads	
  a	
  directed	
  graph	
  adjacency	
  matrix	
  where	
  a[i,j]	
  =	
  1	
  if	
  there	
  is	
  an	
  edge	
  from	
  a[i]	
  to	
  b[j]	
  
*	
  and	
  computes	
  the	
  cosine	
  of	
  the	
  angle	
  between	
  every	
  two	
  pairs	
  of	
  vectors	
  
*/	
  
class	
  ComputeCosineJob(args	
  :	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  import	
  Matrix._	
  
	
  	
  
	
  	
  val	
  adjacencyMatrix	
  =	
  Tsv(	
  args("input"),	
  ('user1,	
  'user2,	
  'rel)	
  )	
  
	
  	
  	
  	
  .read	
  
	
  	
  	
  	
  .toMatrix[Long,Long,Double]('user1,	
  'user2,	
  'rel)	
  
	
  	
  
	
  	
  //	
  we	
  compute	
  the	
  L2	
  normalized	
  adjacency	
  graph	
  	
  
	
  	
  val	
  matL2Norm	
  =	
  adjacencyMatrix.rowL2Normalize	
  
	
  
	
  	
  //	
  we	
  compute	
  the	
  innerproduct	
  of	
  the	
  normalized	
  matrix	
  with	
  itself	
  
	
  	
  //	
  which	
  is	
  equivalent	
  with	
  computing	
  cosine:	
  AA^T	
  /	
  ||A||	
  *	
  ||A||	
  
	
  	
  val	
  cosDist	
  =	
  matL2Norm	
  *	
  matL2Norm.transpose	
  
	
  
	
  	
cosDist.write(Tsv(args("output")))
  
}	
  
	
  
What	
  is	
  a	
  monoid?	
  
•  Closure	
  

∀a, b ∈ T : a • b ∈ T

•  Associativity
  

∀a, b, c ∈ T : (a • b)•c = a •(b •c)

•  Identity
  element	
  

∃I ∈ T : ∀a ∈ T : I • a = a • I = a

Scala	
  trait:	
  

trait	
  Monoid[T]	
  {	
  
	
  	
  	
  def	
  zero:	
  T	
  
	
  	
  	
  def	
  plus(left:	
  T,	
  right:	
  T):	
  T	
  
}	
  
Examples	
  of	
  monoids	
  
•  Numbers,	
  String,	
  list,	
  set,	
  map	
  
•  Algorithms:	
  	
  
–  Min,	
  Max	
  
–  Moments	
  (count,	
  mean,	
  std,	
  …)	
  
–  Approximate	
  histograms,	
quantiles
  
–  Approximate	
  data	
  structures	
  
•  Bloom	
  Filter,	
  CountMinSketch,	
  HyperLogLog	
  

–  Stochastic
  gradient	
  descent	
  
What’s	
  the	
  point?	
  
a0	
  +	
  a1	
  +	
  a2	
  +	
  a3	
  +	
  a4	
  +	
  a5	
  +	
  a6	
  +	
  a7	
  
	
  

(a0	
  +	
  a1)	
  +	
  (a2	
  +	
  a3)	
  +	
  (a4	
  +	
  a5)	
  +	
  (a6	
  +	
  a7)	
  
	
  

(	
  (a0	
  +	
  a1)	
  +	
  (a2	
  +	
  a3)	
  )	
  +	
  (	
  (a4	
  +	
  a5)	
  +	
  (a6	
  +	
  a7)	
  )	
  
	
  

(	
  (	
  (a0	
  +	
  a1)	
  +	
  (a2	
  +	
  a3)	
  )	
  +	
  (	
  (a4	
  +	
  a5)	
  +	
  (a6	
  +	
  a7)	
  )	
  )	
  
→ Parallelism
  
What’s	
  the	
  point?	
  
a0	
  +	
  a1	
  +	
  a2	
  +	
  a3	
  +	
  a4	
  +	
  a5	
  +	
  a6	
  +	
  a7	
  
	
  

	
  	
  	
  	
  	
  	
  	
  	
  (a0	
  +	
  a1	
  +	
  a2	
  +	
  a3	
  +	
  a4	
  +	
  a5	
  +	
  a6	
  +	
  a7)	
  +	
  a8	
  
	
  

→ Incremental aggregation
  
What’s	
  the	
  point?	
  
•  Easily	
  unit	
  testable	
operations
  
•  Simple	
aggregation
  code	
  

→ Better quality
  
Word	
  Count	
  with	
  Map	
  Monoid	
  
package	
  com.sanoma.cda.examples	
  
import	
  com.twitter.scalding._	
  
import	
  com.twitter.algebird.Operators._	
  
	
  	
  
class	
  WordCount3(args:	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  TextLine(args("input"))	
  
	
  	
  	
  	
  .flatMap('line	
  -­‐>	
  'word)	
  {	
  tokenize	
  }	
  
	
  	
  	
  	
  .map('word	
  -­‐>	
  'word)	
  {	
  w:	
  String	
  =>	
  Map[String,	
  Int](w	
  -­‐>	
  1)	
  }	
  
	
  	
  	
  	
  .groupAll{	
  _.sum[Map[String,	
  Int]]('word)	
  }	
  
	
  
	
  	
  	
  	
  //	
  We	
  could	
  save	
  the	
  map	
  here,	
  but	
  if	
  we	
  want	
  similar	
  output	
  as	
  in	
  previous...	
  
	
  	
  	
  	
  .flatMap('word	
  -­‐>	
  ('word,	
  'size))	
  {	
  words:	
  Map[String,	
  Int]	
  =>	
  words.toList	
  }	
  
	
  	
  	
  	
  .groupAll{	
  _.sortBy(('size,	
  'word)).reverse	
  }	
  //	
  this	
  is	
  just	
  for	
  easy	
  results	
  
	
  	
  	
  	
  .write(Tsv(args("output")))	
  
	
  	
  
	
  	
  def	
  tokenize(text:	
  String):	
  Array[String]	
  =	
  {	
  
	
  	
  	
  	
text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+").filter( _ != "")
  
	
  	
  }	
  
}	
  

the
and
to
a
of
it
she
said
you
in
i
alice
was
that
as
her
with
at
on
all

	
  1804	
  
	
  912	
  
	
  801	
  
	
  684	
  
	
  625	
  
	
  541	
  
	
  538	
  
	
  462	
  
	
  429	
  
	
  428	
  
	
  400	
  
	
  385	
  
	
  358	
  
	
  291	
  
	
  272	
  
	
  248	
  
	
  228	
  
	
  224	
  
	
  204	
  
	
  197	
  
Top	
  Words	
  with	
  CMS	
  
•  asd	
  

package	
  com.sanoma.cda.examples	
  
import	
  com.twitter.scalding._	
  
import	
  com.twitter.algebird._	
  
	
  	
  
class	
  WordCount5(args:	
  Args)	
  extends	
  Job(args)	
  {	
  
	
  	
  implicit	
  def	
  utf8(s:	
  String):	
  Array[Byte]	
  =	
  com.twitter.bijection.Injection.utf8(s)	
  
	
  	
  implicit	
  val	
  cmsm	
  =	
  new	
  SketchMapMonoid[String,	
  Long](128,	
  6,	
  0,	
  20)	
  //	
  top	
  20	
  
	
  	
  type	
  ApproxMap	
  =	
  SketchMap[String,	
  Long]	
  
	
  	
  
	
  	
  TextLine(args("input"))	
  
	
  	
  	
  	
  .flatMap('line	
  -­‐>	
  'word)	
  {	
  tokenize	
  }	
  
	
  	
  	
  	
  .map('word	
  -­‐>	
  'word)	
  {	
  w:	
  String	
  =>	
  cmsm.create((w,	
  1L))	
  }	
  
	
  	
  	
  	
  .groupAll{	
  _.sum[ApproxMap]('word)	
  }	
  
	
  	
  	
  	
  .flatMap('word	
  -­‐>	
  ('word,	
  'size))	
  {	
  words:	
  ApproxMap	
  =>	
  words.heavyHitters	
  }	
  
	
  	
  	
  	
  .write(Tsv(args("output")))	
  
	
  	
  
	
  	
  def	
  tokenize(text:	
  String):	
  Array[String]	
  =	
  {	
  
	
  	
  	
  	
text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+").filter( _ != "")
  
	
  	
  }	
  
}	
  

the
and
to
a
of
she
it
said
you
in
i
alice
at
was
that
her
with
as
not
be

	
  1859	
  
	
  972	
  
	
  867	
  
	
  748	
  
	
  711	
  
	
  636	
  
	
  619	
  
	
  579	
  
	
  504	
  
	
  495	
  
	
  456	
  
	
  431	
  
	
  407	
  
	
  394	
  
	
  342	
  
	
  341	
  
	
  338	
  
	
  337	
  
	
  290	
  
	
  286	
  
Start	
  using	
  Scalding	
  now!	
  :-­‐)	
  
GitHub:	
  
https://github.com/twitter/scalding
  
	
  
Tutorials:	
  
https://github.com/twitter/scalding/tree/develop/tutorial
  
	
  
	
  
Thanks!	
  
	
  
	
  
Hugo	
  Gävert	
  
Sanoma	
  

Mais conteúdo relacionado

Mais procurados

Scala introduction
Scala introductionScala introduction
Scala introductionvito jeng
 
A deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsA deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsCheng Min Chi
 
Spark schema for free with David Szakallas
Spark schema for free with David SzakallasSpark schema for free with David Szakallas
Spark schema for free with David SzakallasDatabricks
 
A Brief Intro to Scala
A Brief Intro to ScalaA Brief Intro to Scala
A Brief Intro to ScalaTim Underwood
 
Spark Schema For Free with David Szakallas
 Spark Schema For Free with David Szakallas Spark Schema For Free with David Szakallas
Spark Schema For Free with David SzakallasDatabricks
 
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...CloudxLab
 
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLabApache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLabCloudxLab
 
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...Maurice Naftalin
 
Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGMatthew McCullough
 
Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013Samir Bessalah
 
Data profiling with Apache Calcite
Data profiling with Apache CalciteData profiling with Apache Calcite
Data profiling with Apache CalciteJulian Hyde
 
Introduction to Scala for Java Developers
Introduction to Scala for Java DevelopersIntroduction to Scala for Java Developers
Introduction to Scala for Java DevelopersMichael Galpin
 
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...MongoDB
 
Scala for Java programmers
Scala for Java programmersScala for Java programmers
Scala for Java programmers輝 子安
 
Introduction to Hadoop and MapReduce
Introduction to Hadoop and MapReduceIntroduction to Hadoop and MapReduce
Introduction to Hadoop and MapReduceDr Ganesh Iyer
 

Mais procurados (20)

Scalding
ScaldingScalding
Scalding
 
Spark workshop
Spark workshopSpark workshop
Spark workshop
 
Scala introduction
Scala introductionScala introduction
Scala introduction
 
Meet scala
Meet scalaMeet scala
Meet scala
 
A deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsA deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internals
 
Spark schema for free with David Szakallas
Spark schema for free with David SzakallasSpark schema for free with David Szakallas
Spark schema for free with David Szakallas
 
A Brief Intro to Scala
A Brief Intro to ScalaA Brief Intro to Scala
A Brief Intro to Scala
 
Scala+data
Scala+dataScala+data
Scala+data
 
Spark Schema For Free with David Szakallas
 Spark Schema For Free with David Szakallas Spark Schema For Free with David Szakallas
Spark Schema For Free with David Szakallas
 
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
 
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLabApache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Key-Value RDD | Big Data Hadoop Spark Tutorial | CloudxLab
 
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...
Good and Wicked Fairies, and the Tragedy of the Commons: Understanding the Pe...
 
Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUG
 
Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013Big Data Analytics with Scala at SCALA.IO 2013
Big Data Analytics with Scala at SCALA.IO 2013
 
Data profiling with Apache Calcite
Data profiling with Apache CalciteData profiling with Apache Calcite
Data profiling with Apache Calcite
 
Introduction to Scala for Java Developers
Introduction to Scala for Java DevelopersIntroduction to Scala for Java Developers
Introduction to Scala for Java Developers
 
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...
MongoDB World 2019: Creating a Self-healing MongoDB Replica Set on GCP Comput...
 
Joy of scala
Joy of scalaJoy of scala
Joy of scala
 
Scala for Java programmers
Scala for Java programmersScala for Java programmers
Scala for Java programmers
 
Introduction to Hadoop and MapReduce
Introduction to Hadoop and MapReduceIntroduction to Hadoop and MapReduce
Introduction to Hadoop and MapReduce
 

Destaque

Spark at Twitter - Seattle Spark Meetup, April 2014
Spark at Twitter - Seattle Spark Meetup, April 2014Spark at Twitter - Seattle Spark Meetup, April 2014
Spark at Twitter - Seattle Spark Meetup, April 2014Sriram Krishnan
 
How LinkedIn Uses Scalding for Data Driven Product Development
How LinkedIn Uses Scalding for Data Driven Product DevelopmentHow LinkedIn Uses Scalding for Data Driven Product Development
How LinkedIn Uses Scalding for Data Driven Product DevelopmentSasha Ovsankin
 
Analytics with Splunk (Open edX)
Analytics with Splunk (Open edX)Analytics with Splunk (Open edX)
Analytics with Splunk (Open edX)Philippe Chiu
 
Luigi presentation OA Summit
Luigi presentation OA SummitLuigi presentation OA Summit
Luigi presentation OA SummitOpen Analytics
 
Sterilization and disinfection
Sterilization and disinfectionSterilization and disinfection
Sterilization and disinfectionTamil Silambarasan
 
Luigi presentation NYC Data Science
Luigi presentation NYC Data ScienceLuigi presentation NYC Data Science
Luigi presentation NYC Data ScienceErik Bernhardsson
 
A Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiA Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiGrowth Intelligence
 

Destaque (8)

Spark at Twitter - Seattle Spark Meetup, April 2014
Spark at Twitter - Seattle Spark Meetup, April 2014Spark at Twitter - Seattle Spark Meetup, April 2014
Spark at Twitter - Seattle Spark Meetup, April 2014
 
How LinkedIn Uses Scalding for Data Driven Product Development
How LinkedIn Uses Scalding for Data Driven Product DevelopmentHow LinkedIn Uses Scalding for Data Driven Product Development
How LinkedIn Uses Scalding for Data Driven Product Development
 
Scalding @ Coursera
Scalding @ CourseraScalding @ Coursera
Scalding @ Coursera
 
Analytics with Splunk (Open edX)
Analytics with Splunk (Open edX)Analytics with Splunk (Open edX)
Analytics with Splunk (Open edX)
 
Luigi presentation OA Summit
Luigi presentation OA SummitLuigi presentation OA Summit
Luigi presentation OA Summit
 
Sterilization and disinfection
Sterilization and disinfectionSterilization and disinfection
Sterilization and disinfection
 
Luigi presentation NYC Data Science
Luigi presentation NYC Data ScienceLuigi presentation NYC Data Science
Luigi presentation NYC Data Science
 
A Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with LuigiA Beginner's Guide to Building Data Pipelines with Luigi
A Beginner's Guide to Building Data Pipelines with Luigi
 

Semelhante a Introduction to Scalding and Monoids

Scala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghScala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghStuart Roebuck
 
Scalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInScalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInVitaly Gordon
 
JRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusJRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusKoichi Fujikawa
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopSages
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemSages
 
Introduction to Spark with Scala
Introduction to Spark with ScalaIntroduction to Spark with Scala
Introduction to Spark with ScalaHimanshu Gupta
 
Apache Spark, the Next Generation Cluster Computing
Apache Spark, the Next Generation Cluster ComputingApache Spark, the Next Generation Cluster Computing
Apache Spark, the Next Generation Cluster ComputingGerger
 
Big-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbaiBig-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbaiUnmesh Baile
 
What can be done with Java, but should better be done with Erlang (@pavlobaron)
What can be done with Java, but should better be done with Erlang (@pavlobaron)What can be done with Java, but should better be done with Erlang (@pavlobaron)
What can be done with Java, but should better be done with Erlang (@pavlobaron)Pavlo Baron
 
Emerging Languages: A Tour of the Horizon
Emerging Languages: A Tour of the HorizonEmerging Languages: A Tour of the Horizon
Emerging Languages: A Tour of the HorizonAlex Payne
 
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaAdvance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaDesing Pathshala
 
Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Gabriele Modena
 
Testing batch and streaming Spark applications
Testing batch and streaming Spark applicationsTesting batch and streaming Spark applications
Testing batch and streaming Spark applicationsŁukasz Gawron
 
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark ApplicationsFuture Processing
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in CassandraJairam Chandar
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseSages
 

Semelhante a Introduction to Scalding and Monoids (20)

Scala @ TechMeetup Edinburgh
Scala @ TechMeetup EdinburghScala @ TechMeetup Edinburgh
Scala @ TechMeetup Edinburgh
 
Scalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedInScalable and Flexible Machine Learning With Scala @ LinkedIn
Scalable and Flexible Machine Learning With Scala @ LinkedIn
 
JRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop PapyrusJRubyKaigi2010 Hadoop Papyrus
JRubyKaigi2010 Hadoop Papyrus
 
Hadoop + Clojure
Hadoop + ClojureHadoop + Clojure
Hadoop + Clojure
 
Scala in Places API
Scala in Places APIScala in Places API
Scala in Places API
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache Hadoop
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
 
Introduction to Spark with Scala
Introduction to Spark with ScalaIntroduction to Spark with Scala
Introduction to Spark with Scala
 
Hw09 Hadoop + Clojure
Hw09   Hadoop + ClojureHw09   Hadoop + Clojure
Hw09 Hadoop + Clojure
 
Osd ctw spark
Osd ctw sparkOsd ctw spark
Osd ctw spark
 
Apache Spark, the Next Generation Cluster Computing
Apache Spark, the Next Generation Cluster ComputingApache Spark, the Next Generation Cluster Computing
Apache Spark, the Next Generation Cluster Computing
 
Big-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbaiBig-data-analysis-training-in-mumbai
Big-data-analysis-training-in-mumbai
 
What can be done with Java, but should better be done with Erlang (@pavlobaron)
What can be done with Java, but should better be done with Erlang (@pavlobaron)What can be done with Java, but should better be done with Erlang (@pavlobaron)
What can be done with Java, but should better be done with Erlang (@pavlobaron)
 
Emerging Languages: A Tour of the Horizon
Emerging Languages: A Tour of the HorizonEmerging Languages: A Tour of the Horizon
Emerging Languages: A Tour of the Horizon
 
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design PathshalaAdvance Map reduce - Apache hadoop Bigdata training by Design Pathshala
Advance Map reduce - Apache hadoop Bigdata training by Design Pathshala
 
Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2
 
Testing batch and streaming Spark applications
Testing batch and streaming Spark applicationsTesting batch and streaming Spark applications
Testing batch and streaming Spark applications
 
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications
[QE 2018] Łukasz Gawron – Testing Batch and Streaming Spark Applications
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in Cassandra
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
 

Último

Emixa Mendix Meetup 11 April 2024 about Mendix Native development
Emixa Mendix Meetup 11 April 2024 about Mendix Native developmentEmixa Mendix Meetup 11 April 2024 about Mendix Native development
Emixa Mendix Meetup 11 April 2024 about Mendix Native developmentPim van der Noll
 
What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfMounikaPolabathina
 
What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024Stephanie Beckett
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 3652toLead Limited
 
Rise of the Machines: Known As Drones...
Rise of the Machines: Known As Drones...Rise of the Machines: Known As Drones...
Rise of the Machines: Known As Drones...Rick Flair
 
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024BookNet Canada
 
Scale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterScale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterMydbops
 
UiPath Community: Communication Mining from Zero to Hero
UiPath Community: Communication Mining from Zero to HeroUiPath Community: Communication Mining from Zero to Hero
UiPath Community: Communication Mining from Zero to HeroUiPathCommunity
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsPixlogix Infotech
 
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo Day
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo DayH2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo Day
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo DaySri Ambati
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024BookNet Canada
 
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...AliaaTarek5
 
Visualising and forecasting stocks using Dash
Visualising and forecasting stocks using DashVisualising and forecasting stocks using Dash
Visualising and forecasting stocks using Dashnarutouzumaki53779
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI AgeCprime
 
Long journey of Ruby standard library at RubyConf AU 2024
Long journey of Ruby standard library at RubyConf AU 2024Long journey of Ruby standard library at RubyConf AU 2024
Long journey of Ruby standard library at RubyConf AU 2024Hiroshi SHIBATA
 
Time Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsTime Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsNathaniel Shimoni
 
How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.Curtis Poe
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity PlanDatabarracks
 
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfSo einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfpanagenda
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch TuesdayIvanti
 

Último (20)

Emixa Mendix Meetup 11 April 2024 about Mendix Native development
Emixa Mendix Meetup 11 April 2024 about Mendix Native developmentEmixa Mendix Meetup 11 April 2024 about Mendix Native development
Emixa Mendix Meetup 11 April 2024 about Mendix Native development
 
What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdf
 
What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024What's New in Teams Calling, Meetings and Devices March 2024
What's New in Teams Calling, Meetings and Devices March 2024
 
Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365Ensuring Technical Readiness For Copilot in Microsoft 365
Ensuring Technical Readiness For Copilot in Microsoft 365
 
Rise of the Machines: Known As Drones...
Rise of the Machines: Known As Drones...Rise of the Machines: Known As Drones...
Rise of the Machines: Known As Drones...
 
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
 
Scale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL RouterScale your database traffic with Read & Write split using MySQL Router
Scale your database traffic with Read & Write split using MySQL Router
 
UiPath Community: Communication Mining from Zero to Hero
UiPath Community: Communication Mining from Zero to HeroUiPath Community: Communication Mining from Zero to Hero
UiPath Community: Communication Mining from Zero to Hero
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and Cons
 
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo Day
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo DayH2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo Day
H2O.ai CEO/Founder: Sri Ambati Keynote at Wells Fargo Day
 
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
New from BookNet Canada for 2024: BNC CataList - Tech Forum 2024
 
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
(How to Program) Paul Deitel, Harvey Deitel-Java How to Program, Early Object...
 
Visualising and forecasting stocks using Dash
Visualising and forecasting stocks using DashVisualising and forecasting stocks using Dash
Visualising and forecasting stocks using Dash
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI Age
 
Long journey of Ruby standard library at RubyConf AU 2024
Long journey of Ruby standard library at RubyConf AU 2024Long journey of Ruby standard library at RubyConf AU 2024
Long journey of Ruby standard library at RubyConf AU 2024
 
Time Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directionsTime Series Foundation Models - current state and future directions
Time Series Foundation Models - current state and future directions
 
How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.How AI, OpenAI, and ChatGPT impact business and software.
How AI, OpenAI, and ChatGPT impact business and software.
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity Plan
 
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdfSo einfach geht modernes Roaming fuer Notes und Nomad.pdf
So einfach geht modernes Roaming fuer Notes und Nomad.pdf
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch Tuesday
 

Introduction to Scalding and Monoids

  • 1. Introduction  to   Scalding  and  Monoids   Hugo  Gävert   @hgavert    
  • 2. Map  Reduce   •  Programming  model  for  processing  large  data  sets   with  a  parallel,  distributed  algorithm  on  a  cluster.   •  Inspired  by  map  and  reduce  func)ons  commonly   found  in  func)onal  programming  languages   •  map()  performs  transla)ons  and  filtering  on  given  values   •  reduce()  performs  summary  opera)on  on  given  values  
  • 3. How  does  it  work?   Found  this  from  the  Internet,  forgot  from  where  
  • 4. The  scene   •  Hadoop  –  open  source  implementa)on  of    Google’s   MapReduce  and  Google  File  System  papers   •  Java…   •  Higher  level  frameworks/plaOorms   –  Hive  ≈  SQL   –  Pig      (procedural  ≈  “more  programming  than  SQL”)   –  Cascading  –  Java  MR  applica)on  framework  for  enterprise  data  flows   •  If  you  must  do  Java,  do  this!   –  Scalding    -­‐  Scala  DSL  for  Cascading,  easy  to  pick  up  yet  very   powerful   –  Cascalog  –  Clojure  DSL  for  Cascading,  declara)ve,  logic   programming  
  • 5. The  scene  (*)   *  Borrowed  from  excellent  presenta)on  by  Vitaly  Gordon  and  Christopher  Severs    
  • 6. “Hadoop  is  a  distributed  system   for  coun)ng  words”   package  org.myorg;         import  java.io.IOException;   import  java.util.*;       import  org.apache.hadoop.fs.Path;   import  org.apache.hadoop.conf.*;   import  org.apache.hadoop.io.*;   import  org.apache.hadoop.mapred.*;   import  org.apache.hadoop.util.*;       public  class  WordCount  {              public  static  class  Map  extends  MapReduceBase  implements   Mapper<LongWritable,  Text,  Text,  IntWritable>  {              private  final  static  IntWritable  one  =  new   IntWritable(1);              private  Text  word  =  new  Text();                  public  void  map(LongWritable  key,  Text  value,   OutputCollector<Text,  IntWritable>  output,  Reporter  reporter)   throws  IOException  {                  String  line  =  value.toString();                  StringTokenizer  tokenizer  =  new  StringTokenizer(line);                  while  (tokenizer.hasMoreTokens())  {                      word.set(tokenizer.nextToken());                      output.collect(word,  one);                  }              }          }                  public  static  class  Reduce  extends  MapReduceBase  implements   Reducer<Text,  IntWritable,  Text,  IntWritable>  {              public  void  reduce(Text  key,  Iterator<IntWritable>  values,   OutputCollector<Text,  IntWritable>  output,  Reporter  reporter)   throws  IOException  {                  int  sum  =  0;                  while  (values.hasNext())  {                      sum  +=  values.next().get();                  }                  output.collect(key,  new  IntWritable(sum));              }          }              public  static  void  main(String[]  args)  throws  Exception  {              JobConf  conf  =  new  JobConf(WordCount.class);              conf.setJobName("wordcount");                  conf.setOutputKeyClass(Text.class);              conf.setOutputValueClass(IntWritable.class);                  
conf.setMapperClass(Map.class);              conf.setCombinerClass(Reduce.class);              conf.setReducerClass(Reduce.class);                  conf.setInputFormat(TextInputFormat.class);              conf.setOutputFormat(TextOutputFormat.class);                  FileInputFormat.setInputPaths(conf,  new  Path(args[0]));              FileOutputFormat.setOutputPath(conf,  new  Path(args[1]));                  JobClient.runJob(conf);          }   }    
  • 7. What  do  we  actually  want  to  do?   Documents   (lines)   Tokenize   GroupBy   (token)   Count   Word   count  
  • 8. Word  Count  in  Scalding   •  asd   package  com.sanoma.cda.examples   import  com.twitter.scalding._       class  WordCount1(args:  Args)  extends  Job(args)  {      TextLine(args("input"))          .flatMap('line  -­‐>  'word)  {  line:  String  =>  line.split("s+")  }          .groupBy('word)  {  _.size  }          .write(Tsv(args("output")))   }   There  is  scald.rb  to  get  you  started  (get  it  from  Github  project)     Building  and  running  a  fat  jar  (for  local,  include  hadoop,  for  cluster  mark  it  “provided”):   > sbt assembly > java -jar target/scala-2.10/scalding_talk-assembly-0.1.jar com.sanoma.cda.examples.WordCount1 --local --input data/11.txt.utf-8 --output wc.txt > hadoop jar job-jars/scalding_talk-assembly-0.1.jar --Dmapred.reduce.tasks=70 com.sanoma.cda.examples.WordCount1 --hdfs --input /data/AliceInWonderland --output /user/Alice_wc   the and to a of she said in it was you I as that Alice …   Alice, Alice. Alice; Alice's Alice: (Alice Alice! Alice,)  1664    1172    780    773    662    596    484    416    401    356    329    301    260    246    226    221    76    54    16    11    7    4    3    2  
  • 9. Word  Count  in  Scalding   •  asd   package  com.sanoma.cda.examples   import  com.twitter.scalding._       class  WordCount2(args:  Args)  extends  Job(args)  {      TextLine(args("input"))          .flatMap('line  -­‐>  'word)  {  line:  String  =>  tokenize(line)  }          .filter('word)  {  word:  String  =>  word  !=  ""  }          .groupBy('word)  {  _.size  }            .groupAll{  _.sortBy(('size,  'word)).reverse  }  //  this  is  just  for  easy  results          .write(Tsv(args("output")))          def  tokenize(text:  String):  Array[String]  =  {          text.toLowerCase.replaceAll("[^a-­‐z0-­‐9s]",  "").split("s+")      }   }   the and to a of it she said you in i alice was that as her with at on all  1804    912    801    684    625    541    538    462    429    428    400    385    358    291    272    248    228    224    204    197  
  • 10. Word  count  in  Scalding   Almost  1-­‐to-­‐1  rela)on  between  the   process  and  the  Scalding  code!     UDFs  directly  in  Scala   And  Java  libraries  can  be  used   Documents   (lines)   Tokenize   package  com.sanoma.cda.examples   import  com.twitter.scalding._       class  WordCount2(args:  Args)  extends  Job(args)  {      TextLine(args("input"))          .flatMap('line  -­‐>  'word)  {  tokenize  }          .groupBy('word)  {  _.size  }          .write(Tsv(args("output")))          def  tokenize(text:  String):  Array[String]  =  {          text.toLowerCase.replaceAll("[^a-­‐z0-­‐9s]",  "").split("s+")      }   }     GroupBy   (token)   Count   Word   count  
  • 11. About  Scalding   •  Started  at  Twitter  –  years  of  production  use   •  Well  tested  and  optimized  by  different  teams,   including  Twitter,  Concurrent  Inc.,  Etsy,  …   •  Has  very  fast  local  mode  (no  need  to  install   Hadoop  locally)   •  Flow  planner  is  designed  to  be  portable  →  in   future,  the  same  jobs  might  run  on  Storm  cluster   for  example   •  Scala…  very  nice  programming  language  –  YMMV   –  Functional  &  object  oriented,  has  REPL  
  • 12. Scalding  Func)ons   •  3  APIs:   –  Fields-­‐based  API  –  easy  to  start  from  here   –  Type-­‐safe  API   –  Matrix  API   •  Field-­‐based  API   –  Map-­‐like  func)ons   •  map,  flatMap,  project,  insert,  filter,  limit…   –  Grouping/reducing  func)ons   •  groupBy,  groupAll   •  .size,  .sum,  .average,  .sizeAveStdev,  .toList,  .max,   sortBy,  .reduce,  .foldLeu,  .pivot,  …   –  Join  Opera)ons   •  joinWithSmaller,  joinWithLarger,  joinWithTiny,  crossWithTiny   •  InnerJoin,  LeuJoin,  RightJoin,  OuterJoin  
  • 13. Scalding  matrix  API   package  com.twitter.scalding.examples   import  com.twitter.scalding._   import  com.twitter.scalding.mathematics.Matrix       /**   *  Loads  a  directed  graph  adjacency  matrix  where  a[i,j]  =  1  if  there  is  an  edge  from  a[i]  to  b[j]   *  and  computes  the  cosine  of  the  angle  between  every  two  pairs  of  vectors   */   class  ComputeCosineJob(args  :  Args)  extends  Job(args)  {      import  Matrix._          val  adjacencyMatrix  =  Tsv(  args("input"),  ('user1,  'user2,  'rel)  )          .read          .toMatrix[Long,Long,Double]('user1,  'user2,  'rel)          //  we  compute  the  L2  normalized  adjacency  graph        val  matL2Norm  =  adjacencyMatrix.rowL2Normalize        //  we  compute  the  innerproduct  of  the  normalized  matrix  with  itself      //  which  is  equivalent  with  computing  cosine:  AA^T  /  ||A||  *  ||A||      val  cosDist  =  matL2Norm  *  matL2Norm.transpose        cosDist.write(Tsv(args("output”)))   }    
  • 14.
  • 15. What  is  a  monoid?   •  Closure   ∀a, b ∈ T : a • b ∈ T •  Associativity   ∀a, b, c ∈ T : (a • b) • c = a • (b • c) •  Identity  element   ∃I ∈ T : ∀a ∈ T : I • a = a • I = a Scala  trait:   trait  Monoid[T]  {        def  zero:  T        def  plus(left:  T,  right:  T):  T   }  
  • 16. Examples  of  monoids   •  Numbers,  String,  list,  set,  map   •  Algorithms:     –  Min,  Max   –  Moments  (count,  mean,  std,  …)   –  Approximate  histograms,  quantiles   –  Approximate  data  structures   •  Bloom  Filter,  CountMinSketch,  HyperLogLog   –  Stochastic  gradient  descent  
  • 17. What’s  the  point?   a0  +  a1  +  a2  +  a3  +  a4  +  a5  +  a6  +  a7     (a0  +  a1)  +  (a2  +  a3)  +  (a4  +  a5)  +  (a6  +  a7)     (  (a0  +  a1)  +  (a2  +  a3)  )  +  (  (a4  +  a5)  +  (a6  +  a7)  )     (  (  (a0  +  a1)  +  (a2  +  a3)  )  +  (  (a4  +  a5)  +  (a6  +  a7)  )  )   →  Parallelism  
  • 18. What’s  the  point?   a0  +  a1  +  a2  +  a3  +  a4  +  a5  +  a6  +  a7                    (a0  +  a1  +  a2  +  a3  +  a4  +  a5  +  a6  +  a7)  +  a8     →  Incremental  aggregation  
  • 19. What’s  the  point?   •  Easily  unit  testable  operations   •  Simple  aggregation  code   →  Better  quality  
  • 20. Word  Count  with  Map  Monoid   package  com.sanoma.cda.examples   import  com.twitter.scalding._   import  com.twitter.algebird.Operators._       class  WordCount3(args:  Args)  extends  Job(args)  {      TextLine(args("input"))          .flatMap('line  -­‐>  'word)  {  tokenize  }          .map('word  -­‐>  'word)  {  w:  String  =>  Map[String,  Int](w  -­‐>  1)  }          .groupAll{  _.sum[Map[String,  Int]]('word)  }            //  We  could  save  the  map  here,  but  if  we  want  similar  output  as  in  previous...          .flatMap('word  -­‐>  ('word,  'size))  {  words:  Map[String,  Int]  =>  words.toList  }          .groupAll{  _.sortBy(('size,  'word)).reverse  }  //  this  is  just  for  easy  results          .write(Tsv(args("output")))          def  tokenize(text:  String):  Array[String]  =  {          text.toLowerCase.replaceAll("[^a-­‐z0-­‐9s]",  "").split("s+").filter(  _  !=  "")      }   }   the and to a of it she said you in i alice was that as her with at on all  1804    912    801    684    625    541    538    462    429    428    400    385    358    291    272    248    228    224    204    197  
  • 21. Top  Words  with  CMS   •  asd   package  com.sanoma.cda.examples   import  com.twitter.scalding._   import  com.twitter.algebird._       class  WordCount5(args:  Args)  extends  Job(args)  {      implicit  def  utf8(s:  String):  Array[Byte]  =  com.twitter.bijection.Injection.utf8(s)      implicit  val  cmsm  =  new  SketchMapMonoid[String,  Long](128,  6,  0,  20)  //  top  20      type  ApproxMap  =  SketchMap[String,  Long]          TextLine(args("input"))          .flatMap('line  -­‐>  'word)  {  tokenize  }          .map('word  -­‐>  'word)  {  w:  String  =>  cmsm.create((w,  1L))  }          .groupAll{  _.sum[ApproxMap]('word)  }          .flatMap('word  -­‐>  ('word,  'size))  {  words:  ApproxMap  =>  words.heavyHitters  }          .write(Tsv(args("output")))          def  tokenize(text:  String):  Array[String]  =  {          text.toLowerCase.replaceAll("[^a-­‐z0-­‐9s]",  "").split("s+").filter(  _  !=  "")      }   }   the and to a of she it said you in i alice at was that her with as not be  1859    972    867    748    711    636    619    579    504    495    456    431    407    394    342    341    338    337    290    286  
  • 22. Start  using  Scalding  now!  :-)   GitHub:   https://github.com/twitter/scalding     Tutorials:   https://github.com/twitter/scalding/tree/develop/tutorial      
  • 23. Thanks!       Hugo  Gävert   Sanoma