SlideShare a Scribd company logo
1 of 194
Download to read offline
Apache Spark for 

library developers
William Benton
Erik Erlandson
About Will
The Silex and Isarn libraries
Reusable open-source code that works 

with Spark, factored from internal apps.
We’ve tracked Spark releases since Spark 1.3.0.
See the Silex and Isarn project pages.
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
Basic considerations
Today’s main themes
in your SBT build definition:
Cross-building for Scala
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your shell:
$ sbt +compile
$ sbt "++ 2.11.11" compile
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your SBT build definition:
Cross-building for Scala
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your shell:
$ sbt +compile
$ sbt "++ 2.11.11" compile
$ sbt +compile # or test, package, publish, etc.
$ sbt "++ 2.11.11" compile
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your SBT build definition:
Cross-building for Scala
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your shell:
$ sbt +compile
$ sbt "++ 2.11.11" compile
$ sbt +compile # or test, package, publish, etc.
$ sbt "++ 2.11.11" compile
scalaVersion := "2.11.11"
crossScalaVersions := Seq("2.10.6", "2.11.11")
in your SBT build definition:
Bring-your-own Spark
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
in your SBT build definition:
Bring-your-own Spark
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
in your SBT build definition:
“Bring-your-own Spark”
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"joda-time" % "joda-time" % "2.7",
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"joda-time" % "joda-time" % "2.7",
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
in your SBT build definition:
“Bring-your-own Spark”
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"joda-time" % "joda-time" % "2.7",
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-sql" % "2.3.0" % Provided,
"org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided,
"joda-time" % "joda-time" % "2.7",
"org.scalatest" %% "scalatest" % "2.2.4" % Test)
Taking care with resources
Taking care with resources
Taking care with resources
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
Caching when necessary
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
Caching when necessary
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
Caching when necessary
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
Caching when necessary
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
Caching when necessary
def step(rdd: RDD[_]) = {
val wasUncached = rdd.storageLevel == StorageLevel.NONE
if (wasUncached) { rdd.cache() }
result = trainModel(rdd)
if (wasUncached) { rdd.unpersist() }
nextModel = modelFromState(newState)
var nextModel = initialModel
for (i <- 0 until iterations) {
val current = sc.broadcast(nextModel)
val newState =
nextModel = modelFromState(newState)
var nextModel = initialModel
for (i <- 0 until iterations) {
val current = sc.broadcast(nextModel)
val newState =
nextModel = modelFromState(newState)
var nextModel = initialModel
for (i <- 0 until iterations) {
val current = sc.broadcast(nextModel)
val newState =
Minding the JVM heap
val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
Minding the JVM heap
val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
pointer flags size locks element pointer element pointer
pointer flags size locks 1.0 2.0
pointer flags size locks 3.0 4.0
Minding the JVM heap
val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
pointer flags size locks element pointer element pointer
pointer flags size locks 1.0 2.0
pointer flags size locks 3.0 4.0
32 bytes of data…
Minding the JVM heap
val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
pointer flags size locks element pointer element pointer
pointer flags size locks 1.0 2.0
pointer flags size locks 3.0 4.0
…and 64 bytes
of overhead!
32 bytes of data…
Continuous integration for
Spark libraries and apps
CPU Memory
Writing generic code for
Spark’s parallel collections
The RDD is invariant
T <: U does not imply RDD[T] <: RDD[U]
The RDD is invariant
T <: U does not imply RDD[T] <: RDD[U]
dog animal
T <: U does not imply RDD[T] <: RDD[U]
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
T <: U does not imply RDD[T] <: RDD[U]
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
trait HasUserId { val userid: Int }
case class Transaction(override val userid: Int,
timestamp: Int,
amount: Double)
extends HasUserId {}
def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
val xacts = spark.parallelize(Array(
Transaction(1, 1, 1.0),
Transaction(2, 2, 1.0)))
badKeyByUserId(xacts)
<console>: error: type mismatch;
found : org.apache.spark.rdd.RDD[Transaction]
required: org.apache.spark.rdd.RDD[HasUserId]
Note: Transaction <: HasUserId, but class RDD is invariant in type T.
You may wish to define T as +T instead. (SLS 4.5)
val xacts = spark.parallelize(Array(
Transaction(1, 1, 1.0),
Transaction(2, 2, 1.0)))
badKeyByUserId(xacts)
<console>: error: type mismatch;
found : org.apache.spark.rdd.RDD[Transaction]
required: org.apache.spark.rdd.RDD[HasUserId]
Note: Transaction <: HasUserId, but class RDD is invariant in type T.
You may wish to define T as +T instead. (SLS 4.5)
An example: natural join
An example: natural join
An example: natural join
Ad-hoc natural join
df1.join(df2, df1("a") === df2("a") &&
df1("b") === df2("b") &&
df1("e") === df2("e"))
= {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
introspecting over column names
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
dynamically constructing expressions
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
dynamically constructing expressions
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
dynamically constructing expressions
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
[left.a === right.a, left.b === right.b, …]
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
left.a === right.a && left.b === right.b && …
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
left.a === right.a && left.b === right.b && …
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
dynamically constructing column lists
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
def natjoin(left: DataFrame, right: DataFrame): DataFrame = {
val lcols = left.columns
val rcols = right.columns
val ccols = lcols.toSet intersect rcols.toSet
left.join(right, ccols.map { col => left(col) === right(col) }.reduce(_ && _))
.select(lcols.collect { case c if ccols.contains(c) => left(c) } ++
lcols.collect { case c if !ccols.contains(c) => left(c) } ++
rcols.collect { case c if !ccols.contains(c) => right(c) } : _*)
dynamically constructing column lists
case class DFWithNatJoin(df: DataFrame)
extends NaturalJoining {
def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
object NaturalJoin extends NaturalJoining {
object implicits {
implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
import NaturalJoin.implicits._
case class DFWithNatJoin(df: DataFrame)
extends NaturalJoining {
def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
object NaturalJoin extends NaturalJoining {
object implicits {
implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
import NaturalJoin.implicits._
case class DFWithNatJoin(df: DataFrame)
extends NaturalJoining {
def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
object NaturalJoin extends NaturalJoining {
object implicits {
implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
import NaturalJoin.implicits._
case class DFWithNatJoin(df: DataFrame)
extends NaturalJoining {
def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
object NaturalJoin extends NaturalJoining {
object implicits {
implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
import NaturalJoin.implicits._
case class DFWithNatJoin(df: DataFrame)
extends NaturalJoining {
def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other)
object NaturalJoin extends NaturalJoining {
object implicits {
implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df)
import NaturalJoin.implicits._
User-defined functions
{"a": 1, "b": "wilma", ..., "x": "club"}
{"a": 2, "b": "betty", ..., "x": "diamond"}
{"a": 3, "b": "fred", ..., "x": "heart"}
{"a": 4, "b": "barney", ..., "x": "spade"}
User-defined functions
{"a": 1, "b": "wilma", ..., "x": "club"}
{"a": 2, "b": "betty", ..., "x": "diamond"}
{"a": 3, "b": "fred", ..., "x": "heart"}
{"a": 4, "b": "barney", ..., "x": "spade"}
wilma club
betty diamond
fred heart
barney spade
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
import json
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def selectively_structure(fields):
resultType = StructType([StructField(f, StringType(), nullable=True)
for f in fields])
def impl(js):
try:
d = json.loads(js)
return [str(d.get(f)) for f in fields]
except Exception:
return [None] * len(fields)
return udf(impl, resultType)
extract_bx = selectively_structure(["b", "x"])
structured_df = df.withColumn("result", extract_bx("json"))
Spark’s ML pipelines
Spark’s ML pipelines
Spark’s ML pipelines
Spark’s ML pipelines model.transform(df)
Working with ML pipelines
Working with ML pipelines
Spark’s ML pipelines
Spark’s ML pipelines
Spark’s ML pipelines model.transform(df)
Spark’s ML pipelines model.transform(df)
Basic considerations for reusable Spark code
Generic functions for parallel collections
Extending data frames with custom aggregates
Exposing JVM libraries to Python
Sharing your work with the world
About Erik
User-defined aggregates:
the fundamentals
Three components
Three components
Three components
Three components
Three components
User-defined aggregates:
the implementation
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int)
(implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N])
extends UserDefinedAggregateFunction {
def deterministic: Boolean = false
def inputSchema: StructType =
StructType(StructField("x", dataTpe.tpe) :: Nil)
def bufferSchema: StructType =
StructType(StructField("tdigest", TDigestUDT) :: Nil)
def dataType: DataType = TDigestUDT
Four main functions: initialize
Four main functions: initialize
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
Four main functions: evaluate
Four main functions: evaluate
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
def initialize(buf: MutableAggregationBuffer): Unit = {
buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV))
def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
Four main functions: update
Four main functions: update
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
Four main functions: merge
Four main functions: merge
1 + 2
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
def update(buf: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest +
def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = {
buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++
User-defined aggregates:
User-defined types
User-defined types
package org.apache.spark.isarnproject.sketches.udt
@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
// ....
package org.apache.spark
package org.apache.spark.isarnproject.sketches.udt
@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
// ....
User-defined types
package org.apache.spark.isarnproject.sketches.udt
@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
// ....
package org.apache.spark.isarnproject.sketches.udt
@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
// ....
User-defined types
package org.apache.spark.isarnproject.sketches.udt
@SQLUserDefinedType(udt = classOf[TDigestUDT])
case class TDigestSQL(tdigest: TDigest)
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
// ....
Implementing custom types
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
class TDigestUDT extends UserDefinedType[TDigestSQL] {
def userClass: Class[TDigestSQL] = classOf[TDigestSQL]
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
override def typeName: String = "tdigest"
def sqlType: DataType = StructType(
StructField("delta", DoubleType, false) ::
/* ... */
StructField("clustM", ArrayType(DoubleType, false), false) ::
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest)
private[sketches] def serializeTD(td: TDigest): InternalRow = {
val TDigest(delta, maxDiscrete, nclusters, clusters) = td
val row = new GenericInternalRow(5)
row.setDouble(0, delta)
row.setInt(1, maxDiscrete)
row.setInt(2, nclusters)
val clustX = clusters.keys.toArray
val clustM = clusters.values.toArray
row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX))
row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM))
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td))
private[sketches] def deserializeTD(datum: Any): TDigest =
datum match { case row: InternalRow =>
val delta = row.getDouble(0)
val maxDiscrete = row.getInt(1)
val nclusters = row.getInt(2)
val clustX = row.getArray(3).toDoubleArray()
val clustM = row.getArray(4).toDoubleArray()
val clusters = clustX.zip(clustM)
.foldLeft(TDigestMap.empty) { case (td, e) => td + e }
TDigest(delta, maxDiscrete, nclusters, clusters)
Extending PySpark with
your Scala library
[ ]
[ ]
[ ]
[ ]
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing =
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing =
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing =
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# class to access the active Spark context for Python
from pyspark.context import SparkContext
# gateway to the JVM from py4j
sparkJVM = SparkContext._active_spark_context._jvm
# use the gateway to access JVM objects and classes
thisThing =
# use the gateway to access JVM objects and classes
thisThing =
A Python-friendly wrapper
package org.isarnproject.sketches.udaf
object pythonBindings {
def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
TDigestUDAF[Double](delta, maxDiscrete)
package org.isarnproject.sketches.udaf
object pythonBindings {
def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
TDigestUDAF[Double](delta, maxDiscrete)
package org.isarnproject.sketches.udaf
object pythonBindings {
def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
TDigestUDAF[Double](delta, maxDiscrete)
package org.isarnproject.sketches.udaf
object pythonBindings {
def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) =
TDigestUDAF[Double](delta, maxDiscrete)
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext
# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
sc = SparkContext._active_spark_context
pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext
# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
sc = SparkContext._active_spark_context
pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext
# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
sc = SparkContext._active_spark_context
pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
tdapply apply
from pyspark.sql.column import Column, _to_java_column, _to_seq
from pyspark.context import SparkContext
# one of these for each type parameter Double, Int, Long, etc
def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0):
sc = SparkContext._active_spark_context
pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings
tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply
return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
tdapply(_to_seq(sc, [col], _to_java_column))
class TDigestUDT(UserDefinedType):
def sqlType(cls):
return StructType([
StructField("delta", DoubleType(), False),
StructField("maxDiscrete", IntegerType(), False),
StructField("nclusters", IntegerType(), False),
StructField("clustX", ArrayType(DoubleType(), False), False),
StructField("clustM", ArrayType(DoubleType(), False), False)])
class TDigestUDT(UserDefinedType):
def sqlType(cls):
return StructType([
StructField("delta", DoubleType(), False),
StructField("maxDiscrete", IntegerType(), False),
StructField("nclusters", IntegerType(), False),
StructField("clustX", ArrayType(DoubleType(), False), False),
StructField("clustM", ArrayType(DoubleType(), False), False)])
# ...
class TDigestUDT(UserDefinedType):
def sqlType(cls):
return StructType([
StructField("delta", DoubleType(), False),
StructField("maxDiscrete", IntegerType(), False),
StructField("nclusters", IntegerType(), False),
StructField("clustX", ArrayType(DoubleType(), False), False),
StructField("clustM", ArrayType(DoubleType(), False), False)])
class TDigestUDT(UserDefinedType):
def sqlType(cls):
return StructType([
StructField("delta", DoubleType(), False),
StructField("maxDiscrete", IntegerType(), False),
StructField("nclusters", IntegerType(), False),
StructField("clustX", ArrayType(DoubleType(), False), False),
StructField("clustM", ArrayType(DoubleType(), False), False)])
# ...
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def module(cls):
return "isarnproject.sketches.udt.tdigest"
def scalaUDT(cls):
return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT"
def simpleString(self):
return "tdigest"
class TDigestUDT(UserDefinedType):
# ...
def serialize(self, obj):
return (obj.delta, obj.maxDiscrete, obj.nclusters,
[float(v) for v in obj.clustX], 
[float(v) for v in obj.clustM])
def deserialize(self, datum):
return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
class TDigestUDT(UserDefinedType):
# ...
def serialize(self, obj):
return (obj.delta, obj.maxDiscrete, obj.nclusters,
[float(v) for v in obj.clustX], 
[float(v) for v in obj.clustM])
def deserialize(self, datum):
return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
class TDigestUDT(UserDefinedType):
# ...
def serialize(self, obj):
return (obj.delta, obj.maxDiscrete, obj.nclusters,
[float(v) for v in obj.clustX], 
[float(v) for v in obj.clustM])
def deserialize(self, datum):
return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
class TDigestUDT(UserDefinedType):
# ...
def serialize(self, obj):
return (obj.delta, obj.maxDiscrete, obj.nclusters,
[float(v) for v in obj.clustX], 
[float(v) for v in obj.clustM])
def deserialize(self, datum):
return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
class TDigestUDT extends UserDefinedType[TDigestSQL] {
// ...
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
class TDigestUDT extends UserDefinedType[TDigestSQL] {
// ...
override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT"
Python code in JAR files
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") ->
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") ->
Cross-building for Python
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
lazy val compilePython = taskKey[Unit]("Compile python files")
compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
if (stat != 0) {
throw new IllegalStateException("python compile failed")
(packageBin in Compile) <<=
(packageBin in Compile).dependsOn(compilePython)
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
Using versioned JAR files
$ pyspark --packages 
$ pyspark --packages 
Using versioned JAR files
$ pyspark --packages 
$ pyspark --packages 
Using versioned JAR files
$ pyspark --packages 
$ pyspark --packages 
Show your work: 

publishing results
Developing with git-flow
$ brew install git-flow # macOS
$ dnf install git-flow # Fedora
$ yum install git-flow # CentOS
$ apt-get install git-flow # Debian and friends
(Search the internet for “git flow” to learn more!)
# Set up git-flow in this repository
$ git flow init
# Start work on my-awesome-feature; create
# and switch to a feature branch
$ git flow feature start my-awesome-feature
$ ...
# Finish work on my-awesome-feature; merge
# feature/my-awesome-feature to develop
$ git flow feature finish my-awesome-feature
# Start work on a release branch
$ git flow release start 0.1.0
# Hack and bump version numbers
$ ...
# Finish work on v0.1.0; merge
# release/0.1.0 to develop and master;
# tag v0.1.0
$ git flow release finish 0.1.0
| Maven Central | | Bintray |
|---|---|---|
| not really | easy to set up for library developers | trivial |
| trivial | easy to set up for library users | mostly |
| yes, via sbt | easy to publish | yes, via sbt + plugins |
| yes | easy to resolve artifacts | mostly |
Conclusions and takeaways
#SAISDD6 • @manyangled • @willb

More Related Content

What's hot

What's hot (20)

Apache Spark - Basics of RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Basics of RDD | Big Data Hadoop Spark Tutorial | CloudxLabApache Spark - Basics of RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Apache Spark - Basics of RDD | Big Data Hadoop Spark Tutorial | CloudxLab
Spark Cassandra Connector Dataframes
Spark Cassandra Connector DataframesSpark Cassandra Connector Dataframes
Spark Cassandra Connector Dataframes
Spark cassandra connector.API, Best Practices and Use-Cases
Spark cassandra connector.API, Best Practices and Use-CasesSpark cassandra connector.API, Best Practices and Use-Cases
Spark cassandra connector.API, Best Practices and Use-Cases
Spark and Cassandra 2 Fast 2 Furious
Spark and Cassandra 2 Fast 2 FuriousSpark and Cassandra 2 Fast 2 Furious
Spark and Cassandra 2 Fast 2 Furious
Cassandra and Spark: Optimizing for Data Locality
Cassandra and Spark: Optimizing for Data LocalityCassandra and Spark: Optimizing for Data Locality
Cassandra and Spark: Optimizing for Data Locality
Spark Cassandra Connector: Past, Present, and Future
Spark Cassandra Connector: Past, Present, and FutureSpark Cassandra Connector: Past, Present, and Future
Spark Cassandra Connector: Past, Present, and Future
Riak at The NYC Cloud Computing Meetup Group
Riak at The NYC Cloud Computing Meetup GroupRiak at The NYC Cloud Computing Meetup Group
Riak at The NYC Cloud Computing Meetup Group
Apache Spark RDD 101
Apache Spark RDD 101Apache Spark RDD 101
Apache Spark RDD 101
Apache cassandra and spark. you got the the lighter, let's start the fire
Apache cassandra and spark. you got the the lighter, let's start the fireApache cassandra and spark. you got the the lighter, let's start the fire
Apache cassandra and spark. you got the the lighter, let's start the fire
Django cryptography
Django cryptographyDjango cryptography
Django cryptography
Apache Spark with Scala
Apache Spark with ScalaApache Spark with Scala
Apache Spark with Scala
Escape From Hadoop: Spark One Liners for C* Ops
Escape From Hadoop: Spark One Liners for C* OpsEscape From Hadoop: Spark One Liners for C* Ops
Escape From Hadoop: Spark One Liners for C* Ops
Introduction to Spark Datasets - Functional and relational together at last
Introduction to Spark Datasets - Functional and relational together at lastIntroduction to Spark Datasets - Functional and relational together at last
Introduction to Spark Datasets - Functional and relational together at last
Beyond shuffling - Scala Days Berlin 2016
Beyond shuffling - Scala Days Berlin 2016Beyond shuffling - Scala Days Berlin 2016
Beyond shuffling - Scala Days Berlin 2016
Spark Streaming with Cassandra
Spark Streaming with CassandraSpark Streaming with Cassandra
Spark Streaming with Cassandra
Cassandra and Spark: Optimizing for Data Locality-(Russell Spitzer, DataStax)
Cassandra and Spark: Optimizing for Data Locality-(Russell Spitzer, DataStax)Cassandra and Spark: Optimizing for Data Locality-(Russell Spitzer, DataStax)
Cassandra and Spark: Optimizing for Data Locality-(Russell Spitzer, DataStax)
A deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internalsA deeper-understanding-of-spark-internals
A deeper-understanding-of-spark-internals
An Introduction to time series with Team Apache
An Introduction to time series with Team ApacheAn Introduction to time series with Team Apache
An Introduction to time series with Team Apache
Getting started in Apache Spark and Flink (with Scala) - Part II
Getting started in Apache Spark and Flink (with Scala) - Part IIGetting started in Apache Spark and Flink (with Scala) - Part II
Getting started in Apache Spark and Flink (with Scala) - Part II
Spark cassandra integration, theory and practice
Spark cassandra integration, theory and practiceSpark cassandra integration, theory and practice
Spark cassandra integration, theory and practice

Similar to Apache Spark for Library Developers with Erik Erlandson and William Benton

Apache Spark for Library Developers with William Benton and Erik Erlandson
 Apache Spark for Library Developers with William Benton and Erik Erlandson Apache Spark for Library Developers with William Benton and Erik Erlandson
Apache Spark for Library Developers with William Benton and Erik Erlandson

Similar to Apache Spark for Library Developers with Erik Erlandson and William Benton (20)

Apache Spark for Library Developers with William Benton and Erik Erlandson
 Apache Spark for Library Developers with William Benton and Erik Erlandson Apache Spark for Library Developers with William Benton and Erik Erlandson
Apache Spark for Library Developers with William Benton and Erik Erlandson
[DSC 2016] 系列活動:李泳泉 / 星火燎原 - Spark 機器學習初探
[DSC 2016] 系列活動:李泳泉 / 星火燎原 - Spark 機器學習初探[DSC 2016] 系列活動:李泳泉 / 星火燎原 - Spark 機器學習初探
[DSC 2016] 系列活動:李泳泉 / 星火燎原 - Spark 機器學習初探
Beneath RDD in Apache Spark by Jacek Laskowski
Beneath RDD in Apache Spark by Jacek LaskowskiBeneath RDD in Apache Spark by Jacek Laskowski
Beneath RDD in Apache Spark by Jacek Laskowski
Introduction to Apache Spark
Introduction to Apache SparkIntroduction to Apache Spark
Introduction to Apache Spark
Spark Streaming Programming Techniques You Should Know with Gerard Maas
Spark Streaming Programming Techniques You Should Know with Gerard MaasSpark Streaming Programming Techniques You Should Know with Gerard Maas
Spark Streaming Programming Techniques You Should Know with Gerard Maas
Spark zeppelin-cassandra at synchrotron
Spark zeppelin-cassandra at synchrotronSpark zeppelin-cassandra at synchrotron
Spark zeppelin-cassandra at synchrotron
Apache spark core
Apache spark coreApache spark core
Apache spark core
A really really fast introduction to PySpark - lightning fast cluster computi...
A really really fast introduction to PySpark - lightning fast cluster computi...A really really fast introduction to PySpark - lightning fast cluster computi...
A really really fast introduction to PySpark - lightning fast cluster computi...
Intro to Apache Spark
Intro to Apache SparkIntro to Apache Spark
Intro to Apache Spark
Intro to Apache Spark
Intro to Apache SparkIntro to Apache Spark
Intro to Apache Spark
Spark Programming
Spark ProgrammingSpark Programming
Spark Programming
Introduction to Apache Spark
Introduction to Apache Spark Introduction to Apache Spark
Introduction to Apache Spark
A Tale of Three Apache Spark APIs: RDDs, DataFrames and Datasets by Jules Damji
A Tale of Three Apache Spark APIs: RDDs, DataFrames and Datasets by Jules DamjiA Tale of Three Apache Spark APIs: RDDs, DataFrames and Datasets by Jules Damji
A Tale of Three Apache Spark APIs: RDDs, DataFrames and Datasets by Jules Damji
A Tale of Three Apache Spark APIs: RDDs, DataFrames, and Datasets with Jules ...
A Tale of Three Apache Spark APIs: RDDs, DataFrames, and Datasets with Jules ...A Tale of Three Apache Spark APIs: RDDs, DataFrames, and Datasets with Jules ...
A Tale of Three Apache Spark APIs: RDDs, DataFrames, and Datasets with Jules ...
A Tale of Two APIs: Using Spark Streaming In Production
A Tale of Two APIs: Using Spark Streaming In ProductionA Tale of Two APIs: Using Spark Streaming In Production
A Tale of Two APIs: Using Spark Streaming In Production
Intro to apache spark stand ford
Intro to apache spark stand fordIntro to apache spark stand ford
Intro to apache spark stand ford
Spark Summit EU talk by Ted Malaska
Spark Summit EU talk by Ted MalaskaSpark Summit EU talk by Ted Malaska
Spark Summit EU talk by Ted Malaska
Scala Meetup Hamburg - Spark
Scala Meetup Hamburg - SparkScala Meetup Hamburg - Spark
Scala Meetup Hamburg - Spark
Apache Spark RDDs
Apache Spark RDDsApache Spark RDDs
Apache Spark RDDs
3 Dundee-Spark Overview for C* developers
3 Dundee-Spark Overview for C* developers3 Dundee-Spark Overview for C* developers
3 Dundee-Spark Overview for C* developers

More from Databricks

Democratizing Data Quality Through a Centralized Platform
Democratizing Data Quality Through a Centralized PlatformDemocratizing Data Quality Through a Centralized Platform
Democratizing Data Quality Through a Centralized Platform
Stage Level Scheduling Improving Big Data and AI Integration
Stage Level Scheduling Improving Big Data and AI IntegrationStage Level Scheduling Improving Big Data and AI Integration
Stage Level Scheduling Improving Big Data and AI Integration
Simplify Data Conversion from Spark to TensorFlow and PyTorch
Simplify Data Conversion from Spark to TensorFlow and PyTorchSimplify Data Conversion from Spark to TensorFlow and PyTorch
Simplify Data Conversion from Spark to TensorFlow and PyTorch
Raven: End-to-end Optimization of ML Prediction Queries
Raven: End-to-end Optimization of ML Prediction QueriesRaven: End-to-end Optimization of ML Prediction Queries
Raven: End-to-end Optimization of ML Prediction Queries
Processing Large Datasets for ADAS Applications using Apache Spark
Processing Large Datasets for ADAS Applications using Apache SparkProcessing Large Datasets for ADAS Applications using Apache Spark
Processing Large Datasets for ADAS Applications using Apache Spark

More from Databricks (20)

DW Migration Webinar-March 2022.pptx
DW Migration Webinar-March 2022.pptxDW Migration Webinar-March 2022.pptx
DW Migration Webinar-March 2022.pptx
Data Lakehouse Symposium | Day 1 | Part 1
Data Lakehouse Symposium | Day 1 | Part 1Data Lakehouse Symposium | Day 1 | Part 1
Data Lakehouse Symposium | Day 1 | Part 1
Data Lakehouse Symposium | Day 1 | Part 2
Data Lakehouse Symposium | Day 1 | Part 2Data Lakehouse Symposium | Day 1 | Part 2
Data Lakehouse Symposium | Day 1 | Part 2
Data Lakehouse Symposium | Day 2
Data Lakehouse Symposium | Day 2Data Lakehouse Symposium | Day 2
Data Lakehouse Symposium | Day 2
Data Lakehouse Symposium | Day 4
Data Lakehouse Symposium | Day 4Data Lakehouse Symposium | Day 4
Data Lakehouse Symposium | Day 4
5 Critical Steps to Clean Your Data Swamp When Migrating Off of Hadoop
5 Critical Steps to Clean Your Data Swamp When Migrating Off of Hadoop5 Critical Steps to Clean Your Data Swamp When Migrating Off of Hadoop
5 Critical Steps to Clean Your Data Swamp When Migrating Off of Hadoop
Democratizing Data Quality Through a Centralized Platform
Democratizing Data Quality Through a Centralized PlatformDemocratizing Data Quality Through a Centralized Platform
Democratizing Data Quality Through a Centralized Platform
Learn to Use Databricks for Data Science
Learn to Use Databricks for Data ScienceLearn to Use Databricks for Data Science
Learn to Use Databricks for Data Science
Why APM Is Not the Same As ML Monitoring
Why APM Is Not the Same As ML MonitoringWhy APM Is Not the Same As ML Monitoring
Why APM Is Not the Same As ML Monitoring
The Function, the Context, and the Data—Enabling ML Ops at Stitch Fix
The Function, the Context, and the Data—Enabling ML Ops at Stitch FixThe Function, the Context, and the Data—Enabling ML Ops at Stitch Fix
The Function, the Context, and the Data—Enabling ML Ops at Stitch Fix
Stage Level Scheduling Improving Big Data and AI Integration
Stage Level Scheduling Improving Big Data and AI IntegrationStage Level Scheduling Improving Big Data and AI Integration
Stage Level Scheduling Improving Big Data and AI Integration
Simplify Data Conversion from Spark to TensorFlow and PyTorch
Simplify Data Conversion from Spark to TensorFlow and PyTorchSimplify Data Conversion from Spark to TensorFlow and PyTorch
Simplify Data Conversion from Spark to TensorFlow and PyTorch
Scaling your Data Pipelines with Apache Spark on Kubernetes
Scaling your Data Pipelines with Apache Spark on KubernetesScaling your Data Pipelines with Apache Spark on Kubernetes
Scaling your Data Pipelines with Apache Spark on Kubernetes
Scaling and Unifying SciKit Learn and Apache Spark Pipelines
Scaling and Unifying SciKit Learn and Apache Spark PipelinesScaling and Unifying SciKit Learn and Apache Spark Pipelines
Scaling and Unifying SciKit Learn and Apache Spark Pipelines
Sawtooth Windows for Feature Aggregations
Sawtooth Windows for Feature AggregationsSawtooth Windows for Feature Aggregations
Sawtooth Windows for Feature Aggregations
Redis + Apache Spark = Swiss Army Knife Meets Kitchen Sink
Redis + Apache Spark = Swiss Army Knife Meets Kitchen SinkRedis + Apache Spark = Swiss Army Knife Meets Kitchen Sink
Redis + Apache Spark = Swiss Army Knife Meets Kitchen Sink
Re-imagine Data Monitoring with whylogs and Spark
Re-imagine Data Monitoring with whylogs and SparkRe-imagine Data Monitoring with whylogs and Spark
Re-imagine Data Monitoring with whylogs and Spark
Raven: End-to-end Optimization of ML Prediction Queries
Raven: End-to-end Optimization of ML Prediction QueriesRaven: End-to-end Optimization of ML Prediction Queries
Raven: End-to-end Optimization of ML Prediction Queries
Processing Large Datasets for ADAS Applications using Apache Spark
Processing Large Datasets for ADAS Applications using Apache SparkProcessing Large Datasets for ADAS Applications using Apache Spark
Processing Large Datasets for ADAS Applications using Apache Spark
Massive Data Processing in Adobe Using Delta Lake
Massive Data Processing in Adobe Using Delta LakeMassive Data Processing in Adobe Using Delta Lake
Massive Data Processing in Adobe Using Delta Lake

Recently uploaded

CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
9953056974 Low Rate Call Girls In Saket, Delhi NCR
Schema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdfSchema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdf
Lars Albertsson
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Vip Model Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Vip Model  Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...Vip Model  Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Vip Model Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get CytotecAbortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Riyadh +966572737505 get cytotec

Recently uploaded (20)

Sampling (random) method and Non random.ppt
Sampling (random) method and Non random.pptSampling (random) method and Non random.ppt
Sampling (random) method and Non random.ppt
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Saket (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
VIP Model Call Girls Hinjewadi ( Pune ) Call ON 8005736733 Starting From 5K t...
Halmar dropshipping via API with DroFx
Halmar  dropshipping  via API with DroFxHalmar  dropshipping  via API with DroFx
Halmar dropshipping via API with DroFx
Schema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdfSchema on read is obsolete. Welcome metaprogramming..pdf
Schema on read is obsolete. Welcome metaprogramming..pdf
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Junnasandra Call Girls: 🍓 7737669865 🍓 High Profile Model Escorts | Bangalore...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
Call Girls in Sarai Kale Khan Delhi 💯 Call Us 🔝9205541914 🔝( Delhi) Escorts S...
BabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptxBabyOno dropshipping via API with DroFx.pptx
BabyOno dropshipping via API with DroFx.pptx
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip CallDelhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Delhi Call Girls CP 9711199171 ☎✔👌✔ Whatsapp Hard And Sexy Vip Call
Carero dropshipping via API with DroFx.pptx
Carero dropshipping via API with DroFx.pptxCarero dropshipping via API with DroFx.pptx
Carero dropshipping via API with DroFx.pptx
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 nightCheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Cheap Rate Call girls Sarita Vihar Delhi 9205541914 shot 1500 night
Generative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and MilvusGenerative AI on Enterprise Cloud with NiFi and Milvus
Generative AI on Enterprise Cloud with NiFi and Milvus
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
April 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's AnalysisApril 2024 - Crypto Market Report's Analysis
April 2024 - Crypto Market Report's Analysis
Best VIP Call Girls Noida Sector 39 Call Me: 8448380779
Best VIP Call Girls Noida Sector 39 Call Me: 8448380779Best VIP Call Girls Noida Sector 39 Call Me: 8448380779
Best VIP Call Girls Noida Sector 39 Call Me: 8448380779
Vip Model Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Vip Model  Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...Vip Model  Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Vip Model Call Girls (Delhi) Karol Bagh 9711199171✔️Body to body massage wit...
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get CytotecAbortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Abortion pills in Doha Qatar (+966572737505 ! Get Cytotec
Midocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFxMidocean dropshipping via API with DroFx
Midocean dropshipping via API with DroFx
100-Concepts-of-AI by Anupama Kate .pptx
100-Concepts-of-AI by Anupama Kate .pptx100-Concepts-of-AI by Anupama Kate .pptx
100-Concepts-of-AI by Anupama Kate .pptx
Market Analysis in the 5 Largest Economic Countries in Southeast Asia.pdf
Market Analysis in the 5 Largest Economic Countries in Southeast Asia.pdfMarket Analysis in the 5 Largest Economic Countries in Southeast Asia.pdf
Market Analysis in the 5 Largest Economic Countries in Southeast Asia.pdf

Apache Spark for Library Developers with Erik Erlandson and William Benton

  • 1. Apache Spark for 
 library developers William Benton @willb Erik Erlandson @manyangled
  • 3. #SAISDD6 The Silex and Isarn libraries Reusable open-source code that works 
 with Spark, factored from internal apps. We’ve tracked Spark releases since Spark 1.3.0. See and
  • 4.
  • 5. #SAISDD6 Forecast Basic considerations for reusable Spark code Generic functions for parallel collections Extending data frames with custom aggregates Exposing JVM libraries to Python Sharing your work with the world
  • 17. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 18. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 19. #SAISDD6 in your SBT build definition: Cross-building for Scala scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11") in your shell: $ sbt +compile $ sbt "++ 2.11.11" compile $ sbt +compile # or test, package, publish, etc. $ sbt "++ 2.11.11" compile scalaVersion := "2.11.11" crossScalaVersions := Seq("2.10.6", "2.11.11")
  • 20. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 21. #SAISDD6 in your SBT build definition: Bring-your-own Spark libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 22. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 23. #SAISDD6 in your SBT build definition: “Bring-your-own Spark” libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test) libraryDependencies ++= Seq( "org.apache.spark" %% "spark-core" % "2.3.0" % Provided, "org.apache.spark" %% "spark-sql" % "2.3.0" % Provided, "org.apache.spark" %% "spark-mllib" % "2.3.0" % Provided, "joda-time" % "joda-time" % "2.7", "org.scalatest" %% "scalatest" % "2.2.4" % Test)
  • 27. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache()
  • 28. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache()
  • 29. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } rdd.cache() rdd.unpersist()
  • 30. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result }
  • 31. #SAISDD6 def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result } Caching when necessary def step(rdd: RDD[_]) = { val wasUncached = rdd.storageLevel == StorageLevel.NONE if (wasUncached) { rdd.cache() } result = trainModel(rdd) if (wasUncached) { rdd.unpersist() } result }
  • 32. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 33. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 34. #SAISDD6 nextModel = modelFromState(newState) current.unpersist } var nextModel = initialModel for (int i = 0; i < iterations; i++) { val current = sc.broadcast(nextModel) val newState = current.unpersist sc.broadcast(nextModel)
  • 35. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0))
  • 36. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0
  • 37. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0 32 bytes of data…
  • 38. #SAISDD6 Minding the JVM heap val mat = Array(Array(1.0, 2.0), Array(3.0, 4.0)) class pointer flags size locks element pointer element pointer class pointer flags size locks 1.0 class pointer flags size locks 3.0 4.0 2.0 …and 64 bytes of overhead! 32 bytes of data…
  • 39. Continuous integration for Spark libraries and apps
  • 48. Writing generic code for Spark’s parallel collections
  • 49. #SAISDD6 The RDD is invariant T <: U RDD[T] <: RDD[U]
  • 50. #SAISDD6 The RDD is invariant T <: U RDD[T] <: RDD[U] dog animal
  • 51. #SAISDD6 T <: U RDD[T] <: RDD[U] trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
  • 52. #SAISDD6 T <: U RDD[T] <: RDD[U] trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x)) trait HasUserId { val userid: Int } case class Transaction(override val userid: Int, timestamp: Int, amount: Double) extends HasUserId {} def badKeyByUserId(r: RDD[HasUserId]) = r.map(x => (x.userid, x))
  • 53. #SAISDD6 val xacts = spark.parallelize(Array( Transaction(1, 1, 1.0), Transaction(2, 2, 1.0) )) badKeyByUserId(xacts) <console>: error: type mismatch; found : org.apache.spark.rdd.RDD[Transaction] required: org.apache.spark.rdd.RDD[HasUserId] Note: Transaction <: HasUserID, but class RDD is invariant in type T. You may wish to define T as +T instead. (SLS 4.5) badKeyByUserId(xacts)
  • 54. #SAISDD6 val xacts = spark.parallelize(Array( Transaction(1, 1, 1.0), Transaction(2, 2, 1.0) )) badKeyByUserId(xacts) <console>: error: type mismatch; found : org.apache.spark.rdd.RDD[Transaction] required: org.apache.spark.rdd.RDD[HasUserId] Note: Transaction <: HasUserID, but class RDD is invariant in type T. You may wish to define T as +T instead. (SLS 4.5) badKeyByUserId(xacts)
  • 56. #SAISDD6 An example: natural join A B C D E A EB X Y
  • 57. #SAISDD6 An example: natural join A B C D E A EB X Y
  • 58. #SAISDD6 An example: natural join A B C D E X Y
  • 59. #SAISDD6 Ad-hoc natural join df1.join(df2, df1("a") === df2("a") && df1("b") === df2("b") && df1("e") === df2("e"))
  • 60. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 61. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 62. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } introspecting over column names
  • 63. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 64. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 65. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 66. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing expressions
  • 67. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } [left.a === right.a, left.b === right.b, …]
  • 68. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } left.a === right.a && left.b === right.b && …
  • 69. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } left.a === right.a && left.b === right.b && …
  • 70. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) }
  • 71. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing column lists
  • 72. #SAISDD6 def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } def natjoin(left: DataFrame, right: DataFrame): DataFrame = { val lcols = left.columns val rcols = right.columns val ccols = lcols.toSet intersect rcols.toSet if(ccols.isEmpty) left.limit(0).crossJoin(right.limit(0)) else left .join(right, ccols.map {col => left(col) === right(col) }.reduce(_ && _)) .select(lcols.collect { case c if ccols.contains(c) => left(c) } ++ lcols.collect { case c if !ccols.contains(c) => left(c) } ++ rcols.collect { case c if !ccols.contains(c) => right(c) } : _*) } dynamically constructing column lists
  • 73. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf) case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 74. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf) case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 75. #SAISDD6 case class DFWithNatJoin(df: DataFrame) extends NaturalJoining { def natjoin(other: DataFrame): DataFrame = super.natjoin(df, other) } object NaturalJoin extends NaturalJoining { object implicits { implicit def dfWithNatJoin(df: DataFrame) = DFWithNatJoin(df) } } import NaturalJoin.implicits._ df.natjoin(otherdf)
  • 76. #SAISDD6 User-defined functions {"a": 1, "b": "wilma", ..., "x": "club"} {"a": 2, "b": "betty", ..., "x": "diamond"} {"a": 3, "b": "fred", ..., "x": "heart"} {"a": 4, "b": "barney", ..., "x": "spade"}
  • 77. #SAISDD6 User-defined functions {"a": 1, "b": "wilma", ..., "x": "club"} {"a": 2, "b": "betty", ..., "x": "diamond"} {"a": 3, "b": "fred", ..., "x": "heart"} {"a": 4, "b": "barney", ..., "x": "spade"} wilma club betty diamond fred heart barney spade
  • 78. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 79. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 80. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 81. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 82. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType)
  • 83. #SAISDD6 import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) import json from pyspark.sql.types import * from pyspark.sql.functions import udf def selectively_structure(fields): resultType = StructType([StructField(f, StringType(), nullable=True) for f in fields]) def impl(js): try: d = json.loads(js) return [str(d.get(f)) for f in fields] except: return [None] * len(fields) return udf(impl, resultType) extract_bx = selectively_structure(["b", "x"]) structured_df = df.withColumn("result", extract_bx("json"))
  • 88. #SAISDD6 Working with ML pipelines model.transform(df)
  • 89. #SAISDD6 Working with ML pipelines model.transform(df)
  • 93. #SAISDD6 Spark’s ML pipelines model.transform(df) inputCol epochs seed outputCol
  • 95. #SAISDD6 Forecast Basic considerations for reusable Spark code Generic functions for parallel collections Extending data frames with custom aggregates Exposing JVM libraries to Python Sharing your work with the world
  • 104. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 105. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 106. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 107. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 108. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 109. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 110. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 111. #SAISDD6 case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT case class TDigestUDAF[N](deltaV: Double, maxDiscreteV: Int) (implicit num: Numeric[N], dataTpe: TDigestUDAFDataType[N]) extends UserDefinedAggregateFunction { def deterministic: Boolean = false def inputSchema: StructType = StructType(StructField("x", dataTpe.tpe) :: Nil) def bufferSchema: StructType = StructType(StructField("tdigest", TDigestUDT) :: Nil) def dataType: DataType = TDigestUDT
  • 112. #SAISDD6 Four main functions: initialize initialize
  • 113. #SAISDD6 Four main functions: initialize initialize
  • 114. #SAISDD6 def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0) def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 115. #SAISDD6 def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0) def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 116. #SAISDD6 Four main functions: evaluate evaluate
  • 117. #SAISDD6 Four main functions: evaluate evaluate
  • 118. #SAISDD6 def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0) def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 119. #SAISDD6 def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0) def initialize(buf: MutableAggregationBuffer): Unit = { buf(0) = TDigestSQL(TDigest.empty(deltaV, maxDiscreteV)) } def evaluate(buf: Row): Any = buf.getAs[TDigestSQL](0)
  • 122. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 123. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 124. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 125. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 126. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 127. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 128. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 129. #SAISDD6 Four main functions: merge 1 merge 2
  • 130. #SAISDD6 Four main functions: merge merge 1 + 2
  • 131. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 132. #SAISDD6 def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) } def update(buf: MutableAggregationBuffer, input: Row): Unit = { if (!input.isNullAt(0)) { buf(0) = TDigestSQL(buf.getAs[TDigestSQL](0).tdigest + num.toDouble(input.getAs[N](0))) } } def merge(buf1: MutableAggregationBuffer, buf2: Row): Unit = { buf1(0) = TDigestSQL(buf1.getAs[TDigestSQL](0).tdigest ++ buf2.getAs[TDigestSQL](0).tdigest) }
  • 134. #SAISDD6 User-defined types package org.apache.spark.isarnproject.sketches.udt @SQLUserDefinedType(udt = classOf[TDigestUDT]) case class TDigestSQL(tdigest: TDigest) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] // .... package org.apache.spark
  • 135. #SAISDD6 package org.apache.spark.isarnproject.sketches.udt @SQLUserDefinedType(udt = classOf[TDigestUDT]) case class TDigestSQL(tdigest: TDigest) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] // .... User-defined types package org.apache.spark.isarnproject.sketches.udt @SQLUserDefinedType(udt = classOf[TDigestUDT]) case class TDigestSQL(tdigest: TDigest) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] // ....
  • 136. #SAISDD6 package org.apache.spark.isarnproject.sketches.udt @SQLUserDefinedType(udt = classOf[TDigestUDT]) case class TDigestSQL(tdigest: TDigest) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] // .... User-defined types package org.apache.spark.isarnproject.sketches.udt @SQLUserDefinedType(udt = classOf[TDigestUDT]) case class TDigestSQL(tdigest: TDigest) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] // ....
  • 137. #SAISDD6 Implementing custom types class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil)
  • 138. #SAISDD6 class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil)
  • 139. #SAISDD6 class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil)
  • 140. #SAISDD6 class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil) class TDigestUDT extends UserDefinedType[TDigestSQL] { def userClass: Class[TDigestSQL] = classOf[TDigestSQL] override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" override def typeName: String = "tdigest" def sqlType: DataType = StructType( StructField("delta", DoubleType, false) :: /* ... */ StructField("clustM", ArrayType(DoubleType, false), false) :: Nil)
  • 141. #SAISDD6 def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row } def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row }
  • 142. #SAISDD6 def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row } def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row }
  • 143. #SAISDD6 def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row } def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row }
  • 144. #SAISDD6 def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row } def serialize(tdsql: TDigestSQL): Any = serializeTD(tdsql.tdigest) private[sketches] def serializeTD(td: TDigest): InternalRow = { val TDigest(delta, maxDiscrete, nclusters, clusters) = td val row = new GenericInternalRow(5) row.setDouble(0, delta) row.setInt(1, maxDiscrete) row.setInt(2, nclusters) val clustX = clusters.keys.toArray val clustM = clusters.values.toArray row.update(3, UnsafeArrayData.fromPrimitiveArray(clustX)) row.update(4, UnsafeArrayData.fromPrimitiveArray(clustM)) row }
  • 145. #SAISDD6 def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) } def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) }
  • 146. #SAISDD6 def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) } def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) }
  • 147. #SAISDD6 def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) } def deserialize(td: Any): TDigestSQL = TDigestSQL(deserializeTD(td)) private[sketches] def deserializeTD(datum: Any): TDigest = datum match { case row: InternalRow => val delta = row.getDouble(0) val maxDiscrete = row.getInt(1) val nclusters = row.getInt(2) val clustX = row.getArray(3).toDoubleArray() val clustM = row.getArray(4).toDoubleArray() val clusters = clustX.zip(clustM).foldLeft(TDigestMap.empty) { case (td, e) => td + e } TDigest(delta, maxDiscrete, nclusters, clusters) }
  • 148. Extending PySpark with your Scala library
  • 153. #SAISDD6 # class to access the active Spark context for Python from pyspark.context import SparkContext # gateway to the JVM from py4j sparkJVM = SparkContext._active_spark_context._jvm # use the gateway to access JVM objects and classes thisThing = # class to access the active Spark context for Python from pyspark.context import SparkContext # gateway to the JVM from py4j sparkJVM = SparkContext._active_spark_context._jvm # use the gateway to access JVM objects and classes thisThing =
  • 154. #SAISDD6 # class to access the active Spark context for Python from pyspark.context import SparkContext # gateway to the JVM from py4j sparkJVM = SparkContext._active_spark_context._jvm # use the gateway to access JVM objects and classes thisThing = # gateway to the JVM from py4j sparkJVM = SparkContext._active_spark_context._jvm
  • 155. #SAISDD6 # class to access the active Spark context for Python from pyspark.context import SparkContext # gateway to the JVM from py4j sparkJVM = SparkContext._active_spark_context._jvm # use the gateway to access JVM objects and classes thisThing = # use the gateway to access JVM objects and classes thisThing =
  • 156. #SAISDD6 A Python-friendly wrapper package org.isarnproject.sketches.udaf object pythonBindings { def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) = TDigestUDAF[Double](delta, maxDiscrete) } package org.isarnproject.sketches.udaf object pythonBindings { def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) = TDigestUDAF[Double](delta, maxDiscrete) }
  • 157. #SAISDD6 package org.isarnproject.sketches.udaf object pythonBindings { def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) = TDigestUDAF[Double](delta, maxDiscrete) } tdigestDoubleUDAF
  • 158. #SAISDD6 package org.isarnproject.sketches.udaf object pythonBindings { def tdigestDoubleUDAF(delta: Double, maxDiscrete: Int) = TDigestUDAF[Double](delta, maxDiscrete) } Double
  • 159. #SAISDD6 from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.context import SparkContext # one of these for each type parameter Double, Int, Long, etc def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0): sc = SparkContext._active_spark_context pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply return Column(tdapply(_to_seq(sc, [col], _to_java_column))) from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.context import SparkContext # one of these for each type parameter Double, Int, Long, etc def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0): sc = SparkContext._active_spark_context pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply return Column(tdapply(_to_seq(sc, [col], _to_java_column)))
  • 160. #SAISDD6 from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.context import SparkContext # one of these for each type parameter Double, Int, Long, etc def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0): sc = SparkContext._active_spark_context pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply return Column(tdapply(_to_seq(sc, [col], _to_java_column))) tdapply apply
  • 161. #SAISDD6 from pyspark.sql.column import Column, _to_java_column, _to_seq from pyspark.context import SparkContext # one of these for each type parameter Double, Int, Long, etc def tdigestDoubleUDAF(col, delta=0.5, maxDiscrete=0): sc = SparkContext._active_spark_context pb = sc._jvm.org.isarnproject.sketches.udaf.pythonBindings tdapply = pb.tdigestDoubleUDAF(delta, maxDiscrete).apply return Column(tdapply(_to_seq(sc, [col], _to_java_column))) tdapply(_to_seq(sc, [col], _to_java_column))
  • 162. #SAISDD6 class TDigestUDT(UserDefinedType): @classmethod def sqlType(cls): return StructType([ StructField("delta", DoubleType(), False), StructField("maxDiscrete", IntegerType(), False), StructField("nclusters", IntegerType(), False), StructField("clustX", ArrayType(DoubleType(), False), False), StructField("clustM", ArrayType(DoubleType(), False), False)]) class TDigestUDT(UserDefinedType): @classmethod def sqlType(cls): return StructType([ StructField("delta", DoubleType(), False), StructField("maxDiscrete", IntegerType(), False), StructField("nclusters", IntegerType(), False), StructField("clustX", ArrayType(DoubleType(), False), False), StructField("clustM", ArrayType(DoubleType(), False), False)]) # ...
  • 163. #SAISDD6 class TDigestUDT(UserDefinedType): @classmethod def sqlType(cls): return StructType([ StructField("delta", DoubleType(), False), StructField("maxDiscrete", IntegerType(), False), StructField("nclusters", IntegerType(), False), StructField("clustX", ArrayType(DoubleType(), False), False), StructField("clustM", ArrayType(DoubleType(), False), False)]) class TDigestUDT(UserDefinedType): @classmethod def sqlType(cls): return StructType([ StructField("delta", DoubleType(), False), StructField("maxDiscrete", IntegerType(), False), StructField("nclusters", IntegerType(), False), StructField("clustX", ArrayType(DoubleType(), False), False), StructField("clustM", ArrayType(DoubleType(), False), False)]) # ...
  • 164. #SAISDD6 class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest" class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest"
  • 165. #SAISDD6 class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest" class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest"
  • 166. #SAISDD6 class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest" class TDigestUDT(UserDefinedType): # ... @classmethod def module(cls): return "isarnproject.sketches.udt.tdigest" @classmethod def scalaUDT(cls): return "org.apache.spark.isarnproject.sketches.udt.TDigestUDT" def simpleString(self): return "tdigest"
  • 167. #SAISDD6 class TDigestUDT(UserDefinedType): # ... def serialize(self, obj): return (obj.delta, obj.maxDiscrete, obj.nclusters, [float(v) for v in obj.clustX], [float(v) for v in obj.clustM]) def deserialize(self, datum): return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4]) class TDigestUDT(UserDefinedType): # ... def serialize(self, obj): return (obj.delta, obj.maxDiscrete, obj.nclusters, [float(v) for v in obj.clustX], [float(v) for v in obj.clustM]) def deserialize(self, datum): return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
  • 168. #SAISDD6 class TDigestUDT(UserDefinedType): # ... def serialize(self, obj): return (obj.delta, obj.maxDiscrete, obj.nclusters, [float(v) for v in obj.clustX], [float(v) for v in obj.clustM]) def deserialize(self, datum): return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4]) class TDigestUDT(UserDefinedType): # ... def serialize(self, obj): return (obj.delta, obj.maxDiscrete, obj.nclusters, [float(v) for v in obj.clustX], [float(v) for v in obj.clustM]) def deserialize(self, datum): return TDigest(datum[0], datum[1], datum[2], datum[3], datum[4])
  • 169. #SAISDD6 class TDigestUDT extends UserDefinedType[TDigestSQL] { // ... override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" } class TDigestUDT extends UserDefinedType[TDigestSQL] { // ... override def pyUDT: String = "isarnproject.sketches.udt.tdigest.TDigestUDT" }
  • 170. #SAISDD6 Python code in JAR files mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" ) mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" )
  • 171. #SAISDD6 mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" ) mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" )
  • 172. #SAISDD6 mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" ) mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" )
  • 173. #SAISDD6 mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" ) mappings in (Compile, packageBin) ++= Seq( (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" )
  • 174. #SAISDD6 Cross-building for Python lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython) lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython)
  • 175. #SAISDD6 lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython) lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython)
  • 176. #SAISDD6 lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython) lazy val compilePython = taskKey[Unit]("Compile python files") compilePython := { val s: TaskStreams = streams.value s.log.info("compiling python...") if (stat != 0) { throw new IllegalStateException("python compile failed") } } (packageBin in Compile) <<= (packageBin in Compile).dependsOn(compilePython) val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
  • 177. #SAISDD6 Using versioned JAR files $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7' $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
  • 178. #SAISDD6 Using versioned JAR files $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7' $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
  • 179. #SAISDD6 Using versioned JAR files $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7' $ pyspark --packages 'org.isarnproject:isarn-sketches-spark_2.11:0.3.0-sp2.2-py2.7'
  • 180. Show your work: 
 publishing results
  • 181. #SAISDD6 Developing with git-flow $ brew install git-flow # macOS $ dnf install git-flow # Fedora $ yum install git-flow # CentOS $ apt-get install git-flow # Debian and friends (Search the internet for “git flow” to learn more!)
  • 182. #SAISDD6 # Set up git-flow in this repository $ git flow init # Start work on my-awesome-feature; create # and switch to a feature branch $ git flow feature start my-awesome-feature $ ... # Finish work on my-awesome-feature; merge # feature/my-awesome-feature to develop $ git flow feature finish my-awesome-feature
  • 183. #SAISDD6 # Start work on a release branch $ git flow release start 0.1.0 # Hack and bump version numbers $ ... # Finish work on v0.1.0; merge # release/0.1.0 to develop and master; # tag v0.1.0 $ git flow release finish 0.1.0
  • 184.
  • 185.
  • 186.
  • 187.
  • 188. #SAISDD6 Maven Central vs. Bintray — easy to set up for library developers: Maven Central not really, Bintray trivial; easy to set up for library users: Maven Central trivial, Bintray mostly; easy to publish: Maven Central yes, via sbt, Bintray yes, via sbt + plugins; easy to resolve artifacts: Maven Central yes, Bintray mostly