24. ! Returning values from a function
◦ val res = rtnFunc1(param1, param2) //a single return value is stored in res
◦ val (res1, res2) = rtnFunc2(param1, param2) //a tuple return value is destructured into res1 and res2
◦ val (_, res2) = rtnFunc2(param1, param2) //use _ to discard a value you do not need
! For Loop
◦ for (i <- collection) { … }
! For Loop with yield (collects each iteration's result into a new collection)
◦ val rtnArr = for (i <- collection) yield { … }
val intArr = Array(1,2,3,4,5,6,7,8,9)
val multiArr =
for (i <- intArr; j <- intArr)
yield { i*j }
//multiArr has 81 elements (9 x 9 combinations of i*j)
val (min, max) = getMinMax(intArr) //destructure the returned tuple
val (_, max) = getMinMax(intArr) //keep only the second value
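getMinMax is used but not defined on the slide; a minimal sketch of a tuple-returning helper, assuming Int arrays:
def getMinMax(arr: Array[Int]): (Int, Int) = {
(arr.min, arr.max) //package two return values into a single tuple
}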
25. ! Tuple
◦ A Tuple groups several values (possibly of different types) into one value
◦ val v=(v1,v2,v3...) and the elements are read with v._1, v._2, v._3…
◦ Tuples can be used inside lambda expressions
◦ In a lambda, (_) can stand in for the single parameter
val intArr = Array(1,2,3,4,5,7,8,9)
val res=getMinMax(intArr) //res=(1,9), a tuple
val min=res._1 //first element of res
val max=res._2 //second element of res
val intArr = Array((1,2,3),(4,5,6),(7,8,9)) //intArr is an Array of Tuples
val intArr2=intArr.map(x=> (x._1 * x._2 * x._3))
//intArr2: Array[Int] = Array(6, 120, 504)
val intArr3=intArr.filter(x=> (x._1 + x._2 > x._3))
//intArr3: Array[(Int, Int, Int)] = Array((4,5,6), (7,8,9))
val intArr = Array((1,2,3),(4,5,6),(7,8,9)) //intArr is an Array of Tuples
def getThird(x:(Int,Int,Int)): Int = { x._3 }
val intArr2=intArr.map(getThird(_))
val intArr2=intArr.map(x=>getThird(x)) //equivalent to the line above
//intArr2: Array[Int] = Array(3, 6, 9)
26. ! Class
◦ A Scala Class is very similar to a Java Class
● constructor parameters are private by default; declaring them val/var makes them public (private / protected / public modifiers are also available)
● a Class is instantiated with new
Scala:
class Person(userID: Int, name: String) //parameters are private by default
class Person(val userID: Int, var name: String)
//val/var makes userID and name public
val person = new Person(102, "John Smith") //instantiate with new
person.userID //returns 102
The equivalent Person class in Java:
public class Person {
private final int userID;
private final String name;
public Person(int userID, String name) {
this.userID = userID;
this.name = name;
}}
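A short illustration of the val/var difference, using the second Person definition above:
val person = new Person(102, "John Smith")
person.name = "Jane Smith" //allowed: var generates a setter
//person.userID = 103 //compile error: val is read-only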
27. ! Object
◦ Scala has no static keyword; every member belongs to an instance
◦ Scala's object fills the role of static members
● a Scala object is a singleton: a class with exactly one instance
! Scala Object vs Class
◦ use an object for utility code and for the Spark Driver Program
◦ use a class for an Entity (data carrier)
Scala Object:
object Utility {
def isNumeric(input: String): Boolean = input.trim()
.matches("[+-]?((\\d+(e\\d+)?[lL]?)|(((\\d+(\\.\\d*)?)|(\\.\\d+))(e\\d+)?[fF]?))")
def toDouble(input: String): Double = {
val rtn = if (input.isEmpty() || !isNumeric(input)) Double.NaN else input.toDouble
rtn
}}
val d = Utility.toDouble("20") //no new needed; call members directly on the object
29. ! sum: add all elements
◦ val sum = Array(1,2,3,4,5,7,8,9).sum
! max: find the largest element
◦ val max = Array(1,2,3,4,5,7,8,9).max
! min: find the smallest element
◦ val min = Array(1,2,3,4,5,7,8,9).min
! distinct: remove duplicate elements
◦ val disc = Array(1,1,1,2,2,2,3,3).distinct
val intArr = Array(1,2,3,4,5,7,8,9)
val sum = intArr.sum
//sum = 45
val max = intArr.max
//max = 9
val min = intArr.min
//min = 1
val disc = Array(1,1,1,2,2,2,3,3).distinct
//disc = Array(1,2,3)
79. K-Means in Spark MLlib
! KMeans.train trains and returns a Model (KMeansModel)
◦ val model=KMeans.train(data, numClusters, maxIterations, runs)
● data: the input dataset (RDD[Vector])
● numClusters: the number of clusters (K)
● maxIterations: the maximum number of iterations per run; a run stops and returns its model once it reaches maxIterations
● runs: how many times to run the KMeans algorithm; since KMeans is not guaranteed to find the globally optimal clustering, the best model across runs is returned
! model.clusterCenters returns the center (a Feature vector) of each cluster
! model.computeCost computes the WCSS, used to evaluate the model
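A minimal end-to-end sketch of the calls above. The input path, K, and iteration count are assumptions for illustration; the three-argument overload of KMeans.train is used here because the runs parameter is deprecated in recent Spark versions:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
val sc = new SparkContext(new SparkConf().setAppName("KMeansSketch").setMaster("local[*]"))
//parse each line "x1,x2,..." into a dense feature Vector
val data = sc.textFile("data/points.csv")
.map(line => Vectors.dense(line.split(",").map(_.toDouble)))
.cache()
val model = KMeans.train(data, 3, 20) //K=3 clusters, at most 20 iterations
model.clusterCenters.foreach(println) //one center Vector per cluster
val wcss = model.computeCost(data) //WCSS: lower means tighter clusters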
102. //import spark rdd library
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
//import decision tree library
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
//import log4j to configure logging
import org.apache.log4j.{Level, Logger}
object BikeShareClassificationDT {
def main(args: Array[String]): Unit = {
Logger.getLogger("com").setLevel(Level.OFF) //set logger
//initialize SparkContext
val sc = new SparkContext(new SparkConf().setAppName("BikeClassificationDT").setMaster("local[*]"))
}
}
! Import the Decision Tree libraries
! The Driver Program first creates the SparkContext sc
◦ appName - the application name of this Driver Program
◦ master - the master URL; local[*] runs locally using all cores
103. Building the Model
! Define the data class (Class)
◦ BikeSummary
! prepare method
◦ reads the input file, extracts the Features and the Label, and converts each record into a LabeledPoint, producing an RDD[LabeledPoint]
◦ splits the RDD[LabeledPoint] into training and validation sets
! getFeatures method
◦ extracts the features used to train the Model
! getCategoryInfo method
◦ builds the categoryInfoMap describing the categorical features
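A minimal sketch of what getFeatures and getCategoryInfo might look like. The BikeShareEntity field names and category counts below are assumptions based on the public hour.csv schema, not the actual course code:
//assumed: BikeShareEntity exposes the hour.csv columns as Double fields
def getFeatures(x: BikeShareEntity): Array[Double] =
Array(x.season, x.mnth, x.hr, x.holiday, x.weekday,
x.workingday, x.weathersit, x.temp, x.hum, x.windspeed)
//feature index -> number of categories, as required by DecisionTree's
//categoricalFeaturesInfo parameter (continuous features are omitted)
def getCategoryInfo(): Map[Int, Int] =
Map(0 -> 5, 1 -> 13, 2 -> 24, 3 -> 2, 4 -> 7, 5 -> 2, 6 -> 5)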
104. The prepare method
def prepare(sc: SparkContext): (RDD[LabeledPoint], RDD[LabeledPoint])= {
val rawData=sc.textFile("data/hour.csv") //read hour.csv in data folder
val rawDataNoHead=rawData.mapPartitionsWithIndex { (idx, iter) =>
{ if (idx == 0) iter.drop(1) else iter } } //ignore first row(column name)
val lines:RDD[Array[String]] = rawDataNoHead.map { x =>
x.split(",").map { x => x.trim() } } //split columns with comma
val bikeData = lines.map{ x => BikeShareEntity(⋯) }//RDD[BikeShareEntity]
val lpData=bikeData.map { x => {
val label = if (x.cnt > 200) 1 else 0 //1 if cnt is greater than 200, else 0
val features = Vectors.dense(getFeatures(x))
new LabeledPoint(label, features) //a LabeledPoint is a label plus a feature Vector
}}
//randomly split the data 6:4 into training and validation sets
val Array(trainData, validateData) = lpData.randomSplit(Array(0.6, 0.4))
(trainData, validateData)
}
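With prepare in place, the classifier can be trained roughly as below; the impurity, maxDepth, and maxBins values are illustrative assumptions:
val (trainData, validateData) = prepare(sc)
val model = DecisionTree.trainClassifier(
trainData,
2, //numClasses: the label is 0 or 1
getCategoryInfo(), //categoricalFeaturesInfo
"entropy", //impurity measure for classification
10, //maxDepth
50) //maxBins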
139. //import spark rdd library
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
//import decision tree library
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
//import log4j to configure logging
import org.apache.log4j.{Level, Logger}
object BikeShareRegressionDT {
def main(args: Array[String]): Unit = {
Logger.getLogger("com").setLevel(Level.OFF) //set logger
//initialize SparkContext
val sc = new SparkContext(new SparkConf().setAppName("BikeRegressionDT").setMaster("local[*]"))
}
}
! Import the Decision Tree libraries
! The Driver Program first creates the SparkContext sc
◦ appName - the application name of this Driver Program
◦ master - the master URL; local[*] runs locally using all cores
140. Building the Model
! Define the data class (Class)
◦ BikeSummary
! prepare method
◦ reads the input file, extracts the Features and the Label, and converts each record into a LabeledPoint, producing an RDD[LabeledPoint]
◦ splits the RDD[LabeledPoint] into training and validation sets
! getFeatures method
◦ extracts the features used to train the Model
! getCategoryInfo method
◦ builds the categoryInfoMap describing the categorical features
141. The prepare method
def prepare(sc: SparkContext): (RDD[LabeledPoint], RDD[LabeledPoint])= {
val rawData=sc.textFile("data/hour.csv") //read hour.csv in data folder
val rawDataNoHead=rawData.mapPartitionsWithIndex { (idx, iter) =>
{ if (idx == 0) iter.drop(1) else iter } } //ignore first row(column name)
val lines:RDD[Array[String]] = rawDataNoHead.map { x =>
x.split(",").map { x => x.trim() } } //split columns with comma
val bikeData = lines.map{ x => BikeShareEntity(⋯) }//RDD[BikeShareEntity]
val lpData=bikeData.map { x => {
val label = x.cnt //the prediction target is the rental-count column
val features = Vectors.dense(getFeatures(x))
new LabeledPoint(label, features) //a LabeledPoint is a label plus a feature Vector
}}
//randomly split the data 6:4 into training and validation sets
val Array(trainData, validateData) = lpData.randomSplit(Array(0.6, 0.4))
(trainData, validateData)
}
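For regression the model is trained with DecisionTree.trainRegressor instead; "variance" is the only impurity supported for regression, and the depth/bins values are illustrative assumptions:
val (trainData, validateData) = prepare(sc)
val model = DecisionTree.trainRegressor(
trainData,
getCategoryInfo(), //categoricalFeaturesInfo
"variance", //the only impurity for regression trees
10, //maxDepth
50) //maxBins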
173. 1) Convert the raw Data Features into LabeledPoint Data
◦ Feature: the View/Order data from 6~9
◦ Label: whether an Order occurs in 10 (1 if yes, otherwise 0)
◦ Feature: …
2) Randomly split the LabeledPoint Data into a Training Set and a Validating Set (6:4 Split)
3) Train the Machine Learning Model with the Training Set and the Validating Set
4) Build the Testing Set
◦ Feature: the View/Order data from 6~10
◦ Features: built the same way as in 1)
5) Apply the Model from 3) to the Testing Set
6)
7) Repeat steps 1) ~ 6)