1. 下载数据,并写入hdfs中
miaofu@master:~$ hadoop fs -ls /user/miaofu/covtype
-rw-r--r-- 2 miaofu supergroup 75169317 2016-09-17 23:20 /user/miaofu/covtype
2. 启动spark集群
miaofu@master:~/spark-1.6.2-bin-hadoop2.6$ jps
6649 ResourceManager
10821 Worker
2434 NameNode
2680 DataNode
2938 SecondaryNameNode
31714 SparkSubmit
10705 Master
32000 Jps
6786 NodeManager
3. 进入spark shell
miaofu@master:~/spark-1.6.2-bin-hadoop2.6$ bin/spark-shell --master spark://master:7077
16/09/19 13:19:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.6.2
/_/
Using Scala version 2.10.5 (OpenJDK 64-Bit Server VM, Java 1.7.0_95)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
16/09/19 13:19:30 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
16/09/19 13:19:30 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
16/09/19 13:19:37 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
16/09/19 13:19:37 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
16/09/19 13:19:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
16/09/19 13:19:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
SQL context available as sqlContext.
4. 读入数据,并简单分析
scala> val rawData = sc.textFile("hdfs:////user/miaofu/covtype")
rawData: org.apache.spark.rdd.RDD[String] = hdfs:////user/miaofu/covtype MapPartitionsRDD[1] at textFile at <console>:27
scala> rawData.counts()
<console>:30: error: value counts is not a member of org.apache.spark.rdd.RDD[String]
rawData.counts()
^
scala> rawData.count()
res1: Long = 581012
scala> var line = rawData.take(4)(1)
line: String = 2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
scala> line
res3: String = 2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
scala> val values = line.split(",").map(_.toDouble)
values: Array[Double] = Array(2590.0, 56.0, 2.0, 212.0, -6.0, 390.0, 220.0, 235.0, 151.0, 6225.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0)
scala> values
res4: Array[Double] = Array(2590.0, 56.0, 2.0, 212.0, -6.0, 390.0, 220.0, 235.0, 151.0, 6225.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0)
scala> values.init
res5: Array[Double] = Array(2590.0, 56.0, 2.0, 212.0, -6.0, 390.0, 220.0, 235.0, 151.0, 6225.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
scala> values.last
res6: Double = 5.0
5. 构建训练,测试,验证集
scala> import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg._
scala> import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.regression._
scala> val data1 = rawData.map{ line =>
| line.split(",").map(_.toDouble)
| }
data1: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[4] at map at <console>:37
scala> data1
res16: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[4] at map at <console>:37
scala> val data2 = data1.map{ v=>
| LabeledPoint(v.last-1,Vectors.dense(v.init))
| }
data2: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[5] at map at <console>:39
scala> val Array(trainData,cvData,testData) =
| data2.randomSplit(Array(0.8,0.1,0.1))
trainData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[9] at randomSplit at <console>:63
cvData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[10] at randomSplit at <console>:63
testData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[11] at randomSplit at <console>:63
scala> trainData.cache()
res27: trainData.type = MapPartitionsRDD[9] at randomSplit at <console>:63
scala> cvData.cache()
res28: cvData.type = MapPartitionsRDD[10] at randomSplit at <console>:63
scala> testData.cache()
res29: testData.type = MapPartitionsRDD[11] at randomSplit at <console>:63
scala> import org.apache.spark.mllib.evaluation._
import org.apache.spark.mllib.evaluation._
scala> import org.apache.spark.mllib.tree._
import org.apache.spark.mllib.tree._
scala> import org.apache.spark.mllib.tree.model._
import org.apache.spark.mllib.tree.model._
scala> import org.apache.spark.rdd._
import org.apache.spark.rdd._
scala> def getMetrics(model:DecisionTreeModel,data:RDD[LabeledPoint]):
| MulticlassMetrics = {
| val predictionsAndLabels = data.map( e =>
| (model.predict(e.features),e.label)
| )
| new MulticlassMetrics(predictionsAndLabels)
| }
getMetrics: (model: org.apache.spark.mllib.tree.model.DecisionTreeModel, data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint])org.apache.spark.mllib.evaluation.MulticlassMetrics
7. 模型训练与测试
scala> val model = DecisionTree.trainClassifier(
| trainData,7,Map[Int,Int](),"gini",4,100 )
model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 4 with 31 nodes
scala> model
res30: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel classifier of depth 4 with 31 nodes
scala> val metrics = getMetrics(model,cvData)
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@2c181731
scala> metrics.confusionMatrix
def confusionMatrix: org.apache.spark.mllib.linalg.Matrix
scala> metrics.confusionMatrix
def confusionMatrix: org.apache.spark.mllib.linalg.Matrix
scala> metrics.confusionMatrix
res31: org.apache.spark.mllib.linalg.Matrix =
14260.0 6593.0 7.0 0.0 0.0 0.0 340.0
5485.0 22277.0 483.0 20.0 3.0 0.0 38.0
0.0 443.0 3042.0 82.0 0.0 0.0 0.0
0.0 0.0 169.0 104.0 0.0 0.0 0.0
0.0 864.0 27.0 0.0 14.0 0.0 0.0
0.0 440.0 1168.0 100.0 0.0 0.0 0.0
1101.0 26.0 0.0 0.0 0.0 0.0 927.0
scala> metrics.precision
res32: Double = 0.7002568389843655
scala> metrics.
asInstanceOf confusionMatrix fMeasure falsePositiveRate isInstanceOf labels
precision recall toString truePositiveRate weightedFMeasure weightedFalsePositiveRate
weightedPrecision weightedRecall weightedTruePositiveRate
scala> metrics.
asInstanceOf confusionMatrix fMeasure falsePositiveRate isInstanceOf labels
precision recall toString truePositiveRate weightedFMeasure weightedFalsePositiveRate
weightedPrecision weightedRecall weightedTruePositiveRate
scala> metrics.
asInstanceOf confusionMatrix fMeasure falsePositiveRate isInstanceOf labels
precision recall toString truePositiveRate weightedFMeasure weightedFalsePositiveRate
weightedPrecision weightedRecall weightedTruePositiveRate
scala> metrics.
asInstanceOf confusionMatrix fMeasure falsePositiveRate isInstanceOf labels
precision recall toString truePositiveRate weightedFMeasure weightedFalsePositiveRate
weightedPrecision weightedRecall weightedTruePositiveRate
scala> metrics.recall
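res33: Double = 0.7002568389843655
(注:不带参数的 metrics.precision 与 metrics.recall 都是整体指标,在多分类问题中二者都等于准确率,即混淆矩阵对角线之和除以样本总数:40624 / 58013 ≈ 0.7003,因此两个结果相同。)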
8. web UI
本文介绍了使用Apache Spark进行数据处理及决策树模型训练的过程。包括数据下载与导入、Spark集群启动、数据读取与分析、训练集划分、模型训练与评估等关键步骤。

3689

被折叠的 条评论
为什么被折叠?



