









































object _06SparkSQLAndCoreTest {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
val spark = SparkSession.builder()
.appName("_06SparkSQLAndCoreTest")
.master("local[*]")
.getOrCreate()
import spark.implicits._
val lines =
spark.sparkContext.textFile("file:///E:/data/spark/sql/dailykey.txt")
// date name keyword province client searchType
// * 2018-11-13 tom china beijing pc web
//date+keyword+user
val baseRDD:RDD[String] = lines.map(line => {
val fields = line.split("\\s+")
val date = fields(0)
val user = fields(1)
val keyword = fields(2)
s"${date}|${keyword}|${user}"
})
println("========>原始数据经过转化之后的内容")
baseRDD.foreach(println)
println("=========>去重之后的结果=====================")
val distinctRDD:RDD[String] = baseRDD.distinct()
distinctRDD.foreach(println)
println("======>统计每天每个关键字被检索的次数===============")
val keyCount:RDD[(String, Int)] = distinctRDD.map(fhkey => {
val key = fhkey.substring(0, fhkey.lastIndexOf("|"))
(key, 1)
}).reduceByKey(_ + _)
keyCount.foreach(println)
println("=========>每天关键字检索Top3=================")
val finalRDD = keyCount.map { case (key, count) => {
val fields = key.split("\\|")
val date = fields(0)
val keyword = fields(1)
MyRow(date, keyword, count)
}}
val ds:Dataset[MyRow] = finalRDD.toDS()
ds.createOrReplaceTempView("daily_keyword_tmp")
ds.printSchema()
val sql =
"""
|select
| tmp.*
|from (
|select
| `date`,
| keyword,
| `count`,
| row_number() over(partition by `date` order by `count` desc)
rank
|from daily_keyword_tmp
|) tmp
|where tmp.rank < 4
""".stripMargin
spark.sql(sql).show()
spark.stop()
}
}
case class MyRow(date:String, keyword:String, count:Int)
14,SparkSQL以及wordcount的数据倾斜处理
(1)SparkSQL编写wordcount
/**
* 使用SparkSQL来统计wordcount
*/
object _07SparkSQLWordCountOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
val spark = SparkSession.builder()
.appName("_07SparkSQLWordCountOps")
.master("local[*]")
.getOrCreate()
val linesDF =
spark.read.text("file:///E:/data/spark/core/hello.txt").toDF("line")
linesDF.createOrReplaceTempView("test")
linesDF.printSchema()
linesDF.show()
//求wordcount
//step 1、将每一行的数据进行拆分
println(">>>>step 1、将每一行的数据进行拆分")
val flatMapSQL =
"""
|select
| split(line, '\\s+') words
|from test
""".stripMargin
spark.sql(flatMapSQL).show()
//step 2、强一个数组转化为为多行,使用explode函数
println(">>>>step 2、强一个数组转化为为多行,使用explode函数")
val explodeSQL =
"""
|select
| explode(split(line, '\\s+')) word
|from test
""".stripMargin
spark.sql(explodeSQL).show()
//step 3、分组统计
val groupSQL =
"""
|select
| tmp.word,
| count(tmp.word) as countz
|from (
| select
| explode(split(line, '\\s+')) word
| from test
|) tmp
|group by tmp.word
|order by countz desc
""".stripMargin
spark.sql(groupSQL).show()
spark.stop()
}
}
(2)解决groupBy产生的数据倾斜
/**
* 使用SparkSQL来解决group by操作的数据倾斜
* 在SparkCore中,使用两阶段聚合来解决dataskew,
* 局部聚合+全局聚合
* 对key拆分打散(添加随机前缀),在此基础之上做group by的统计---->局部聚合
* 在局部聚合的基础之上,去掉对应的随机前缀,再做group by的统计--->全局聚合
* 双重group-by
*/
object _08SparkSQLWordCountDataSkewOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
val spark = SparkSession.builder()
.appName("_08SparkSQLWordCountDataSkewOps")
.master("local[*]")
.getOrCreate()
val linesDF =
spark.read.text("file:///E:/data/spark/core/hello.txt").toDF("line")
linesDF.createOrReplaceTempView("test")
linesDF.printSchema()
linesDF.show()
//求wordcount
//step 1、将每一行的数据进行拆分
println(">>>>step 1、将每一行的数据进行拆分")
val flatMapSQL =
"""
|select
| split(line, '\\s+') words
|from test
""".stripMargin
spark.sql(flatMapSQL).show()
//step 2、强一个数组转化为为多行,使用explode函数
println(">>>>step 2、强一个数组转化为为多行,使用explode函数")
val explodeSQL =
"""
|select
| explode(split(line, '\\s+')) word
|from test
""".stripMargin
spark.sql(explodeSQL).show()
//对拆分出来的每一个单词添加随机前缀,添加4以内的随机前缀
println("step 3 对拆分出来的每一个单词添加随机前缀,添加4以内的随机前
缀")
val prefixSQL =
"""
|select
| concat_ws("_", cast(floor(rand(10) * 2) as string), t1.word)
as prefix_word
|from (
| select
| explode(split(line, '\\s+')) word
| from test
|) t1
""".stripMargin
spark.sql(prefixSQL).show()
println(">>>>>step 4在添加好前缀之后做局部聚合")
val partAggrSQL =
"""
|select
| t2.prefix_word,
| count(t2.prefix_word) as prefix_count
|from (
| select
| concat_ws("_", cast(floor(rand(10) * 2) as string), t1.word)
as prefix_word
| from (
| select
| explode(split(line, '\\s+')) word
| from test
| ) t1
|) t2
|group by t2.prefix_word
""".stripMargin
spark.sql(partAggrSQL).show()
println(">>>在局部聚合的基础至少去除前缀")
/*
去除前缀的两种方式:
1、select substr("1_baidu", instr("1_baidu", "_") + 1);索引法
2、select split("1_baidu", "_")[1]; 切割法
*/
val removePrefixSQL =
"""
|select
| t2.prefix_word,
| substr(t2.prefix_word, instr(t2.prefix_word, "_") + 1) index_m,
| split(t2.prefix_word, "_")[1] split_m,
| count(t2.prefix_word) as prefix_count
|from (
| select
| concat_ws("_", cast(floor(rand(10) * 2) as string), t1.word)
as prefix_word
| from (
| select
| explode(split(line, '\\s+')) word
| from test
| ) t1
|) t2
|group by t2.prefix_word
""".stripMargin
spark.sql(removePrefixSQL).show()
println(">>>>全局聚合统计")
val fullAggrSQL =
"""
|select
| t3.index_m,
| sum(t3.prefix_count) as countz
|from (
|select
| t2.prefix_word,
| substr(t2.prefix_word, instr(t2.prefix_word, "_") + 1) index_m,
| split(t2.prefix_word, "_")[1] split_m,
| count(t2.prefix_word) as prefix_count
|from (
| select
| concat_ws("_", cast(floor(rand(10) * 2) as string), t1.word)
as prefix_word
| from (
| select
| explode(split(line, '\\s+')) word
| from test
| ) t1
|) t2
|group by t2.prefix_word)
|t3
|group by t3.index_m
""".stripMargin
spark.sql(fullAggrSQL).show()
spark.stop()
}
private def addPrefix(str:String) = {
val random = new Random()
random.nextInt(2) + "_" + str
}
private def removePrefix(str:String) = {
str.substring(str.indexOf("_") + 1)
}
}