package com.xxx
import java.util.Random
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Word-count demo that performs the aggregation in two stages using a random
 * key prefix ("salting"):
 *
 *  1. every word is given a random numeric prefix (`addPrefix`) and counted per
 *     prefixed word, so the rows of a single word are split over several
 *     grouping keys;
 *  2. the prefix is stripped again (`rmPrefix`) and the partial counts are
 *     summed per original word.
 */
object DataSkewSparkSQLTest {

  /** Exclusive upper bound of the random prefix; prefixes are drawn from `0` until this value. */
  private val PrefixBuckets = 10

  /** Separator placed between the random prefix and the original value. */
  private val PrefixSeparator = "_"

  /**
   * Builds a local two-thread SparkContext, registers the two UDFs, runs the
   * two-stage aggregation over a small in-memory data set and prints the
   * result with `show()`.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DataSkewSparkSQLTest").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // Stop the context even when the query fails, so it is never leaked.
    try {
      val sqlContext = new SQLContext(sc)

      val lines = List("hello hello hello you hello me", "hello you me hello you", "hello you me")
      // Each input line becomes a single-column row; the SQL below splits it into words.
      val rowRDD = sc.parallelize(lines).map(line => Row(line))
      val schema = StructType(List(
        StructField("word", DataTypes.StringType, true)
      ))

      val df = sqlContext.createDataFrame(rowRDD, schema)
      // NOTE(review): registerTempTable is deprecated from Spark 2.0 on in favour of
      // createOrReplaceTempView; kept because the Spark version of this project is not visible here.
      df.registerTempTable("test")

      sqlContext.udf.register[String, String]("addPrefix", randomPrefixUDF _)
      sqlContext.udf.register[String, String]("rmPrefix", removePrefixUDF _)

      // w: one row per word; p: word with random prefix;
      // r: partial count per prefixed word; outer query: final count per word.
      sqlContext.sql(
        """SELECT
          |  r.rword,
          |  sum(r.pcount)
          |FROM (
          |  SELECT
          |    rmPrefix(p.pword) rword,
          |    count(p.pword) as pcount
          |  FROM (
          |    SELECT addPrefix(w.word) pword
          |    FROM (
          |      SELECT explode(split(word, ' ')) word
          |      FROM test
          |    ) w
          |  ) p
          |  group by p.pword
          |) r
          |group by r.rword""".stripMargin
      ).show()
    } finally {
      sc.stop()
    }
  }

  /**
   * Prepends a random prefix to `field`, producing `"<n>_<field>"` with
   * `0 <= n < 10`.
   *
   * @param field value to prefix; `null` is passed through unchanged so that a
   *              SQL NULL stays NULL instead of becoming the text `"<n>_null"`
   * @return the prefixed value, or `null` when `field` is `null`
   */
  def randomPrefixUDF(field: String): String =
    if (field == null) null
    else {
      // ThreadLocalRandom avoids allocating a new generator for every row.
      val prefix = java.util.concurrent.ThreadLocalRandom.current().nextInt(PrefixBuckets)
      s"$prefix$PrefixSeparator$field"
    }

  /**
   * Removes the prefix added by [[randomPrefixUDF]]: everything up to and
   * including the first `_` is dropped.
   *
   * Only the first separator is considered, so values that themselves contain
   * `_` (`"3_a_b"` gives `"a_b"`) and empty values (`"3_"` gives `""`) are
   * restored correctly. A value without any separator is returned unchanged.
   *
   * @param field prefixed value; `null` is passed through unchanged
   * @return the value without its prefix, or `null` when `field` is `null`
   */
  def removePrefixUDF(field: String): String =
    if (field == null) null
    // indexOf yields -1 when there is no separator, so substring(0) keeps the whole value.
    else field.substring(field.indexOf(PrefixSeparator) + 1)
}
hive双重groupby 随机前缀
最新推荐文章于 2026-04-02 02:29:07 发布

1545

被折叠的 条评论
为什么被折叠?



