Hive 双重 group by + 随机前缀

package com.xxx

import java.util.Random

import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}


/**
 * Word count over a small in-memory data set, written as a two-stage
 * aggregation with a random key prefix ("salting").
 *
 * Stage 1 groups by the salted word (`<prefix>_<word>`), stage 2 strips the
 * prefix and sums the partial counts per original word.
 */
object DataSkewSparkSQLTest {

	/** Number of distinct random prefixes; a prefix is drawn from `0 until PrefixBuckets`. */
	private val PrefixBuckets = 10

	/** Separator placed between the random prefix and the original value. */
	private val PrefixSeparator = "_"

	/**
	 * Builds a local SparkContext, registers the `addPrefix` / `rmPrefix` UDFs,
	 * runs the two-stage GROUP BY query against the temp table `test` and
	 * prints the result with `show()`.
	 *
	 * @param args command-line arguments (not used)
	 */
	def main(args: Array[String]): Unit = {
		val conf = new SparkConf().setAppName("DataSkewSparkSQLTest").setMaster("local[2]")
		val sc = new SparkContext(conf)
		// Stop the context even when the job fails, so the local executor
		// threads and the UI port are always released.
		try {
			val sqlContext = new SQLContext(sc)

			val list = List("hello hello hello you hello me", "hello you me hello you", "hello you me")
			val rowRDD = sc.parallelize(list).map(word => Row(word))
			val schema = StructType(List(
				StructField("word", DataTypes.StringType, true)
			))
			val df = sqlContext.createDataFrame(rowRDD, schema)
			df.registerTempTable("test")

			// NOTE(review): addPrefix is non-deterministic. On Spark versions that
			// offer UserDefinedFunction.asNondeterministic() it should probably be
			// marked as such -- confirm against the Spark version in use.
			sqlContext.udf.register[String, String]("addPrefix", (field: String) => randomPrefixUDF(field))
			sqlContext.udf.register[String, String]("rmPrefix", (field: String) => removePrefixUDF(field))

			// Innermost query: one row per word.
			// p: each word salted with a random prefix.
			// r: partial counts per salted word, with the prefix removed again.
			// Outer query: partial counts summed per original word.
			sqlContext.sql("SELECT " +
								"r.rword, " +
								"sum(r.pcount) " +
							"FROM (SELECT " +
									  "rmPrefix(p.pword) rword, " +
									  "count(p.pword) as pcount " +
					               "FROM (SELECT " +
											"addPrefix(w.word) pword " +
										  "FROM (SELECT " +
													"explode(split(word, ' ')) word " +
												"FROM test" +
												") w " +
										  ") p " +
								   "group by p.pword) r " +
							"group by r.rword").show()
		} finally {
			sc.stop()
		}
	}

	/**
	 * Prepends a random prefix in `[0, PrefixBuckets)` and the separator to `field`.
	 *
	 * @param field value to salt
	 * @return `"<prefix>_<field>"`, e.g. `"7_hello"`
	 */
	def randomPrefixUDF(field: String): String = {
		// ThreadLocalRandom avoids allocating a new generator for every row.
		val prefix = java.util.concurrent.ThreadLocalRandom.current().nextInt(PrefixBuckets)
		s"$prefix$PrefixSeparator$field"
	}

	/**
	 * Removes the prefix added by [[randomPrefixUDF]].
	 *
	 * Only the text up to and including the FIRST separator is dropped, so
	 * values that contain the separator themselves (`"3_foo_bar"` -> `"foo_bar"`)
	 * and empty values (`"3_"` -> `""`) round-trip correctly.
	 *
	 * @param field salted value of the form `"<prefix>_<value>"`
	 * @return the value without its prefix; `field` unchanged if it contains no separator
	 */
	def removePrefixUDF(field: String): String = {
		val separatorAt = field.indexOf(PrefixSeparator)
		if (separatorAt < 0) field
		else field.substring(separatorAt + PrefixSeparator.length)
	}
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值