本文主要是讲解spark中DataFrame 和SparkSQL的综合使用,以join操作为例。示例代码都是使用java和scala语言编写的。
java版本
package com.dt.sparkql.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
 * Demonstrates combining DataFrames and Spark SQL to join two data sets.
 *
 * <p>The program loads people/score records from a JSON file, selects the people
 * whose score is above 80, looks up their ages in a second, in-memory JSON data
 * set, joins both results on the name and prints the combined
 * (name, score, age) table.
 */
public class SparkSqlWithJoin {

    /**
     * Entry point: creates the local Spark context, runs the demo and always
     * stops the context afterwards.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Runtime configuration of the application; the "local" master runs
        // everything inside this JVM without a cluster.
        SparkConf conf = new SparkConf();
        conf.setAppName("Rdd to DataFrame");
        conf.setMaster("local");

        // The Spark context is the entry point to all Spark functionality.
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            run(sc);
        } finally {
            // Release the context even when a query fails.
            sc.stop();
        }
    }

    /**
     * Runs the score/age join demo on the given context.
     *
     * @param sc the active Spark context
     */
    private static void run(JavaSparkContext sc) {
        // SQL context used to parse and execute SQL statements.
        SQLContext sqlContext = new SQLContext(sc);

        // Build a DataFrame from the JSON file and expose it as a temp table.
        DataFrame peopleScoresDf = sqlContext.read().json("F://testData//people.json");
        peopleScoresDf.registerTempTable("peopleScores");

        // Name and score of everyone scoring above 80.
        DataFrame excellentScoresName =
                sqlContext.sql("select name,score from peopleScores where score>80");
        excellentScoresName.show();

        // Convert the DataFrame to an RDD and collect just the names.
        List<String> excellentNames = excellentScoresName.javaRDD().map(new Function<Row, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(Row row) throws Exception {
                return row.getAs("name");
            }
        }).collect();

        // Assemble the second data set as JSON strings.
        List<String> peopleInformations = new ArrayList<String>();
        peopleInformations.add("{\"name\":\"Michael\",\"age\":23}");
        peopleInformations.add("{\"name\":\"Andy\",\"age\":25}");
        peopleInformations.add("{\"name\":\"Justin\",\"age\":21}");

        // Build a DataFrame from an RDD of JSON strings and register it.
        JavaRDD<String> peoplesRdd = sc.parallelize(peopleInformations);
        DataFrame peopleInfoDf = sqlContext.read().json(peoplesRdd);
        peopleInfoDf.show();
        peopleInfoDf.registerTempTable("peopleInfo");

        // An empty name list would produce "name in ()", which is not a valid
        // query, so there is nothing to look up or join in that case.
        if (excellentNames.isEmpty()) {
            System.out.println("No one scored above 80; nothing to join.");
            return;
        }

        DataFrame excellentNameAge = sqlContext.sql(buildNameAgeQuery(excellentNames));
        excellentNameAge.show();

        // Key both result sets by name and join them: name -> (score, age).
        JavaPairRDD<String, Tuple2<Long, Long>> resultRDD =
                excellentScoresName.javaRDD().mapToPair(new PairFunction<Row, String, Long>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Long> call(Row row) throws Exception {
                        return new Tuple2<String, Long>(row.getAs("name"), row.getAs("score"));
                    }
                }).join(excellentNameAge.javaRDD().mapToPair(new PairFunction<Row, String, Long>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Long> call(Row row) throws Exception {
                        return new Tuple2<String, Long>(row.getAs("name"), row.getAs("age"));
                    }
                }));

        // Flatten each joined pair into a (name, score, age) row.
        JavaRDD<Row> resultRowRDD =
                resultRDD.map(new Function<Tuple2<String, Tuple2<Long, Long>>, Row>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(Tuple2<String, Tuple2<Long, Long>> tuple) throws Exception {
                        return RowFactory.create(tuple._1(), tuple._2()._1(), tuple._2()._2());
                    }
                });

        // Schema matching the rows created above.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.LongType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.LongType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame peoplesDf = sqlContext.createDataFrame(resultRowRDD, structType);
        peoplesDf.show();
        // To persist the result instead of printing it:
        // peoplesDf.write().format("json").save("F://testData//ScoreAge");
    }

    /**
     * Builds the query selecting name and age from the peopleInfo table for the
     * given names.
     *
     * <p>NOTE(review): the names are placed into the SQL text verbatim. A name
     * containing a single quote would break the statement; this is acceptable
     * for the demo data only.
     *
     * @param names non-empty list of names to look up
     * @return the SQL text with a quoted, comma-separated IN list
     */
    private static String buildNameAgeQuery(List<String> names) {
        StringBuilder sql = new StringBuilder("select name,age from peopleInfo where name in (");
        for (int i = 0; i < names.size(); i++) {
            if (i > 0) {
                sql.append(",");
            }
            sql.append("'").append(names.get(i)).append("'");
        }
        sql.append(")");
        return sql.toString();
    }
}
jdk版本:1.8
scala 版本:2.10.4
spark版本:spark-1.6.1-bin-hadoop2.6



F://testData//people.json文件内容

输出结果:
1.由于是本地模式所以会出现这个异常,但程序是正常执行的
2. //查询出name score 分数大于80的人
DataFrame execellentScoresName=sqlContext.sql("select name,score from peopleScores where score>80"); execellentScoresName.show();

3.自己定义的DataFrame 内容
// Build a DataFrame from an RDD whose elements are JSON strings.
JavaRDD<String> peoplesRdd= sc.parallelize(peopleInformations);
DataFrame peopleInfoDf=sqlContext.read().json(peoplesRdd);
// Print the contents of the resulting DataFrame.
peopleInfoDf.show();

4.输出成绩大于80分的学生姓名和年龄
// Assemble a query over the peopleInfo table whose IN list holds every
// collected name, each wrapped in single quotes and separated by commas.
// NOTE(review): if personsRdd is empty the text ends in "in ()" — presumably
// rejected by the SQL parser; confirm and guard if needed.
String sqls="select name,age from peopleInfo where name in (";
for(int i=0;i<personsRdd.size();i++){
sqls+="'"+personsRdd.get(i)+"'";
// No separator after the last name.
if(i<personsRdd.size()-1){
sqls+=",";
}
}
sqls+=")";
//System.out.println(sqls);
// Execute the assembled query and print the name/age rows it returns.
DataFrame execllentsNameAge=sqlContext.sql(sqls);
execllentsNameAge.show();

5.执行join操作,将学生姓名 年龄 成绩输出
// Key the score rows and the age rows by name, then join the two pair RDDs,
// giving name -> (score, age).
JavaPairRDD<String,Tuple2<Long,Long>> resultRDD=
execellentScoresName.javaRDD().mapToPair(new PairFunction<Row, String, Long>() {
private static final long serialVersionUID=1l;
@Override
public Tuple2<String, Long> call(Row row) throws Exception {
// (name, score) pair for one score row.
return new Tuple2<String,Long>(row.getAs("name"),row.getAs("score"));
}
}).join(execllentsNameAge.javaRDD().mapToPair(new PairFunction<Row, String, Long>() {
private static final long serialVersionUID=1l;
@Override
public Tuple2<String, Long> call(Row row) throws Exception {
// (name, age) pair for one age row.
return new Tuple2<String,Long>(row.getAs("name"),row.getAs("age"));
}
}));
// Flatten each joined entry into a Row of (name, score, age).
JavaRDD<Row> resultRowRDD=resultRDD.map(new Function<Tuple2<String, Tuple2<Long, Long>>, Row>() {
@Override
public Row call(Tuple2<String, Tuple2<Long, Long>> tuple) throws Exception {
return RowFactory.create(tuple._1,tuple._2._1,tuple._2._2);
}
});
// Describe the row layout: a string name followed by long score and long age,
// all declared nullable.
List<StructField> structFields=new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("name",DataTypes.StringType,true));
structFields.add(DataTypes.createStructField("score",DataTypes.LongType,true));
structFields.add(DataTypes.createStructField("age",DataTypes.LongType,true));
StructType structType=DataTypes.createStructType(structFields);
// Combine the rows with the schema into a DataFrame and print it.
DataFrame peoplesDf=sqlContext.createDataFrame(resultRowRDD,structType);
peoplesDf.show();

scala 版本
package com.dt.sparkql.scala
import org.apache.spark.{SparkConf, SparkContext, rdd}
import org.apache.spark.sql.SQLContext
/** Demonstrates combining DataFrames and Spark SQL to join two data sets.
  *
  * Loads people/score records from a JSON file, selects the people scoring
  * above 80, looks up their ages in a second in-memory JSON data set and
  * prints the two results joined on the `name` column.
  */
object SparkSqlWithJoinScala {

  /** Entry point: runs the demo on a local Spark context and always stops it.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("sparkSql with join")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sc)

      // Score data read from the JSON file, exposed as a temp table.
      val personSourceDF = sqlContext.read.json("F://testData//people.json")
      personSourceDF.show()
      personSourceDF.registerTempTable("personScources")

      // Name and score of everyone scoring above 80.
      val excellentStudentDF =
        sqlContext.sql("select name,score from personScources where score>80")

      // Column 0 of each row is the name (first column of the select above).
      val excellentStudents = excellentStudentDF.rdd.map(row => row(0)).collect

      // Age data built from in-memory JSON strings.
      val peopleInformations = Array(
        "{\"name\":\"Michael\",\"age\":23}",
        "{\"name\":\"Andy\",\"age\":25}",
        "{\"name\":\"Justin\",\"age\":21}"
      )
      val peopleInformationRDD = sc.parallelize(peopleInformations)
      val peopleInformationDF = sqlContext.read.json(peopleInformationRDD)
      peopleInformationDF.registerTempTable("peopleInfo")

      // Quote every name and separate them with commas. An empty array yields
      // '' so the IN list is never empty.
      // NOTE(review): names are inserted verbatim; a name containing a single
      // quote would break the statement.
      val quotedNames = excellentStudents.mkString("'", "','", "'")
      val sqlText = s"select name,age from peopleInfo where name in ($quotedNames)"
      print(sqlText)

      val excellentNameAge = sqlContext.sql(sqlText)
      excellentNameAge.show()

      // Join scores and ages on the shared "name" column and print the result.
      excellentStudentDF.join(excellentNameAge, "name").show()
    } finally {
      // Release the context even when a query fails.
      sc.stop()
    }
  }
}
其输出同上。
本文详细介绍了如何在Spark中使用DataFrame和SparkSQL进行JOIN操作,以Java和Scala语言为例,展示了查询分数大于80的学生姓名,以及如何执行JOIN操作整合学生姓名、年龄和成绩的数据。


被折叠的 条评论
为什么被折叠?



