最近新入职了一家公司,这家公司主要以开发大数据平台为主,主流语言是 Java,涉及到的组件有很多:Kafka、PostgreSQL(pg 数据库)、HBase、Hudi、Atlas、Flink、Spark、Hive、Hadoop、MongoDB、Presto、DolphinScheduler、Alluxio 等等。以前很少用 Java 去写 Spark 代码,刚上手时感觉十分懵,所以以后准备以 Java 语言方向的大数据开发为主进行更新。这里先从基础开始,整理了 Java、Java Lambda、Scala 三种方式实现的入门版 Spark WordCount。
来上代码:
Java版:
package com.liuze;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/*
@Author ze.liu
*/
/**
 * Word count over a text file using the Spark Java API with anonymous
 * function classes (no lambdas).
 *
 * Reads D:\spark.txt, splits every line on whitespace, counts each word and
 * prints the (word, count) pairs on the driver, ordered by word in
 * descending lexicographic order.
 */
public class SortWordCountJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("myCount").setMaster("local");
        // JavaSparkContext is Closeable: try-with-resources stops the context
        // even when one of the jobs below throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            /*
             * Sample content of the input file:
             *   spark hadoop oracle
             *   hive spark oracle
             *   oracle spark hbase
             */
            JavaRDD<String> lines = sc.textFile("D:\\spark.txt");

            // Split each line into words. Splitting on runs of whitespace and
            // dropping empty tokens keeps blank lines and repeated/leading
            // spaces from being counted as an empty-string "word".
            JavaRDD<String> wordRdd = lines.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String line) throws Exception {
                    List<String> words = new java.util.ArrayList<>();
                    for (String token : line.split("\\s+")) {
                        if (!token.isEmpty()) {
                            words.add(token);
                        }
                    }
                    return words.iterator();
                }
            });

            // Tag every word with an initial count: word -> (word, 1).
            JavaPairRDD<String, Integer> wordKeyValueRdd = wordRdd.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String word) throws Exception {
                    return new Tuple2<>(word, 1);
                }
            });

            // Sum the counts of identical words.
            JavaPairRDD<String, Integer> resultRDD = wordKeyValueRdd.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer left, Integer right) throws Exception {
                    return left + right;
                }
            });

            /*
             * Sort by word (the key) in descending order, then collect to the
             * driver so the lines are printed in sorted order.
             * For the sample input above this prints:
             *   单词spark的数量->3
             *   单词oracle的数量->3
             *   单词hive的数量->1
             *   单词hbase的数量->1
             *   单词hadoop的数量->1
             */
            List<Tuple2<String, Integer>> sorted = resultRDD.sortByKey(false).collect();
            for (Tuple2<String, Integer> result : sorted) {
                System.out.println("单词" + result._1 + "的数量->" + result._2);
            }
        }
    }
}
JavaLambda版
package com.liuze;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.lang.reflect.Array;
import java.util.Arrays;
/**
 * Word count over a text file using the Spark Java API with lambdas.
 *
 * Reads D:\spark.txt, splits every line on whitespace, counts each word and
 * prints the (word, count) pairs on the driver, ordered by word in
 * descending lexicographic order.
 */
public class SortWordLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("myCount").setMaster("local");
        // try-with-resources closes (stops) the context even if a job fails.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> lines = sc.textFile("D:\\spark.txt");

            // Split on runs of whitespace and drop empty tokens so blank lines
            // and repeated/leading spaces are not counted as a "" word.
            JavaRDD<String> wordRdd = lines
                    .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator())
                    .filter(word -> !word.isEmpty());

            JavaPairRDD<String, Integer> wordPairRDD = wordRdd.mapToPair(word -> new Tuple2<>(word, 1));
            JavaPairRDD<String, Integer> wordCountRdd = wordPairRDD.reduceByKey((a, b) -> a + b);

            // RDD.foreach runs on the executors, partition by partition, so it
            // neither guarantees the sorted order across partitions nor prints
            // on the driver when run on a cluster. Collect first, then print.
            for (Tuple2<String, Integer> result : wordCountRdd.sortByKey(false).collect()) {
                System.out.println("单词" + result._1 + "的数量->" + result._2);
            }
        }
    }
}
Scala版
package com.liuze
import org.apache.spark.sql.SparkSession
/** Word count over a text file, printed in descending order of frequency.
  *
  * Reads D:\spark.txt, splits every line on whitespace, counts each word and
  * prints one line per word on the driver, most frequent word first.
  */
object SortWordScala {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SortWordCount").master("local").getOrCreate()
    try {
      val lines = spark.sparkContext.textFile("D:\\spark.txt")

      // Split on runs of whitespace and drop empty tokens so blank lines and
      // repeated/leading spaces are not counted as a "" word.
      val wordCounts = lines
        .flatMap(_.split("\\s+"))
        .filter(_.nonEmpty)
        .map(word => (word, 1))
        .reduceByKey(_ + _)

      // Sort by the count directly instead of swapping the tuple, calling
      // sortByKey and swapping back.
      val sortedWordCount = wordCounts.sortBy(_._2, ascending = false)

      // RDD.foreach runs on the executors, so it does not guarantee the sorted
      // order across partitions and does not print on the driver when run on a
      // cluster. Collect the result first, then print it.
      sortedWordCount.collect().foreach { case (word, count) =>
        println(s"""word "$word" appears $count times.""")
      }
    } finally {
      // Always release the session, even when a job above fails.
      spark.stop()
    }
  }
}
本文介绍了使用Java、Java Lambda及Scala实现Spark WordCount的过程。通过读取文本文件,利用flatMap和mapToPair等操作实现单词计数,并展示了如何对结果进行排序。

1883

被折叠的 条评论
为什么被折叠?



