1.引入的依赖(pom.xml)
<!-- Project dependencies: the only artifact declared here is Apache Storm core, version 1.2.2. -->
<dependencies>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.2.2</version>
</dependency>
</dependencies>
<!-- Build settings: configures maven-compiler-plugin 3.7.0 so sources are compiled at Java 1.8 level. -->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
<configuration>
<!-- Both the accepted source level and the generated bytecode level are 1.8. -->
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
数据的准备(输入文件 E:/hadoop/wordcount/input/a.txt 的内容)
Hello you
hello me
hello you
hello him
2.WordCountTopologie
package com.zpark.stu;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.AlreadyAliveException;
import org.apache.storm.generated.AuthorizationException;
import org.apache.storm.generated.InvalidTopologyException;
import org.apache.storm.topology.TopologyBuilder;
/**
* wordcount的驱动类,用来提交任务的。
*/
public class WordCountTopologie {
/**
* 需求:对数据源进行数据的切割与统计每个单词出现的个数
* 1.需要一个读取数据的Spout
* 2.将数据交由SplitBolt进行单词的切割
* 3.对切割后的数据进行字数的统计 WordCount
*/
public static void main(String[] args) throws InvalidTopologyException, AuthorizationException, AlreadyAliveException, InterruptedException {
//拿到一个TOPO
TopologyBuilder topologyBuilder = new TopologyBuilder();
//创建一个Spout
topologyBuilder.setSpout("readSpout",new ReadFileSpout(),1);
//创建一个Bolt进行单词的切割
topologyBuilder.setBolt("splitBolt",new SplitBolt(),1).shuffleGrouping("readSpout");
//创建一个Bolt对切割好的单词进行字数统计
topologyBuilder.setBolt("wordCountBolt",new WordCountBolt(),1).shuffleGrouping("splitBolt");
Config config = new Config();
config.setNumWorkers(1);
if (args != null && args.length > 0){
System.out.println("非本地运行");
config.setNumWorkers(1);
StormSubmitter.submitTopologyWithProgressBar(args[0],config ,topologyBuilder.createTopology());
}else {
System.out.println("本地-运行");
config.setMaxTaskParallelism(1);
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology("wordCount",config,topologyBuilder.createTopology());
Thread.sleep(10*60*1000);
localCluster.shutdown();
}
}
}
3.ReadFileSpout
package com.zpark.stu;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichSpout;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import java.io.*;
import java.util.Arrays;
import java.util.Map;
/**
* 读取外部的文件,将一行一行的数据发送给下游的bolt
* 类似于hadoop MapReduce中的inputformat
*/
public class ReadFileSpout extends BaseRichSpout{
private BufferedReader reader;
private SpoutOutputCollector collector;
@Override
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
//读取数据
try {
reader = new BufferedReader(new FileReader(new File("E:/hadoop/wordcount/input/a.txt")));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
//需要将数据提交给Bolt,如果不需要的情况,可以不写
this.collector = spoutOutputCollector;
}
@Override
public void nextTuple() {
//将读取到的数据提交给SplitBolt
try {
String lines = reader.readLine();
if (lines != null){
collector.emit(Arrays.asList(lines));
}
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("readSpout"));
}
}
4.SplitBolt
package com.zpark.stu;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import java.util.Arrays;
import java.util.Map;
/**
 * Bolt that splits a line into words.
 *
 * <p>Input: one tuple with a {@code readSpout} field holding a line of text.
 * Output: one tuple per word with fields {@code word} and {@code num}, where
 * {@code num} is always the string {@code "1"}.
 */
public class SplitBolt extends BaseRichBolt {

    private OutputCollector collector;

    /** Keeps the collector so {@link #execute(Tuple)} can emit and ack. */
    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
    }

    /**
     * Splits the incoming line on whitespace and emits each non-empty word.
     *
     * <p>The line is trimmed and split on runs of whitespace so that leading,
     * trailing or repeated separators do not produce empty "words" that would
     * otherwise be counted downstream.
     *
     * @param tuple input tuple; its {@code readSpout} field (the name declared by
     *              ReadFileSpout) carries the line
     */
    @Override
    public void execute(Tuple tuple) {
        String line = tuple.getStringByField("readSpout");
        if (line != null) {
            for (String word : line.trim().split("\\s+")) {
                if (!word.isEmpty()) {
                    // Anchor to the input tuple so the output belongs to its tuple tree.
                    collector.emit(tuple, Arrays.<Object>asList(word, "1"));
                }
            }
        }
        // BaseRichBolt does not ack automatically; acknowledge once the line is processed.
        collector.ack(tuple);
    }

    /** Declares the two output fields: {@code word} and {@code num}. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields("word", "num"));
    }
}
5.WordCountBolt
package com.zpark.stu;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;
import java.util.HashMap;
import java.util.Map;
/**
 * Terminal bolt that keeps a running count per word and prints the counts.
 * Plays a role similar to the reduce function in Hadoop MapReduce.
 *
 * <p>Input: tuples with fields {@code word} and {@code num} (a decimal string).
 * Output: none; the whole count map is printed to standard output after every tuple.
 */
public class WordCountBolt extends BaseRichBolt {

    /** Running totals, keyed by word. */
    HashMap<String, Integer> wordCountMap = new HashMap<String, Integer>();

    private OutputCollector collector;

    /**
     * Keeps the collector. This bolt emits nothing, but the collector is still
     * needed to acknowledge processed tuples.
     */
    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
    }

    /**
     * Adds the tuple's {@code num} to the running total of its {@code word},
     * prints the updated map, then acknowledges the tuple.
     *
     * @param tuple input tuple with {@code word} and {@code num} string fields
     * @throws NumberFormatException if {@code num} is not a parsable integer
     */
    @Override
    public void execute(Tuple tuple) {
        String word = tuple.getStringByField("word");
        String num = tuple.getStringByField("num");

        // merge() stores the value for a new word and sums it into an existing total.
        wordCountMap.merge(word, Integer.parseInt(num), Integer::sum);
        System.out.println(wordCountMap);

        // BaseRichBolt does not ack automatically.
        collector.ack(tuple);
    }

    /** Declares no output fields because this is the last bolt in the topology. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }
}
6.输出结果
{Hello=1}
{Hello=1, you=1}
{Hello=1, hello=1, you=1}
{Hello=1, me=1, hello=1, you=1}
{Hello=1, me=1, hello=2, you=1}
{Hello=1, me=1, hello=2, you=2}
{Hello=1, me=1, hello=3, you=2}
{Hello=1, me=1, hello=3, him=1, you=2}
本文详细介绍使用Apache Storm实现WordCount实时计算的过程,包括依赖引入、数据准备、WordCount拓扑设计、Spout与Bolt组件开发,以及最终的输出结果展示。

1654

被折叠的 条评论
为什么被折叠?



