简单多线程爬虫+Jsoup分析

本文介绍了如何利用 Java 的多线程技术和 Jsoup 库来抓取并分析 CSDN 首页的所有子网页链接,并展示了具体的运行效果。

使用简单的多线程和 Jsoup 解析,得到 CSDN 首页的所有子网页链接。

运行效果如下图


------------------------------------------------------------------------------------------------------



---------------------------------------------------------------------------------------------------------------------------------




代码如下


import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.http.HttpEntity;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A small breadth-first web crawler that fetches pages with Apache HttpClient,
 * extracts their links with Jsoup and spreads the work over a fixed number of
 * worker threads.
 *
 * <p>All mutable state ({@link #allurlSet}, {@link #notCrawlurlSet},
 * {@link #depth} and {@link #count}) is guarded by the single monitor
 * {@link #signal}; code that touches these fields directly must hold it.
 */
public class WebCrawler {

	/** Every distinct URL discovered so far, in discovery order. */
	ArrayList<String> allurlSet = new ArrayList<String>();
	/** Discovered URLs that have not been handed to a worker yet (FIFO). */
	ArrayList<String> notCrawlurlSet = new ArrayList<String>();
	/** Depth at which each known URL was first discovered; doubles as the "already seen" set. */
	HashMap<String, Integer> depth = new HashMap<String, Integer>();
	/** Pages at this depth are fetched but their links are not followed. */
	int crawDepth = 2;
	/** Number of worker threads started by {@link #begin()}. */
	int threadCount = 10;
	/** Number of workers that are currently idle (looking for or waiting for work). */
	int count = 0;
	public static final Object signal = new Object();// monitor guarding all shared state; also used for wait/notify

	/** Connect / read timeout so that one stalled server cannot hang the whole crawl. */
	private static final int TIMEOUT_MILLIS = 10000;

	/** Worker threads started by {@link #begin()}, kept so that they can be joined. */
	private final ArrayList<Thread> workers = new ArrayList<Thread>();

	/**
	 * Seeds the crawler with the CSDN home page, runs the crawl to completion
	 * and prints the number of discovered URLs and the elapsed time.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		final WebCrawler wc = new WebCrawler();
		wc.addUrl("http://www.csdn.net", 1);
		long start = System.currentTimeMillis();
		System.out.println("**************开始爬虫**************");
		wc.begin();
		// Block until every worker has finished instead of spinning on shared fields.
		wc.awaitWorkers();

		long end = System.currentTimeMillis();
		int total;
		synchronized (signal) {
			total = wc.allurlSet.size();
		}
		System.out.println("总共爬了" + total + "个网页");
		System.out.println("总共耗时" + (end - start) / 1000 + "秒");
		// Returning normally lets the JVM exit with status 0 once all workers are done.
	}

	/** Starts {@link #threadCount} workers; each one crawls URLs until the crawl is complete. */
	private void begin() {
		for (int i = 0; i < threadCount; ++i) {
			Thread worker = new Thread(new Runnable() {
				public void run() {
					String next;
					while ((next = takeUrl()) != null) {
						crawler(next);
					}
				}
			}, "thread-" + i);
			workers.add(worker);
			worker.start();
		}
	}

	/** Waits for every worker started by {@link #begin()} to terminate. */
	private void awaitWorkers() {
		for (Thread worker : workers) {
			try {
				worker.join();
			} catch (InterruptedException e) {
				// Preserve the interrupt for the caller and stop waiting.
				Thread.currentThread().interrupt();
				return;
			}
		}
	}

	/**
	 * Blocks until a URL is available or the crawl is complete.
	 *
	 * <p>The crawl is complete when the queue is empty and every worker is idle:
	 * only a worker that is busy inside {@link #crawler(String)} can add new
	 * URLs, so no further work can appear at that point.
	 *
	 * @return the next URL to crawl, or {@code null} when the calling worker should stop
	 */
	private String takeUrl() {
		synchronized (signal) {
			count++; // this worker is idle until it actually obtains a URL
			boolean announced = false;
			// Loop guards against spurious wake-ups and against another worker taking the URL first.
			while (notCrawlurlSet.isEmpty()) {
				if (count >= threadCount) {
					// Everyone is idle: wake the other waiters so they observe completion too.
					// count is deliberately left at threadCount so they reach the same verdict.
					signal.notifyAll();
					return null;
				}
				if (!announced) {
					System.out.println(Thread.currentThread().getName() + ": 等待");
					announced = true;
				}
				try {
					signal.wait();
				} catch (InterruptedException e) {
					// Stay counted as idle (this worker will never produce work again) and stop.
					Thread.currentThread().interrupt();
					return null;
				}
			}
			count--;
			return notCrawlurlSet.remove(0);
		}
	}

	/**
	 * Fetches one page and, if its depth is below {@link #crawDepth}, queues
	 * every http(s) link found on it at depth + 1.
	 *
	 * <p>Failures are reported on stderr and never propagate, so a bad page
	 * cannot terminate the calling worker thread.
	 *
	 * @param sUrl absolute URL that was previously registered through {@link #addUrl(String, int)}
	 */
	public void crawler(String sUrl) {
		Integer known;
		synchronized (signal) {
			known = depth.get(sUrl);
		}
		if (known == null) {
			// Without a recorded depth we cannot decide whether to follow links.
			System.err.println("未登记的网址,已跳过: " + sUrl);
			return;
		}
		int d = known.intValue();
		try {
			String content = fetch(sUrl);
			System.out.println("爬网页" + sUrl + "成功,深度为" + d + " 是由线程" + Thread.currentThread().getName() + "来爬");

			if (d < crawDepth) {
				// Parse with the page URL as base URI so relative links can be resolved.
				Document doc = Jsoup.parse(content, sUrl);
				Elements anchors = doc.select("a[href]");
				for (Element anchor : anchors) {
					String target = normalizeLink(anchor.absUrl("href"));
					if (target != null) {
						addUrl(target, d + 1);
					}
				}
			}
		} catch (Exception e) {
			// Worker boundary: report and move on to the next URL.
			e.printStackTrace();
		}

	}

	/**
	 * Downloads the body of {@code sUrl} as text, releasing the connection afterwards.
	 *
	 * @param sUrl absolute URL to request
	 * @return the response body, or an empty string when the response carries no entity
	 * @throws IOException if the request fails or times out
	 */
	private String fetch(String sUrl) throws IOException {
		HttpGet get = new HttpGet(sUrl);
		get.setConfig(RequestConfig.custom()
				.setConnectTimeout(TIMEOUT_MILLIS)
				.setConnectionRequestTimeout(TIMEOUT_MILLIS)
				.setSocketTimeout(TIMEOUT_MILLIS)
				.build());
		get.setHeader("User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
		CloseableHttpClient client = HttpClients.createDefault();
		try {
			CloseableHttpResponse response = client.execute(get);
			try {
				HttpEntity entity = response.getEntity();
				// UTF-8 is only the fallback used when the response declares no charset.
				return entity == null ? "" : EntityUtils.toString(entity, "UTF-8");
			} finally {
				response.close();
			}
		} finally {
			client.close();
		}
	}

	/**
	 * Strips the fragment from an absolute link and rejects anything that is not http(s).
	 *
	 * @param absoluteUrl link as returned by {@code Element.absUrl}; may be empty
	 * @return the link without its {@code #fragment}, or {@code null} if it should not be crawled
	 */
	private static String normalizeLink(String absoluteUrl) {
		int hash = absoluteUrl.indexOf('#');
		String link = hash >= 0 ? absoluteUrl.substring(0, hash) : absoluteUrl;
		boolean http = link.regionMatches(true, 0, "http://", 0, 7);
		boolean https = link.regionMatches(true, 0, "https://", 0, 8);
		return (http || https) ? link : null;
	}

	/**
	 * Removes and returns the oldest queued URL without blocking.
	 *
	 * @return the next URL, or {@code null} when the queue is empty
	 */
	public String getAUrl() {
		synchronized (signal) {
			return notCrawlurlSet.isEmpty() ? null : notCrawlurlSet.remove(0);
		}
	}

	/**
	 * Registers a newly discovered URL and wakes idle workers.
	 *
	 * <p>Null, empty and already-known URLs are ignored, so each URL is crawled
	 * at most once and keeps the depth at which it was first discovered.
	 *
	 * @param url absolute URL to queue
	 * @param d   depth of the URL (the seed page is depth 1)
	 */
	public void addUrl(String url, int d) {
		if (url == null || url.isEmpty()) {
			return;
		}
		synchronized (signal) {
			if (depth.containsKey(url)) {
				return;
			}
			depth.put(url, d);
			allurlSet.add(url);
			notCrawlurlSet.add(url);
			// notifyAll rather than notify: the monitor is static and may be shared by several crawlers.
			signal.notifyAll();
		}
	}

}



评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值