先写一个简单的爬虫项目
package com.kgc;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupTest {
/**
 * Fetches one Baidu search-result page and prints, for every result title
 * link (CSS selector {@code h3.t a}), its {@code href} and its visible text.
 * An I/O failure while fetching is reported via {@code printStackTrace()}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String searchUrl = "https://www.baidu.com/s?ie=utf-8&f=3&rsv_bp=1&tn=monline_3_dg&wd=jsoup%E8%A7%A3%E6%9E%90html&oq=httpclient4.4.9&rsv_pq=d7f6243e00006886&rsv_t=1c21FPkhF%2BgQg6I4fQ2ZuApWm%2B5jszdGTEjEmVgQAeQV1%2FQcJwcpl1e9fVIk6IexhrHV&rqlang=cn&rsv_enter=1&inputT=5488&rsv_sug3=34&rsv_sug1=35&rsv_sug7=100&rsv_sug2=1&prefixsug=jsoup&rsp=1&rsv_sug4=6912&rsv_sug=1";

    // Download and parse the page first; nothing else can run without it.
    Document resultPage;
    try {
        resultPage = Jsoup.connect(searchUrl).get();
    } catch (IOException fetchError) {
        fetchError.printStackTrace();
        return;
    }

    // Walk the matched anchors by position and print link target + title.
    Elements titleLinks = resultPage.select("h3.t a");
    for (int idx = 0; idx < titleLinks.size(); idx++) {
        Element link = titleLinks.get(idx);
        String target = link.attr("href");
        String title = link.text();
        System.out.println("h3.t a:\n" + target + "\n" + title);
    }
}
}
用多线程爬取智联招聘的信息
package cn.itrip.test;
import java.io.IOException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import cn.kgc.beans.Recruit;
/**
 * Crawls Zhaopin job-search result pages for the keyword "java" on a fixed
 * thread pool and prints the fields extracted from each listing to standard
 * output.
 */
public class JsoupTest {
    /** Number of result pages to crawl. */
    private static final int PAGE_COUNT = 50;
    /** Size of the fixed thread pool, i.e. the maximum number of pages crawled at once. */
    private static final int THREAD_COUNT = 10;

    public static void main(String[] args) {
        zhiLianTest(); // Zhaopin
    }

    /**
     * Zhaopin crawl: submits one task per result page to a fixed-size thread
     * pool (tasks beyond the pool size wait in the queue) and returns without
     * waiting for them. The pool is shut down after submission so that its
     * worker threads end once the queue is drained; without this the idle
     * workers keep the JVM alive forever.
     */
    public static void zhiLianTest() {
        String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=java&isadv=0&sg=172c6d7dcafe4755ad7cec36bd1d3683&p=temp";
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(THREAD_COUNT);
        try {
            for (int pageii = 0; pageii < PAGE_COUNT; pageii++) {
                final int page = pageii + 1;
                // "temp" is the page-number placeholder at the end of the URL.
                final String listUrl = url.replace("temp", String.valueOf(page));
                fixedThreadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        crawlListPage(listUrl, page);
                    }
                });
            }
        } finally {
            // Already-submitted tasks still run; only new submissions are rejected.
            fixedThreadPool.shutdown();
        }
    }

    /**
     * Downloads one result page and processes every listing table on it.
     * A failure on a single listing is reported and skipped so the remaining
     * listings of the page are still processed.
     */
    private static void crawlListPage(String listUrl, int page) {
        Document document;
        try {
            document = Jsoup.connect(listUrl).get();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        Elements elements = document.select("table[class=newlist]");
        // Index 0 is skipped, as in the original loop
        // (presumably the header table -- TODO confirm against the page markup).
        for (int i = 1; i < elements.size(); i++) {
            try {
                printListing(elements.get(i), page, i);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (IllegalArgumentException e) {
                // Jsoup.connect rejects malformed URLs with an unchecked exception.
                e.printStackTrace();
            }
        }
    }

    /**
     * Extracts the fields of one listing, follows its detail and company
     * links when present, and prints everything prefixed with page and row.
     */
    private static void printListing(Element e, int page, int i) throws IOException {
        String zwmc = e.select("td[class=zwmc]").text(); // job title
        String gsmc = e.select("td[class=gsmc]").text(); // company name
        String zwyx = e.select("td[class=zwyx]").text(); // monthly salary
        String gzdd = e.select("td[class=gzdd]").text(); // work location

        // Job detail page: the first matching tab is used as the job description.
        String detailurl = e.select("td[class=zwmc]").select("a").attr("href");
        String jobDescription = "";
        if (!detailurl.isEmpty()) {
            Document detailD = Jsoup.connect(detailurl).get();
            Elements detailEs = detailD.select("div .terminalpage-main .tab-cont-box .tab-inner-cont");
            if (detailEs.size() > 0) {
                jobDescription = detailEs.get(0).text();
            }
        }

        // Company detail page: scan the summary table rows for type and website.
        String companyurl = e.select("td[class=gsmc]").select("a").attr("href");
        String companyType = "";
        String companyUrl = "";
        if (!companyurl.isEmpty()) {
            Document companyD = Jsoup.connect(companyurl).get();
            Elements cElements = companyD.select(".comTinyDes").select("tr");
            for (Element ce : cElements) {
                String td1 = ce.select("td").eq(0).text();
                String td2 = ce.select("td").eq(1).text();
                if (td1.contains("公司性质")) {
                    companyType = td2;
                } else if (td1.contains("公司网站")) {
                    companyUrl = td2;
                }
            }
        }

        // Placeholder for persisting the record; currently only printed.
        String prefix = "page" + page + "i" + i;
        System.out.println(prefix + "title:" + zwmc);
        System.out.println(prefix + "companyname:" + gsmc);
        System.out.println(prefix + "monthsalary:" + zwyx);
        System.out.println(prefix + "companyaddress:" + gzdd);
        System.out.println(prefix + "companyType:" + companyType);
        System.out.println(prefix + "companyUrl:" + companyUrl);
        System.out.println(prefix + "detailurl:" + detailurl);
        System.out.println(prefix + "jobDescription:" + jobDescription);
    }
}
Anti-Spider(反爬虫)
为什么要反爬
反爬虫策略
1.限IP
2.限User Agent
3.组合
User Agent限制
概念

Anti-Anti-Spider(反反爬)策略
1.设置User Agent
2.动态代理IP
概念:爬虫过程中使用不同ip去进行爬虫

Jsoup设置User Agent
1.语法
Jsoup.connect(url).header("User-Agent","Mozilla/5.0 .... ");
2.策略
设置真实的User Agent
Jsoup设置 IP
1.设置IP
2.获取IP方法
(1).免费代理IP库
(2).付费IP代理
http://www.goubanjia.com/buy/dynamic.html
反反爬
代理ip
这个是免费的
package cn.kgc.util;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import cn.kgc.beans.ProxyIp;
/**
 * Hands out proxy addresses scraped from the free xicidaili.com proxy list,
 * advancing to the next entry on every call.
 *
 * <p>NOTE(review): {@link #currentIpIndex} is plain shared static state with
 * no synchronization; confirm whether callers invoke this from several
 * threads.
 */
public class ProxyUtilsFree {
    /** 1-based position of the next proxy to hand out; advanced on every call. */
    public static int currentIpIndex = 1;

    /** Rows assumed per listing page (taken from the original "100 per page" note). */
    private static final int ROWS_PER_PAGE = 100;

    /** Listing URL; {@code pagetemp} is replaced by the 1-based page number. */
    private static final String LIST_URL_TEMPLATE = "http://www.xicidaili.com/nn/pagetemp";

    /**
     * Returns the proxy at {@link #currentIpIndex}.
     *
     * @return the proxy, or {@code null} if none could be obtained
     */
    public static ProxyIp getProxyIp() {
        return getProxyIp(currentIpIndex);
    }

    /**
     * Returns the proxy at the given 1-based position in the listing. The
     * list is long, so successive calls walk through it one entry at a time.
     * When the position is past the rows actually found, the counter is reset
     * and the first entry is served instead.
     *
     * @param index 1-based position; values below 1 are treated as 1
     * @return the proxy, or {@code null} if the page could not be fetched,
     *         contained no usable rows, or the port was not a number
     */
    public static ProxyIp getProxyIp(int index) {
        // Work zero-based so that multiples of ROWS_PER_PAGE map to the last
        // row of a page instead of row -1 of the following page.
        int offset = Math.max(index, 1) - 1;
        int page = offset / ROWS_PER_PAGE + 1;
        int row = offset % ROWS_PER_PAGE;
        String url = LIST_URL_TEMPLATE.replace("pagetemp", String.valueOf(page));
        try {
            Document document = Jsoup.connect(url).get();
            Elements trs = document.select("tr[class=odd]");
            if (row >= trs.size()) {
                // Ran past the available rows: start over from the beginning.
                currentIpIndex = 1;
                if (offset == 0) {
                    // Even the very first row is missing; retrying cannot help.
                    return null;
                }
                return getProxyIp(currentIpIndex);
            }
            Element e = trs.get(row);
            String ip = e.select("td:eq(1)").text();   // second column: IP address
            String port = e.select("td:eq(2)").text(); // third column: port
            ProxyIp pp = new ProxyIp();
            pp.setPort(Integer.parseInt(port.trim()));
            pp.setIp(ip);
            System.out.println(ip + "\t" + port);
            currentIpIndex++;
            return pp;
        } catch (IOException e) {
            e.printStackTrace();
            currentIpIndex++;
        } catch (NumberFormatException e) {
            // Row without a numeric port: skip it on the next call.
            e.printStackTrace();
            currentIpIndex++;
        }
        return null;
    }
}
收费的写一个
package cn.kgc.util;
import cn.kgc.beans.ProxyIp;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import java.io.IOException;
/**
* Created by zezhong.shang on 17-9-19.
*/
/**
 * Obtains a proxy address from the paid goubanjia.com dynamic-proxy endpoint,
 * which is expected to answer with a single {@code ip:port} text.
 *
 * <p>NOTE(review): the endpoint URL embeds what looks like an account token;
 * consider moving it to configuration.
 */
public class ProxyUtils {
    static Logger logger = Logger.getLogger(ProxyUtils.class);

    /** Pause between attempts after a failed request, in milliseconds. */
    private static final long RETRY_DELAY_MS = 2000L;

    /**
     * Requests a proxy, retrying every two seconds until the endpoint returns
     * a well-formed {@code ip:port} answer. Retries are done in a loop rather
     * than by recursion so a long outage cannot overflow the stack.
     *
     * @return the proxy; if the calling thread is interrupted while waiting,
     *         the interrupt flag is restored and an unpopulated
     *         {@code ProxyIp} is returned
     */
    public static ProxyIp getProxyIp() {
        String ipUrl = "http://dynamic.goubanjia.com/dynamic/get/32ea0e44b1497118f5fda8b5e5e22748.html";
        ProxyIp proxyIp = new ProxyIp();
        while (true) {
            try {
                String ipstr = Jsoup.connect(ipUrl).get().text();
                String[] parts = ipstr.trim().split(":");
                if (parts.length == 2) {
                    try {
                        int port = Integer.parseInt(parts[1].trim());
                        proxyIp.setIp(parts[0].trim());
                        proxyIp.setPort(port);
                        return proxyIp;
                    } catch (NumberFormatException nfe) {
                        logger.info("Proxy endpoint returned a non-numeric port: " + ipstr);
                    }
                } else {
                    // Anything other than ip:port is treated as a failed attempt.
                    logger.info("Proxy endpoint returned an unexpected answer: " + ipstr);
                }
            } catch (IOException e) {
                logger.info(e.getCause() + ":获取动态IP异常,暂休2秒", e);
            }
            try {
                Thread.sleep(RETRY_DELAY_MS);
            } catch (InterruptedException e1) {
                Thread.currentThread().interrupt();
                return proxyIp;
            }
        }
    }

    public static void main(String[] args) {
    }
}
DocumentUtil
package cn.kgc.util;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import cn.kgc.beans.ProxyIp;
import cn.kgc.common.Constants;
/**
* Created by zezhong.shang on 17-9-19.
*/
/**
 * Fetches pages through jsoup with a browser user agent and a proxy taken
 * from {@code ProxyUtilsFree}.
 */
public class DocumentUtilFree {
    static Logger logger = Logger.getLogger(DocumentUtilFree.class);

    /** Consecutive failures to obtain any proxy before giving up. */
    private static final int MAX_PROXY_LOOKUP_FAILURES = 5;

    /**
     * Downloads and parses the page at {@code url}.
     *
     * @param url address of the page to crawl
     * @return the parsed document
     * @throws IllegalStateException if no proxy can be obtained
     */
    public static Document getDocument(String url) {
        Connection connection = Jsoup.connect(url);
        // Send a real browser user agent; different browsers send different values.
        connection.userAgent(Constants.UserAgent.Chrome);
        return setProxyIp(connection);
    }

    /**
     * Executes {@code connection} through successive proxies until one of
     * them delivers the page. The retry is a loop rather than recursion, so a
     * long run of dead proxies cannot overflow the stack.
     *
     * @param connection prepared jsoup connection
     * @return the parsed document
     * @throws IllegalStateException if {@code ProxyUtilsFree} fails to supply
     *         a proxy {@value #MAX_PROXY_LOOKUP_FAILURES} times in a row
     */
    public static Document setProxyIp(Connection connection) {
        int lookupFailures = 0;
        while (true) {
            ProxyIp proxyIp = ProxyUtilsFree.getProxyIp();
            if (proxyIp == null) {
                // getProxyIp() returns null when the proxy list is unreadable.
                lookupFailures++;
                if (lookupFailures >= MAX_PROXY_LOOKUP_FAILURES) {
                    throw new IllegalStateException(
                            "No proxy available after " + lookupFailures + " attempts");
                }
                continue;
            }
            lookupFailures = 0;
            try {
                connection.proxy(proxyIp.getIp(), proxyIp.getPort());
                return connection.get();
            } catch (IOException e) {
                // Unusable proxy; move on to the next one.
                logger.debug("Request via proxy " + proxyIp.getIp() + " failed, trying next", e);
            }
        }
    }
}
取Document
// Fetch the listing page via DocumentUtilFree.getDocument instead of calling Jsoup directly.
Document document = DocumentUtilFree.getDocument(listUrl);// previously: Jsoup.connect(listUrl).get();
分词器

2.过滤掉无效词汇
3.统计有效词汇出现概率
算法:(出现次数/总招聘数)
前两步用分词
分词作用
切割:将一段文字进行词语切分
筛选:
无效词过滤
拓展词定义
常见分词器
IKAnalyzer
1.引入依赖
2.配置IKAnalyzer.cfg.xml

3.设置停止词,扩展词
IKController
package cn.kgc.controller;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;
import cn.kgc.beans.Keyword;
import cn.kgc.service.keyword.KeywordService;
import cn.kgc.service.recruit.RecruitService;
import cn.kgc.util.IKUtils;
import cn.kgc.vo.RecruitVo;
/**
 * Controller that runs word segmentation over stored recruitment records and
 * saves every resulting word as a keyword row.
 */
@Controller
@RequestMapping("/ik")
public class IKController {
    @Resource
    private RecruitService rService;
    @Resource
    private KeywordService kservice;

    /**
     * Loads the recruitment records of data type 1, segments each job
     * description and stores one {@code Keyword} per distinct word.
     *
     * @return {@code "success"} when all records were processed,
     *         {@code "fail"} when an exception interrupted processing
     */
    @RequestMapping("/ikDataDeal")
    @ResponseBody
    public String ikDataDeal() {
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("datatype", 1); // data type: 1 = back-end roles, 2 = front-end roles
        try {
            List<RecruitVo> list = rService.getRecruitListByMap(map);
            if (list == null) {
                return "success";
            }
            for (RecruitVo recruit : list) {
                String jobDesc = recruit.getJobDescription();
                if (jobDesc == null || jobDesc.trim().isEmpty()) {
                    // Nothing to segment; a null here must not abort the other records.
                    continue;
                }
                Integer rid = recruit.getId();
                Set<String> words = IKUtils.getKeyWord(jobDesc);
                for (String word : words) {
                    Keyword keyword = new Keyword(); // entity mapped to the keyword table
                    keyword.setRid(rid);
                    keyword.setName(word);     // one segmented word
                    keyword.setStatus(0);      // 0 = not deleted, 1 = deleted
                    keyword.setDataType(1);
                    kservice.itriptxAddKeyword(keyword); // persist
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            // Do not report success when processing was cut short.
            return "fail";
        }
        return "success";
    }
}
IKUtils
package cn.kgc.util;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
* 分词工具类
* @author Administrator
*
*/
/**
 * Word-segmentation helper built on IK Analyzer's {@code IKSegmenter}.
 */
public class IKUtils {
    /**
     * Splits a job description into words and returns the distinct ones.
     *
     * <p>The original note mentions an extension-word dictionary and a
     * stop-word dictionary (words to ignore); presumably these are wired up
     * through the IK Analyzer configuration -- verify against the project's
     * IKAnalyzer.cfg.xml.
     *
     * @param jobDesc text to segment; {@code null} or blank yields an empty set
     * @return the distinct words found (a set, so duplicates collapse)
     * @throws Exception if the segmenter fails while reading the text
     */
    public static Set<String> getKeyWord(String jobDesc) throws Exception {
        Set<String> set = new HashSet<String>();
        if (jobDesc == null) {
            return set;
        }
        String jobstr = jobDesc.trim();
        if (jobstr.length() == 0) {
            return set;
        }
        // Second argument is passed as true, as in the original
        // (IK Analyzer's "smart" segmentation switch -- confirm against its docs).
        IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(jobstr), true);
        Lexeme lexeme;
        // next() returns null once the input is exhausted.
        while ((lexeme = ikSegmenter.next()) != null) {
            String job = lexeme.getLexemeText().trim();
            set.add(job);
        }
        return set;
    }
}
本文介绍了使用Jsoup创建一个简单的网络爬虫,专注于爬取智联招聘的数据。同时,讨论了如何应对网站的反爬虫策略,包括使用代理IP和分词技术。还提到了免费与收费代理IP的使用,并展示了DocumentUtil、IKAnalyzer的配置和IKController、IKUtils的相关应用。

2822

被折叠的 条评论
为什么被折叠?



