获取页面的TITLE、XML代码、文本
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlBody;
import java.util.List;
public class helloHtmlUnit{
public static void main(String[] args) throws Exception{
String str;
//创建一个webclient
WebClient webClient = new WebClient();
//htmlunit 对css和javascript的支持不好,所以请关闭之
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
//获取页面
HtmlPage page = webClient.getPage("http://www.baidu.com/");
//获取页面的TITLE
str = page.getTitleText();
System.out.println(str);
//获取页面的XML代码
str = page.asXml();
System.out.println(str);
//获取页面的文本
str = page.asText();
System.out.println(str);
//关闭webclient
webClient.closeAllWindows();
}
}使用不同版本的浏览器打开
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.WebClientOptions;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlBody;
import java.util.List;
public class helloHtmlUnit{
public static void main(String[] args) throws Exception{
String str;
//使用FireFox读取网页
WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
//htmlunit 对css和javascript的支持不好,所以请关闭之
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
HtmlPage page = webClient.getPage("http://www.baidu.com/");
str = page.getTitleText();
System.out.println(str);
//关闭webclient
webClient.closeAllWindows();
}
}找到页面中特定的元素
public class helloHtmlUnit{
public static void main(String[] args) throws Exception{
//创建webclient
WebClient webClient = new WebClient(BrowserVersion.CHROME);
//htmlunit 对css和javascript的支持不好,所以请关闭之
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
HtmlPage page = (HtmlPage)webClient.getPage("http://www.baidu.com/");
//通过id获得"百度一下"按钮
HtmlInput btn = (HtmlInput)page.getHtmlElementById("su");
System.out.println(btn.getDefaultValue());
//关闭webclient
webClient.closeAllWindows();
}
}元素检索
public class helloHtmlUnit{
public static void main(String[] args) throws Exception{
//创建webclient
WebClient webClient = new WebClient(BrowserVersion.CHROME);
//htmlunit 对css和javascript的支持不好,所以请关闭之
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
HtmlPage page = (HtmlPage)webClient.getPage("http://www.baidu.com/");
//查找所有div
List<?> hbList = page.getByXPath("//div");
HtmlDivision hb = (HtmlDivision)hbList.get(0);
System.out.println(hb.toString());
//查找并获取特定input
List<?> inputList = page.getByXPath("//input[@id='su']");
HtmlInput input = (HtmlInput)inputList.get(0);
System.out.println(input.toString());
//关闭webclient
webClient.closeAllWindows();
}
}提交搜索
public class helloHtmlUnit{
public static void main(String[] args) throws Exception{
//创建webclient
WebClient webClient = new WebClient(BrowserVersion.CHROME);
//htmlunit 对css和javascript的支持不好,所以请关闭之
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
HtmlPage page = (HtmlPage)webClient.getPage("http://www.baidu.com/");
//获取搜索输入框并提交搜索内容
HtmlInput input = (HtmlInput)page.getHtmlElementById("kw");
System.out.println(input.toString());
input.setValueAttribute("雅蠛蝶");
System.out.println(input.toString());
//获取搜索按钮并点击
HtmlInput btn = (HtmlInput)page.getHtmlElementById("su");
HtmlPage page2 = btn.click();
//输出新页面的文本
System.out.println(page2.asText());
}
}转载地址:http://www.cnblogs.com/cation/p/3933408.html
本文介绍如何利用HtmlUnit库进行网页数据抓取,包括获取页面标题、XML及文本内容,选择不同浏览器版本,定位特定元素,以及提交搜索表单。
&spm=1001.2101.3001.5002&articleId=49506899&d=1&t=3&u=f32e8312973c46f6bddedf822b859946)
2814

被折叠的 条评论
为什么被折叠?



