import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes table rows from a list of web pages and saves them into an Excel workbook.
 *
 * <p>{@link #main(String[])} reads one URL per row from column 0 of the first sheet of
 * {@code E://beijing.xls}, downloads each page, collects the {@code <td>} texts of every element
 * carrying the CSS class {@code villagetr}, and writes all collected rows to
 * {@code E:/beijingshi.xls}.
 */
public class HttpClientTest {

    /** Minimum width of a scraped row; shorter rows are padded with {@code null}. */
    private static final int MIN_COLUMNS = 3;

    /**
     * Entry point: reads the URL list from the input workbook, scrapes every URL and writes the
     * result workbook.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Workbook workbook = null;
        try {
            // Parse the input Excel file.
            workbook = Workbook.getWorkbook(new File("E://beijing.xls"));
            Sheet sheet = workbook.getSheet(0);
            // Example of a single URL:
            // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/11/01/01/110101001.html
            int rowCount = sheet.getRows();
            System.out.println("excel总行数:" + rowCount);

            // A growable list replaces the former fixed 8000-row buffer, so any number of
            // scraped rows fits and no blank padding rows end up in the output file.
            List<String[]> collected = new ArrayList<>();
            for (int i = 0; i < rowCount; i++) {
                String url = sheet.getCell(0, i).getContents();
                System.out.println(url);
                collected.addAll(fetchVillageRows(url));
            }
            getHSSFWorkbook("xingzhengquhua.xls", null, collected.toArray(new String[0][]), null);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BiffException e) {
            e.printStackTrace();
        } finally {
            // Release the resources held by the jxl workbook.
            if (workbook != null) {
                workbook.close();
            }
        }
    }

    /**
     * Downloads {@code html1}, extracts the {@code villagetr} rows and stores them in
     * {@code values} starting at index {@code size}.
     *
     * <p>Rows that do not fit into {@code values} are dropped (with a message on stdout) instead
     * of raising {@link ArrayIndexOutOfBoundsException}.
     *
     * @param html1  URL of the page to fetch
     * @param values destination buffer; each stored row is an array of cell texts
     * @param size   index in {@code values} at which the first row is stored
     * @return number of rows actually stored in {@code values}; 0 on any failure or non-200 status
     */
    public static int getHtml(String html1, String[][] values, int size) {
        List<String[]> rows = fetchVillageRows(html1);
        int stored = 0;
        for (String[] row : rows) {
            int target = size + stored;
            if (target >= values.length) {
                System.out.println("values buffer is full; dropped " + (rows.size() - stored) + " row(s)");
                break;
            }
            values[target] = row;
            stored++;
        }
        return stored;
    }

    /**
     * Fetches one page with an HTTP GET and returns the cell texts of all {@code villagetr} rows.
     *
     * @param url page URL; {@code null} or blank yields an empty list
     * @return one {@code String[]} per matching element (at least {@link #MIN_COLUMNS} wide);
     *     empty when the request fails, the URL is malformed or the status is not 200
     */
    private static List<String[]> fetchVillageRows(String url) {
        List<String[]> rows = new ArrayList<>();
        if (url == null || url.trim().isEmpty()) {
            return rows;
        }
        // 1. Create the HTTP client (comparable to opening a browser).
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        try {
            // 2. Create the GET request (comparable to typing the address). Built inside the
            //    try block because a malformed URL makes the constructor throw
            //    IllegalArgumentException, which must not abort the whole run.
            HttpGet request = new HttpGet(url.trim());
            // 3. Execute the request (comparable to pressing Enter).
            response = httpClient.execute(request);
            HttpEntity httpEntity = response.getEntity();
            // 4. Only a 200 response is parsed.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                if (httpEntity == null) {
                    return rows;
                }
                // 5. Read the response body; the original code decodes these pages as gb2312.
                String html = EntityUtils.toString(httpEntity, "gb2312");
                System.out.println("获取内容:" + html);
                org.jsoup.nodes.Document parse = Jsoup.parse(html);
                org.jsoup.nodes.Element title = parse.getElementsByTag("title").first();
                System.out.println(title);
                Elements elementsByClass = parse.getElementsByClass("villagetr");
                System.out.println("获取数据:" + elementsByClass);
                for (Element byClass : elementsByClass) {
                    Elements td = byClass.getElementsByTag("td");
                    // Size the row by the real cell count so extra cells cannot overflow it.
                    String[] strings = new String[Math.max(MIN_COLUMNS, td.size())];
                    int k = 0;
                    for (Element element : td) {
                        String text = element.text();
                        System.out.println("td数据:" + text);
                        strings[k] = text;
                        k++;
                    }
                    rows.add(strings);
                }
            } else {
                // Non-200 status (e.g. 404): just report it and the body, if there is one.
                System.out.println("返回状态不是200");
                if (httpEntity != null) {
                    System.out.println(EntityUtils.toString(httpEntity, "utf-8"));
                }
            }
        } catch (IllegalArgumentException e) {
            // Malformed URL in the input sheet: skip this entry.
            e.printStackTrace();
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 6. Always release the connection and the client.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return rows;
    }

    /**
     * Adds a sheet filled with {@code values} to a workbook and writes the workbook to
     * {@code E:/beijingshi.xls}.
     *
     * <p>Row 0 is the header row: it holds the centred {@code title} texts, or stays empty when
     * {@code title} is {@code null}. Data rows start at row 1. An {@link IOException} while
     * writing the file is printed and otherwise ignored; the workbook is still returned.
     *
     * @param sheetName name of the sheet to create
     * @param title     header texts, or {@code null} for an empty header row
     * @param values    data rows; {@code null} (or a {@code null} row) produces no cells
     * @param wb        workbook to extend, or {@code null} to create a new one
     * @return the workbook that was written
     */
    public static HSSFWorkbook getHSSFWorkbook(String sheetName, String[] title, String[][] values, HSSFWorkbook wb) {
        if (wb == null) {
            wb = new HSSFWorkbook();
        }
        HSSFSheet sheet = wb.createSheet(sheetName);
        HSSFRow headerRow = sheet.createRow(0);
        HSSFCellStyle style = wb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
        if (title != null) {
            for (int c = 0; c < title.length; c++) {
                headerRow.createCell(c).setCellValue(title[c]);
                headerRow.getCell(c).setCellStyle(style);
            }
        }
        if (values != null) {
            for (int i = 0; i < values.length; i++) {
                HSSFRow row = sheet.createRow(i + 1);
                if (values[i] == null) {
                    continue;
                }
                for (int j = 0; j < values[i].length; j++) {
                    row.createCell(j).setCellValue(values[i][j]);
                }
            }
        }
        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream fileOutputStream = new FileOutputStream("E:/beijingshi.xls")) {
            wb.write(fileOutputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return wb;
    }
}
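/* Required jars (Maven dependencies).
   Note: the code also imports org.apache.http.* (Apache HttpClient; the
   CloseableHttpClient / HttpClients API needs version 4.3 or later), which is
   not listed below and must be added as well.
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.jexcelapi</groupId>
<artifactId>jxl</artifactId>
<version>2.6.12</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.6</version>
</dependency>
*/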

该代码示例演示了如何使用Apache HttpClient发起HTTP GET请求,然后利用Jsoup解析HTML内容,提取特定标签内的数据。程序读取Excel文件中的URL,抓取每个页面上的数据,并将结果存储到新的Excel文件中。

902

被折叠的 条评论
为什么被折叠?



