Java 爬虫代码优化
package com.company;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.http.HttpEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Small crawler for the listing pages of gongshang.mingluji.com.
 *
 * <p>Given a listing page URL, it downloads the page, extracts the link found in
 * each company-name table cell, downloads every linked page and prints its HTML
 * to standard output.
 */
public class MingLuSpider {

    /** User-Agent sent with every request; configured once on the shared client. */
    private static final String USER_AGENT =
            "Mozilla/5.0(Windows NT 6.1;Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0";

    /** Prefix prepended to the {@code href} values found on the listing page. */
    private static final String SITE_ROOT = "https://gongshang.mingluji.com";

    /** CSS selector matching the table cells that hold the company links. */
    private static final String COMPANY_CELL_SELECTOR = "td[class='views-field views-field-name']";

    /** Charset used to decode response bodies. */
    private static final String CHARSET = "utf-8";

    /**
     * Regular method that merely shares the class name. Because it declares a
     * {@code void} return type it is NOT a constructor; the class still uses the
     * implicit default constructor.
     *
     * <p>Kept only so that existing callers keep compiling. The original body
     * created an instance and immediately discarded it, so the method has no
     * observable effect.
     */
    public void MingLuSpider() {
        // Intentionally empty: see the Javadoc above.
    }

    /**
     * Downloads the listing page at {@code url}, then downloads and prints every
     * company page linked from it.
     *
     * <p>A single HTTP client is created with the User-Agent preset, so individual
     * requests do not need to set the header again. The client and every response
     * are closed via try-with-resources, including when a request fails.
     *
     * <p>Error handling is best-effort: any exception is printed with its stack
     * trace and the crawl stops; nothing is rethrown. The {@code throws} clause is
     * retained so existing callers remain source-compatible.
     *
     * @param url address of the listing page to crawl
     * @throws IOException declared for backward compatibility; failures are
     *         currently caught and printed inside this method
     */
    public void GetRequestData(String url) throws IOException {
        try (CloseableHttpClient httpClient =
                     HttpClients.custom().setUserAgent(USER_AGENT).build()) {
            String listingHtml = fetchBody(httpClient, url);
            Document document = Jsoup.parse(listingHtml);
            Elements companyCells = document.select(COMPANY_CELL_SELECTOR);
            for (Element companyCell : companyCells) {
                String link = SITE_ROOT + companyCell.select("a").attr("href");
                System.out.println("每个公司链接为:" + link);
                String detailHtml = fetchBody(httpClient, link);
                System.out.println(detailHtml);
                System.out.println("这个链接为");
                System.out.println(link);
            }
        } catch (Exception e) {
            // Boundary catch preserved from the original best-effort behaviour.
            e.printStackTrace();
        }
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * <p>The response is always closed before returning so the underlying
     * connection is released back to the client for the next request.
     *
     * @param httpClient client used to execute the request
     * @param url address to fetch
     * @return the response body decoded with {@link #CHARSET}
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchBody(CloseableHttpClient httpClient, String url) throws IOException {
        HttpGet request = new HttpGet(url);
        try (CloseableHttpResponse response = httpClient.execute(request)) {
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, CHARSET);
        }
    }
}
这里每次请求新的 URL 都需要 new HttpGet(link),而且每次还要 set 同样的 header。有没有办法不用每次都 new,并且 header 只需设置一次就可以了?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定您的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
试试 fluent-hc 吧,它是 HttpClient 官方提供的 fluent 风格封装,用起来比直接使用 HttpClient 方便得多。