java网页抓取问题
package com.rensanning;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.ProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.sun.org.apache.xpath.internal.XPathAPI;
public class DhlDetail {
private static final String HTML_DETAIL_HTML = "html/detail.html";
private static String url1 = "http://www.cn.dhl.com/content/cn/zh/express/tracking.shtml?brand=DHL&AWB=8545665704%0D%0A";
public static void main(String[] args) {
try {
//抓取追踪信息页面HTML .
getHtml(url1, HTML_DETAIL_HTML, null);
//获取 抓取运输进程页面HTML时 需要的参数
Map<String, String> data = getHiddenValue(HTML_DETAIL_HTML);
//抓取运输进程页面HTML
getHtml(url1, HTML_DETAIL_HTML, data);
//获取运输进程
List<DetailBean> list = getDetailList(HTML_DETAIL_HTML);
//打印详细的运输进程
DetailBean bean = null;
System.out.println("地点" + "t" + "日期" + "t" + "当地时间" + "t" + "处理");
for (int i = 0; i < list.size(); i++) {
bean = list.get(i);
System.out.println(bean.getLocation() + "t" + bean.getDate() + "t" + bean.getTime() + "t" + bean.getOperation());
}
} catch (Exception e) {
e.printStackTrace();
}
}
private static List<DetailBean> getDetailList(String html) throws Exception {
List<DetailBean> list = new ArrayList<DetailBean>();
DOMParser parser = new DOMParser();
parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(html);
Node node = parser.getDocument();
Node tb = XPathAPI.selectSingleNode(node, "//TABLE[@class='dataTable']");
NodeList tdlist = XPathAPI.selectNodeList(tb, "//TR/TD");
int line = 0;
while (line < tdlist.getLength() / 4) {
DetailBean bean = new DetailBean();
bean.setLocation(deleteSpace(tdlist.item(line * 4 + 0).getTextContent()));
bean.setDate(deleteSpace(tdlist.item(line * 4 + 1).getTextContent()));
bean.setTime(deleteSpace(tdlist.item(line * 4 + 2).getTextContent()));
bean.setOperation(deleteSpace(tdlist.item(line * 4 + 3).getTextContent()));
line++;
list.add(bean);
}
return list;
}
private static Map<String, String> getHiddenValue(String html) throws Exception {
Map<String, String> data = new HashMap<String, String>();
List<String> params = new ArrayList<String>();
params.add("loc".toLowerCase());
params.add("USER_HISTORY_LIST".toLowerCase());
params.add("progressIsLoaded".toLowerCase());
params.add("refresh_sii".toLowerCase());
params.add("showSpPkgProg1".toLowerCase());
params.add("datakey".toLowerCase());
params.add("HIDDEN_FIELD_SESSION".toLowerCase());
params.add("trackNums".toLowerCase());
DOMParser parser = new DOMParser();
// parser.setFeature("http://xml.org/sax/features/namespaces", false);
parser.parse(html);
Node node = parser.getDocument();
NodeList nodeList = XPathAPI.selectNodeList(node, "//INPUT");
for (int i = 0; i < nodeList.getLength(); i++) {
Element e = (Element) nodeList.item(i);
if ("hidden".equalsIgnoreCase(e.getAttribute("type"))
&& params.contains(e.getAttribute("name").toLowerCase())) {
data.put(e.getAttribute("name"), e.getAttribute("value"));
}
}
System.out.println("订单编号:" + data.get("trackNums"));
return data;
}
private static void getHtml(String url, String filename, Map<String, String> data) throws Exception {
//创建一个客户端
DefaultHttpClient client = new DefaultHttpClient();
HttpResponse res = null;
if (data == null) {
//创建一个get方法
HttpGet get = new HttpGet(url);
//执行请求
res = client.execute(get);
} else {
client.setRedirectStrategy(new DefaultRedirectStrategy() {
public boolean isRedirected(HttpRequest request, HttpResponse response, HttpContext context) {
boolean isRedirect = false;
try {
isRedirect = super.isRedirected(request, response, context);
} catch (ProtocolException e) {
e.printStackTrace();
}
if (!isRedirect) {
int responseCode = response.getStatusLine().getStatusCode();
if (responseCode == 301 || responseCode == 302) {
return true;
}
}
return isRedirect;
}
});
//作成post参数Entity
List<NameValuePair> formparams = new ArrayList<NameValuePair>();
Iterator i = data.keySet().iterator();
while(i.hasNext()) {
String key = (String)i.next();
formparams.add(new BasicNameValuePair(key, data.get(key)));
}
UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8");
//创建一个post方法
HttpPost post = new HttpPost(url);
//设置post参数
post.setEntity(entity);
//执行请求
res = client.execute(post);
}
//获取完整的StatusLine・・・「HTTP/1.1 200 OK」
System.out.println(res.getStatusLine().toString());
//获取返回内容
if (res.getEntity() != null) {
String result = EntityUtils.toString(res.getEntity());
//System.out.println(result);
//生成HTML文件保存到本地(测试用可以不保存直接解析)
createHtmlFile(filename, result);
}
//关闭流
EntityUtils.consume(res.getEntity());
//关闭连接
client.getConnectionManager().shutdown();
}
private static void createHtmlFile(String filename, String data) throws Exception {
File file = new File(filename);
OutputStream os = new FileOutputStream(file);
os.write(data.getBytes("UTF-8"));
os.close();
}
private static String deleteSpace(String in) {
Pattern pattern = Pattern.compile("\s*|t|r|n");
Matcher re = pattern.matcher(in);
return re.replaceAll("");
}
}
抓取上面 URL 中"查询结果汇总"的信息时,好像是我解析错了,求指导。
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(6)
那您看我的代码需要做什么修改 请给些意见 现在报org.w3c.dom.DOMException: HIERARCHY_REQUEST_ERR: An attempt was made to insert a node where it is not permitted. 错误
不解,为什么同时用了htmlparser和httpclient。单纯抓网页的话htmlparser就可以了啊
或者在 java 中执行 js.
换jcop吧,简单
这个问题解决好久了 HtmlParser 解析到底应该是什么 能帮分析下么
HtmlParser解析有点问题,html中多一空行都会被解析多一个结点出来,你试试一步步print出来看看你解析出来的那个结点是不是就是你要的那个结点
个人推荐jsoup,比较准确,并且可以使用类似jquery的选择器来直接提取结点。