HTMLParser 解析链接的问题
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Prints the links found on a web page using the HTMLParser library.
 *
 * <p>For every {@code <a>} tag the resolved URL and the link text are printed,
 * separated by {@code ********}. For every node whose text starts with
 * {@code frame src=} the value of the {@code src} attribute is printed.
 */
public class Catenate {

    /** Attribute prefix searched for inside the text of a frame node. */
    private static final String SRC_ATTR = "src=";

    /**
     * Entry point: extracts and prints the links of a fixed sample page.
     *
     * @param args ignored
     * @throws Exception declared for convenience; parser failures are handled
     *                   inside {@link #extracLinks(String)}
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.bradfordexchange.com/";
        extracLinks(url);
    }

    /**
     * Fetches the page at {@code url} and prints every link and frame source found.
     *
     * <p>The anonymous {@code NodeFilter} below triggers a "serial" warning, which
     * is suppressed on this method.
     *
     * @param url address of the page to parse
     */
    @SuppressWarnings("serial")
    public static void extracLinks(String url) {
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");

            // Accepts nodes whose text starts with "frame src=", so that the
            // link stored in a frame's src attribute can be extracted later.
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };

            // A node is kept when it is an <a> tag OR it matches the frame filter.
            // NOTE(review): no ImageTag filter is part of this OrFilter, so the
            // ImageTag branch in the loop below is never taken as written. Add
            // new NodeClassFilter(ImageTag.class) here if image URLs are wanted.
            OrFilter linkFilter = new OrFilter(
                    new NodeClassFilter(LinkTag.class), frameFilter);

            // All nodes that passed the filter.
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) {
                    // <a> tag: print URL and link text.
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    String text = link.getLinkText();
                    System.out.println(linkUrl + "********" + text);
                } else if (tag instanceof ImageTag) {
                    // <img> tag: print image URL and the tag text.
                    ImageTag image = (ImageTag) tag;
                    System.out.print(image.getImageURL() + "********");
                    System.out.println(image.getText());
                } else {
                    // Frame node, e.g. <frame src="test.html"/>: print the src value.
                    System.out.println(extractFrameSrc(tag.getText()));
                }
            }
        } catch (ParserException e) {
            System.err.println("Failed to parse " + url);
            e.printStackTrace();
        }
    }

    /**
     * Extracts the value of the {@code src} attribute from the text of a frame node.
     *
     * <p>The previous inline code called
     * {@code frame.substring(frame.length(), end - 1)}, which always throws
     * because the begin index exceeds the end index, and it did not guard against
     * a missing {@code src=} or a missing terminator. This helper handles
     * double-quoted, single-quoted and unquoted values.
     *
     * @param tagText text of the frame node, e.g. {@code frame src="test.html"}
     * @return the attribute value without surrounding quotes, or an empty string
     *         when {@code tagText} is {@code null} or contains no {@code src=}
     */
    private static String extractFrameSrc(String tagText) {
        if (tagText == null) {
            return "";
        }
        int start = tagText.indexOf(SRC_ATTR);
        if (start == -1) {
            return "";
        }
        String value = tagText.substring(start + SRC_ATTR.length());
        if (value.length() == 0) {
            return "";
        }

        char first = value.charAt(0);
        if (first == '"' || first == '\'') {
            // Quoted value: runs up to the matching closing quote, or to the
            // end of the text when the closing quote is missing.
            int close = value.indexOf(first, 1);
            return close == -1 ? value.substring(1) : value.substring(1, close);
        }

        // Unquoted value: ends at the first space or '>', or at the end of the text.
        int end = value.indexOf(' ');
        if (end == -1) {
            end = value.indexOf('>');
        }
        if (end == -1) {
            end = value.length();
        }
        return value.substring(0, end);
    }
}
后面怎么写才可以只抓取目录链接而不抓别的?
只抓这些:
http://www.bradfordexchange.com/mcategory/decor-and-tabletop_8244/wall-decor.html********Wall Décor
http://www.bradfordexchange.com/mcategory/holidays.html********Holiday Décor
http://www.bradfordexchange.com/mcategory/decor-and-tabletop_8226/outdoor-flags.html********Outdoor Flags
http://www.bradfordexchange.com/mcategory/decor-and-tabletop.html********All Décor & Tabletop
http://www.bradfordexchange.com/mcategory/villages-and-trains.html********Villages & Trains
http://www.bradfordexchange.com/mcategory/villages-and-trains_565/villages.html********Villages
不抓这些:
http://www.bradfordexchange.com/mcategory/coins.html********
http://www.bradfordexchange.com/mcategory/holidays.html********
http://www.bradfordexchange.com/mcategory/dolls.html********
http://www.bradfordexchange.com/mcategory/books-and-music.html********
http://www.bradfordexchange.com/mcategory/gifts.html********
http://www.bradfordexchange.com/mcategory/christmas-gifts.html********
http://www.bradfordexchange.com/mcategory/shop-by-theme.html********
http://www.bradfordexchange.com/store/20091217001/modal-source/smtWin_infoVeriSign.html********
http://www.bradfordexchange.com/store/20091217001/modal-source/smtWin_infoGuarantee.html********
********
http://www.ashtondrake.com********
http://www.hamiltoncollection.com********
http://www.bradfordexchangechecks.com********
http://www.bradfordexchange.com/service/international-sites.html********
http://www.bradford.co.uk/********
http://www.bradford.com.au/********
http://www.bradfordltd.co.nz/********
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
可以定义一个 filter,每获取到一个 LinkTag 就在 filter 中进行过滤。应该可以满足你的需求。