heritrix爬取友人网(http://mobile.younet.com/)信息后遇到的问题
最近在使用heritrix爬取了http://mobile.younet.com/网站的网站产品页面后,在运行写入main函数的 Extractor后,控制台并没有出现所想要的信息,只有count输出为0 的信息,我由于初学实在是解决不出来,贴出我用的两个类Extractor和ExtractYounetMobile希望大家能帮我找找是什么原因了
package com.backSearch.extractor; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlparser.Parser; import com.backSearch.extractor.younet.ExtractYounetMobile; public abstract class Extractor { protected static final String NEWLINE = "rn"; /** * 表示所有结果的输出路径 */ private String outputPath = ""; /** * 表示当前正在被处理的文件 */ private String inputFilePath; /** * 表示当前所有被抓取的网页的镜象根目录 在Heritrix用mirror目录表示 */ private String mirrorDir = ""; /** * 用于存放被处理过后的产口的图片的目录 */ private String imageDir = ""; /** * HTMLParser的实例 */ private Parser parser; /** * 对图片路径进行哈希的算法,这里采用MD5算法 */ protected static final String HASH_ALGORITHM = "md5"; /** * 分隔符 */ public static final String SEPARATOR = "======================"; /** * 装载需要的网页文件 * */ public void loadFile(String path) { try { parser = new Parser(path); inputFilePath = path; parser.setEncoding("UTF-8"); } catch (Exception e) { e.printStackTrace(); } } /** * 获取输出的路径 */ public String getOutputPath() { return outputPath; } /** * 设置输出的路径,通常在初始化Extractor时就应该做 */ public void setOutputPath(String outputPath) { this.outputPath = outputPath; } public Parser getParser() { return parser; } /** * 使用正则来匹配并获得网页中的字符串 */ protected String getProp(String pattern, String match, int index) { Pattern sp = Pattern.compile(pattern); Matcher matcher = sp.matcher(match); while (matcher.find()) { return matcher.group(index); } return null; } /** * 抽象方法,用于供子类实现。 其功能主要是解释网页文件 将产品信息保存到 * */ public abstract void extract(); /** * 获取正在处理的文件的路径 */ public String getInputFilePath() { return inputFilePath; } /** * 从mirror目录下拷贝文件至所设定的图片目录 * 该方法可能需要被改变 */ protected boolean copyImage(String image_url, String new_image_file) { String dirs = image_url.substring(7); try { // instance the File as file_in and file_out File file_in = new File(new File(mirrorDir), dirs); if (file_in == null || !file_in.exists()) { file_in = new File("f:\sousuo\noimage.jpg"); } File file_out = new File(new File(imageDir), new_image_file); FileInputStream in1 = new FileInputStream(file_in); FileOutputStream out1 = new FileOutputStream(file_out); byte[] bytes = new byte[1024]; int c; while ((c = in1.read(bytes)) != -1) out1.write(bytes, 0, c); // close in1.close(); out1.close(); return (true); // if success then return true } catch (Exception e) { e.printStackTrace(); return (false); // if fail then return false } } public String getImageDir() { return imageDir; } public void setImageDir(String imageDir) { this.imageDir = imageDir; } public String getMirrorDir() { return mirrorDir; } public void setMirrorDir(String mirrorDir) { this.mirrorDir = mirrorDir; } public void setInputFilePath(String inputFilePath) { this.inputFilePath = inputFilePath; } // public static void main(String[] args) throws Exception { // // Extractor extractor = new Extract163Moblie(); // extractor.setOutputPath("c:\product\test\mobile\"); // extractor.setImageDir("c:\product\test\image\"); // extractor.setMirrorDir("F:\data\163手机\mirror\"); // // traverse(extractor, new File("F:\data\163手机\mirror\mobile.163.com\0011\product\0011000B\product")); // System.out.println(count); // // } static int count = 0; public static void main(String[] args) throws Exception { Extractor extractor = new ExtractYounetMobile(); extractor.setOutputPath("F:\product\mobile\"); extractor.setImageDir("F:\product\image\"); extractor.setMirrorDir("F:\learn\Workspaces\MyEclipse 7.0\heritrixProject_1\jobs\YounetMobile-20100514064948846\mirror\"); //try { //long s = System.currentTimeMillis(); traverse(extractor, new File("F:\learn\Workspaces\MyEclipse 7.0\heritrixProject_1\jobs\YounetMobile-20100514064948846\mirror\mobile.younet.com\files\")); //long e = System.currentTimeMillis(); //System.out.println("1---------------------" + e); //System.out.println("2---------------------" + s); //System.out.println("用时: " + (e - s) / 1000 + " 秒"); System.out.println("总数" + count); // } catch (Exception e) { // e.printStackTrace(); //} } public static void traverse(Extractor extractor, File path) throws Exception { if (path == null) { return; } if (path.isDirectory()) { String[] files = path.list(); for (int i = 0; i < files.length; i++) { traverse(extractor, new File(path, files[i])); } } else { if (path.getAbsolutePath().endsWith(".html") && path.getAbsolutePath().indexOf("_") == -1) { System.out.println(path); count++; extractor.loadFile(path.getAbsolutePath()); extractor.extract(); } } } } package com.backSearch.extractor.younet; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.Date; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.HasChildFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.ImageTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import com.backSearch.extractor.Extractor; import com.backSearch.util.StringUtils; public class ExtractYounetMobile extends Extractor { @Override public void extract() { BufferedWriter bw = null; NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit")); NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1")))); NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img"))); //提取标题信息 try { //Parser根据过滤器返回所有满足过滤条件的节点 // 迭代逐渐查找 NodeList nodeList=this.getParser().parse(title_filter); NodeIterator it = nodeList.elements(); StringBuffer title = new StringBuffer(); while (it.hasMoreNodes()) { Node node = (Node) it.nextNode(); String[] names = node.toPlainTextString().split(" "); for(int i = 0; i < names.length; i++) title.append(names[i]).append("-"); title.append(new Date().getTime()); //创建要生成的文件 bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt"))); //获取当前提取页的完整URL地址 int startPos = this.getInputFilePath().indexOf("mirror") + 6; String url_seg = this.getInputFilePath().substring(startPos); url_seg = url_seg.replaceAll("\\", "/"); String url = "http:/" + url_seg; //写入当前提取页的完整URL地址 bw.write(url + NEWLINE); bw.write(names[0] + NEWLINE); bw.write(names[1] + NEWLINE); } // 重置Parser this.getParser().reset(); Parser attNameParser = null; Parser attValueParser = null; //Parser parser=new Parser("http://www.sina.com.cn"); NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1")); NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2")); String attName = ""; String attValue = ""; // 迭代逐渐查找 nodeList=this.getParser().parse(attribute_filter); it = nodeList.elements(); while (it.hasMoreNodes()) { Node node = (Node) it.nextNode(); attNameParser = new Parser(); attNameParser.setEncoding("UTF-8"); attNameParser.setInputHTML(node.toHtml()); NodeList attNameNodeList = attNameParser.parse(attributeName_filter); attName = attNameNodeList.elements().nextNode().toPlainTextString(); attValueParser = new Parser(); attValueParser.setEncoding("UTF-8"); attValueParser.setInputHTML(node.toHtml()); NodeList attValueNodeList = attValueParser.parse(attributeValue_filter); attValue = attValueNodeList.elements().nextNode().toPlainTextString(); bw.write(attName.trim() + attValue.trim()); bw.newLine(); } // 重置Parser this.getParser().reset(); String imgUrl = ""; String fileType =""; // 迭代逐渐查找 nodeList=this.getParser().parse(img_filter); it = nodeList.elements(); while (it.hasMoreNodes()) { Node node = (Node) it.nextNode(); ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode(); imgUrl = imgNode.getAttribute("src"); fileType = imgUrl.trim().substring(imgUrl .lastIndexOf(".") + 1); //生成新的图片的文件名 String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType; //imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " "); //利用miorr目录下的图片生成的新的图片 this.copyImage(imgUrl, new_iamge_file); bw.write(SEPARATOR + NEWLINE); bw.write(new_iamge_file + NEWLINE); } } catch(Exception e) { e.printStackTrace(); } finally { try{ if (bw != null) bw.close(); }catch(IOException e){ e.printStackTrace(); } } } }
我是在heritrix里面写了一个MobileYounetExtractor 正则表达式选定
“http://mobile.younet.com/choose.php?groupid=1,2,3,4,&tradeid=[\d]+,& ”
来抓取该网站下的各种手机型号的页面和相关图片。
希望大家能给我点儿帮助,支持邮箱及QQ联系。谢谢大家
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
这几天研究了下,htmlparser还是吃不通,将ExtractYounetMobile的代码改了一通后,新的问题马上出现了,控制台提示
F:YounetMobile-20100514064948846mirrormobile.younet.comfiles2323734.html
java.io.FileNotFoundException: F:productmobile
诺基亚-N97-Mini黄金版
-1274142325918.txt (文件名、目录名或卷标语法不正确。)
有哪位同仁也在研究这东西,麻烦点拨我一下吧
果然是路径问题,我把路径F:learnWorkspacesMyEclipse 7.0heritrixProject_1jobs下的YounetMobile-20100514064948846移到F盘根目录下,控制台正常的打印出了各手机品牌页面。不过在我前面制定的F:product xia de mobile 和image下什么都没有。。。明天研究。。。要熄灯了