Jsoup Html解析查找内部链接数据问题

发布于 2024-12-04 06:47:15 字数 6587 浏览 1 评论 0 原文

通常我们在一个文件中有很多内部链接。我想解析一个 html 文件，以便在地图中获取页面的标题及其相应的数据。

我所做的步骤：
1) 获取所有内部参考元素
2) 解析文档中的 id = XXX，其中 XXX ==（元素 ）。 3) 它带我到这里的小文本； <这里还有一些标签>

实际文本在这里


 这里也是

  4) 如何从  前往到  ???

  5）我尝试转到 span 的父级，并认为它的子级之一也是 
 ...这是真的。但它也涉及其他内部链接的
。

编辑：添加了示例 html 文件部分：

<li class="toclevel-1 tocsection-1"><a href="#Enforcing_mutual_exclusion">  
<span class="tocnumber">1</span> <span class="toctext">Enforcing mutual   exclusion</span>  </a><ul>  
<li class="toclevel-2 tocsection-2"><a href="#Hardware_solutions">  
<span class="tocnumber">1.1</span> <span class="toctext">Hardware solutions</span>  
</a></li> 
<li class="toclevel-2 tocsection-3"><a href="#Software_solutions">

<h2><span class="editsection">[<a href="/w/index.php?title=Mutual_exclusion&     
 amp;action=edit&amp;section=1" title="Edit section: Enforcing mutual exclusion">
 edit</a>]</span> <span class="mw-headline" id="Enforcing_mutual_exclusion">

<comment --------------------------------------------------------------------    

**see the id above = Enforcing_mutual_exclusion**  which is same as first internal   
link . Jsoup takes me to this span element.  i want to access every <p> element after 
this <span> tag  before another <span> tag with id="any of the internal links"
------------------------------------------------------------------------------!>
Enforcing mutual exclusion</span></h2>
<p>There are both software and hardware solutions for enforcing mutual exclusion.  
The   different solutions are shown below.</p>
<h3><span class="editsection">[<a href="/w/index.php?title=Mutual_exclusion&  
amp;action=edit&amp;section=2" title="Edit section: Hardware solutions">
edit</a>]</span> <span class="mw-headline" id="Hardware_solutions">Hardware   
solutions</span></h3>

<p>On a <a href="/wiki/Uniprocessor" title="Uniprocessor" class="mw-  
 redirect">uniprocessor</a> system a common way to achieve mutual exclusion inside 
 <a href="/wiki/Kernel_(computing)" title="Kernel (computing)">kernels</a> is 
 disable <a href="/wiki/Interrupt" title="Interrupt">

这是我的代码：

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public final class Website {
private URL websiteURL ;
private Document httpDoc ;
LinkedHashMap<String, ArrayList<String>> internalLinks = 
        new LinkedHashMap<String, ArrayList<String>>();

public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
    if(__websiteURL == null)
        throw new Exception();
    websiteURL = __websiteURL;
    httpDoc = Jsoup.parse(connect());

    System.out.println("Parsed the http file to Document");
}


/* Here is my function: i first gets all the internal links in internalLinksElements. 
   I then get the href name of <a ..> tag so that i can search for it in documnet.

*/
public void getDataWithHeadingsTogether(){
    Elements internalLinksElements;
    internalLinksElements = httpDoc.select("a[href^=#]");
    for(Element element : internalLinksElements){

// some inline links were bad. i only those having span as their child. 
        Elements spanElements = element.select("span");
        if(!spanElements.isEmpty()){
            System.out.println("Text(): " + element.text());  // this can not give what i want
            //  ok i get the href tag name that would be the id
            String href = element.attr("href") ; 
            href = href.replace("#", "");
            System.out.println(href);
            // selecting the element where we have that id.
            Element data = httpDoc.getElementById(href);
            // got the span
            if(data == null)
                continue;
            Elements children = new Elements();
            // problem is here.
            while(children.isEmpty()){
                // going to its element unless gets some data.
                data = data.parent();
                System.out.println(data);
                children = data.select("p");
            }
            // its giving me all the data of file. thats bad.
            System.out.println(children.text());
        }
    }
}
/**
 * 
 * @return String Get all the headings of the document.
 * @throws MalformedURLException
 * @throws IOException
 */
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
    // Is this thread safe ? url.openStream();
    BufferedReader  reader = null;
    try{
        reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
        System.out.println("Got the reader");
    } catch(Exception e){
        e.printStackTrace();
        System.out.println("Bye");
        String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        return html;
    }
    String inputLine, result  = new String();
    while((inputLine = reader.readLine()) != null){
        result += inputLine;
    }
    reader.close();
    System.out.println("Made the html file");
    return result;
}




/**
 * 
 * @param argv all the command line parameters.
 * @throws MalformedURLException
 * @throws IOException
 */
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
    System.setProperty("proxyHost", "172.16.0.3");
    System.setProperty("proxyPort","8383");
     System.out.println("Sending url");

     // a html file or any url place here ------------------------------------
     URL url = new URL("put a html file here "); 
    Website website = new Website(url);

    System.out.println(url.toString());
    System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
    website.getDataWithHeadingsTogether();

}
}

原文

Usually we have many internal links in a file. I want to parse a html file such that i get the headings of a page and its corresponding data in a map.

Steps i did:
1) Got all the internal reference elements
2) Parsed the document for the id = XXX where XXX == (element <a href="#XXX").
3) it takes me to the little text here <some tags here too > actual text here here too 
4) How to go from  to  ???
5) I tried going to parent of span and thought that its one of the child is  too... its true. But it also involves  of other internal links too.

EDIT: added an sample html file portion:

<li class="toclevel-1 tocsection-1"><a href="#Enforcing_mutual_exclusion">  
<span class="tocnumber">1</span> <span class="toctext">Enforcing mutual   exclusion</span>  </a><ul>  
<li class="toclevel-2 tocsection-2"><a href="#Hardware_solutions">  
<span class="tocnumber">1.1</span> <span class="toctext">Hardware solutions</span>  
</a></li> 
<li class="toclevel-2 tocsection-3"><a href="#Software_solutions">

<h2><span class="editsection">[<a href="/w/index.php?title=Mutual_exclusion&     
 amp;action=edit&section=1" title="Edit section: Enforcing mutual exclusion">
 edit</a>]</span> <span class="mw-headline" id="Enforcing_mutual_exclusion">

<comment --------------------------------------------------------------------    

**see the id above = Enforcing_mutual_exclusion**  which is same as first internal   
link . Jsoup takes me to this span element.  i want to access every <p> element after 
this <span> tag  before another <span> tag with id="any of the internal links"
------------------------------------------------------------------------------!>
Enforcing mutual exclusion</span></h2>
<p>There are both software and hardware solutions for enforcing mutual exclusion.  
The   different solutions are shown below.</p>
<h3><span class="editsection">[<a href="/w/index.php?title=Mutual_exclusion&  
amp;action=edit&section=2" title="Edit section: Hardware solutions">
edit</a>]</span> <span class="mw-headline" id="Hardware_solutions">Hardware   
solutions</span></h3>

<p>On a <a href="/wiki/Uniprocessor" title="Uniprocessor" class="mw-  
 redirect">uniprocessor</a> system a common way to achieve mutual exclusion inside 
 <a href="/wiki/Kernel_(computing)" title="Kernel (computing)">kernels</a> is 
 disable <a href="/wiki/Interrupt" title="Interrupt">

Here is my code:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public final class Website {
private URL websiteURL ;
private Document httpDoc ;
LinkedHashMap<String, ArrayList<String>> internalLinks = 
        new LinkedHashMap<String, ArrayList<String>>();

public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
    if(__websiteURL == null)
        throw new Exception();
    websiteURL = __websiteURL;
    httpDoc = Jsoup.parse(connect());

    System.out.println("Parsed the http file to Document");
}


/* Here is my function: i first gets all the internal links in internalLinksElements. 
   I then get the href name of <a ..> tag so that i can search for it in documnet.

*/
public void getDataWithHeadingsTogether(){
    Elements internalLinksElements;
    internalLinksElements = httpDoc.select("a[href^=#]");
    for(Element element : internalLinksElements){

// some inline links were bad. i only those having span as their child. 
        Elements spanElements = element.select("span");
        if(!spanElements.isEmpty()){
            System.out.println("Text(): " + element.text());  // this can not give what i want
            //  ok i get the href tag name that would be the id
            String href = element.attr("href") ; 
            href = href.replace("#", "");
            System.out.println(href);
            // selecting the element where we have that id.
            Element data = httpDoc.getElementById(href);
            // got the span
            if(data == null)
                continue;
            Elements children = new Elements();
            // problem is here.
            while(children.isEmpty()){
                // going to its element unless gets some data.
                data = data.parent();
                System.out.println(data);
                children = data.select("p");
            }
            // its giving me all the data of file. thats bad.
            System.out.println(children.text());
        }
    }
}
/**
 * 
 * @return String Get all the headings of the document.
 * @throws MalformedURLException
 * @throws IOException
 */
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
    // Is this thread safe ? url.openStream();
    BufferedReader  reader = null;
    try{
        reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
        System.out.println("Got the reader");
    } catch(Exception e){
        e.printStackTrace();
        System.out.println("Bye");
        String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        return html;
    }
    String inputLine, result  = new String();
    while((inputLine = reader.readLine()) != null){
        result += inputLine;
    }
    reader.close();
    System.out.println("Made the html file");
    return result;
}




/**
 * 
 * @param argv all the command line parameters.
 * @throws MalformedURLException
 * @throws IOException
 */
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
    System.setProperty("proxyHost", "172.16.0.3");
    System.setProperty("proxyPort","8383");
     System.out.println("Sending url");

     // a html file or any url place here ------------------------------------
     URL url = new URL("put a html file here "); 
    Website website = new Website(url);

    System.out.println(url.toString());
    System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
    website.getDataWithHeadingsTogether();

}
}

分享到QQ

分享到微博