Jsoup:在不同类之间选择 HTML

发布于 2024-12-03 22:55:56 字数 1578 浏览 0 评论 0原文

我正在尝试做类似的事情: Jsoup:如何获取 2 个标头标签之间的所有 html

但是,我的代码似乎跳过了纯文本。 我正在解析的网站以这样的方式组织代码:

div class = "quoted-message"
     Response. Can contain images, text, etc.
div class = "quoted-message"
     Another response to another quoted message

用于处理实际消息的代码片段:

// Question snippet: groups the siblings of the first ".quoted-message" element
// into runs and hands each run to processElementsBetween.
// Take the first element matched by the ".quoted-message" selector.
Element quote = msg.select(".quoted-message").first();
// Not read anywhere in this snippet.
Boolean hasQuote = false;
Elements siblings = null;
// NOTE(review): this asks for sibling *elements*; per the answer further down,
// plain text lives in text nodes, which are Nodes rather than Elements, so it
// would presumably be absent from this collection - confirm.
siblings = quote.siblingElements();
createQuotePost(quote);
// Accumulates the siblings seen since the last boundary.
List<Element> elementsBetween = new ArrayList<Element>();
    // Iteration starts at index 1, so siblings.get(0) is never examined.
    for (int i = 1; i < siblings.size(); i++) {
        Element sibling = siblings.get(i);
        // NOTE(review): a tag name is compared against a selector-style string
        // ("div.quoted-message"); the answer further down tests the tag name
        // "div" and the class separately, so this presumably never matches
        // and the else branch is never taken - confirm.
        if (! "div.quoted-message".equals(sibling.tagName())) {
            elementsBetween.add(sibling);
            }

        else {
            Log.v("location", "Clear and Process");
            processElementsBetween(elementsBetween);
            elementsBetween.clear();
        }
    }
    // Flush whatever was collected after the last boundary, if anything.
    if (! elementsBetween.isEmpty())
        processElementsBetween(elementsBetween);

但是,这似乎并没有按我想要的方式工作。回复内容没有任何特殊的格式(例如:并没有包在 p 标签中)。通过一些日志记录,我可以看到它们没有被放入 Elements siblings 中。 siblings 似乎只包含换行符之类的内容。

注意:我只在短帖子(简单的单行回复)上测试过这一点,以节省筛选长页打印输出的时间。

有什么建议吗?

编辑: 以下是 2 个引用消息 div 之间的 HTML 代码片段:

    MESSAGE TO BE QUOTED
    </div>
    <br />
    <br />
    Hello quoted message
    <br />
    I am a response
    <br />
    <br />
    <div class="quoted-message">

I'm trying to do something similar to: Jsoup: How to get all html between 2 header tags

However, it seems my code is avoiding plain text.
The site I'm parsing has code setup in such a way:

div class = "quoted-message"
     Response. Can contain images, text, etc.
div class = "quoted-message"
     Another response to another quoted message

Code Snippet used to handle the actual messages:

// Question snippet: groups the siblings of the first ".quoted-message" element
// into runs and hands each run to processElementsBetween.
// Take the first element matched by the ".quoted-message" selector.
Element quote = msg.select(".quoted-message").first();
// Not read anywhere in this snippet.
Boolean hasQuote = false;
Elements siblings = null;
// NOTE(review): this asks for sibling *elements*; per the answer further down,
// plain text lives in text nodes, which are Nodes rather than Elements, so it
// would presumably be absent from this collection - confirm.
siblings = quote.siblingElements();
createQuotePost(quote);
// Accumulates the siblings seen since the last boundary.
List<Element> elementsBetween = new ArrayList<Element>();
    // Iteration starts at index 1, so siblings.get(0) is never examined.
    for (int i = 1; i < siblings.size(); i++) {
        Element sibling = siblings.get(i);
        // NOTE(review): a tag name is compared against a selector-style string
        // ("div.quoted-message"); the answer further down tests the tag name
        // "div" and the class separately, so this presumably never matches
        // and the else branch is never taken - confirm.
        if (! "div.quoted-message".equals(sibling.tagName())) {
            elementsBetween.add(sibling);
            }

        else {
            Log.v("location", "Clear and Process");
            processElementsBetween(elementsBetween);
            elementsBetween.clear();
        }
    }
    // Flush whatever was collected after the last boundary, if anything.
    if (! elementsBetween.isEmpty())
        processElementsBetween(elementsBetween);

This, however, does not seem to work as I want it to. The responses to the code do not have any special formatting to them (ie: sitting in a p tag). Using a bit of logging, I can see they aren't getting put into Elements siblings.
Siblings seems to just include line breaks and such.

Note: I've only tested this on small posts (simple one liners) to save on sifting through long pages of printouts.

Any suggestions on what to do?

EDIT:
Here is the HTML code snippet between 2 quoted-message divs:

    MESSAGE TO BE QUOTED
    </div>
    <br />
    <br />
    Hello quoted message
    <br />
    I am a response
    <br />
    <br />
    <div class="quoted-message">

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

心是晴朗的。 2024-12-10 22:55:56

我认为问题之一是您请求的是 Element 而不是 Node。文本节点属于 Node,而不是 Element。

试试这个:

package grimbo.test;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

/**
 * Demo: pairs each {@code div.quoted-message} with the nodes (text and
 * elements alike) that follow it, up to the next quoted message.
 *
 * <p>Working on {@link Node}s rather than {@link Element}s is what keeps the
 * plain-text responses, which are text nodes, in the result.
 */
public class StackOverflow {
    /** Parses a small sample document and processes its quoted messages. */
    public static void main(String[] args) {
        String html = "<div class=quoted-message>message-1</div>\n    <br />\n    <br />\n    Hello quoted message\n    <br />\n    I am a response\n    <br />\n    <br />\n";
        html += "<div class=quoted-message>message-2</div>\n    <br />\n    <br />\n    Hello quoted message\n    <br />\n    I am a response\n    <br />\n    <br />\n";
        Document doc = Jsoup.parse(html);
        handleQuotedMessages(doc.select(".quoted-message"));
    }

    /**
     * Walks forward from the first quoted message and calls
     * {@link #createQuotePost} once per quoted message, passing the nodes found
     * between it and the next quoted message (or the end of the sibling chain).
     *
     * <p>Only the first entry of {@code quotedMessages} is used as a starting
     * point; later quoted messages are discovered among its following siblings.
     *
     * @param quotedMessages the selected quoted-message elements; may be empty,
     *                       in which case nothing happens
     */
    private static void handleQuotedMessages(Elements quotedMessages) {
        Element currentQuotedMessage = quotedMessages.first();
        if (currentQuotedMessage == null) {
            // Empty selection: first() yields null, so there is nothing to post.
            return;
        }

        List<Node> nodesBetween = new ArrayList<Node>();
        // Follow nextSibling() instead of indexing siblingNodes(): that list
        // leaves out the node itself, so starting at index 1 dropped the first
        // following sibling, and it also contained nodes located before the
        // first quoted message.
        for (Node sibling = currentQuotedMessage.nextSibling();
                sibling != null;
                sibling = sibling.nextSibling()) {
            if (isQuotedMessage(sibling)) {
                createQuotePost(currentQuotedMessage, nodesBetween);
                currentQuotedMessage = (Element) sibling;
                // Fresh list per post so an already delivered list is never mutated.
                nodesBetween = new ArrayList<Node>();
            } else {
                nodesBetween.add(sibling);
            }
        }
        // Always post the last quoted message, even when nothing follows it,
        // matching how quotes with an empty response are handled mid-list.
        createQuotePost(currentQuotedMessage, nodesBetween);
    }

    /**
     * Tells whether {@code node} is a {@code div} element carrying the
     * {@code quoted-message} class.
     *
     * @param node the node to inspect
     * @return {@code true} only for a matching element; {@code false} for any
     *         other element and for non-element nodes
     */
    private static boolean isQuotedMessage(Node node) {
        if (!(node instanceof Element)) {
            return false;
        }
        Element el = (Element) node;
        return "div".equals(el.tagName()) && el.hasClass("quoted-message");
    }

    /**
     * Picks the elements with the given tag name out of a node list.
     *
     * <p>Only the nodes in the list itself are examined, not their descendants.
     *
     * @param tagName the tag name to keep
     * @param nodes   the nodes to scan
     * @return a new list holding the matching elements in their original order
     */
    private static List<Element> filterElements(String tagName, List<Node> nodes) {
        List<Element> els = new ArrayList<Element>();
        for (Node n : nodes) {
            if (n instanceof Element) {
                Element el = (Element) n;
                if (el.tagName().equals(tagName)) {
                    els.add(el);
                }
            }
        }
        return els;
    }

    /**
     * Prints a quoted message together with the nodes that follow it.
     *
     * @param quote           the quoted-message element
     * @param elementsBetween the nodes between this quote and the next one;
     *                        may be empty
     */
    private static void createQuotePost(Element quote, List<Node> elementsBetween) {
        System.out.println("createQuotePost: " + quote);
        System.out.println("createQuotePost: " + elementsBetween);
        List<Element> imgs = filterElements("img", elementsBetween);
        // handle imgs
    }
}

I think one of the problems is that you're asking for Elements and not Nodes. Text nodes are Nodes and not Elements.

Try this:

package grimbo.test;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

/**
 * Demo: pairs each {@code div.quoted-message} with the nodes (text and
 * elements alike) that follow it, up to the next quoted message.
 *
 * <p>Working on {@link Node}s rather than {@link Element}s is what keeps the
 * plain-text responses, which are text nodes, in the result.
 */
public class StackOverflow {
    /** Parses a small sample document and processes its quoted messages. */
    public static void main(String[] args) {
        String html = "<div class=quoted-message>message-1</div>\n    <br />\n    <br />\n    Hello quoted message\n    <br />\n    I am a response\n    <br />\n    <br />\n";
        html += "<div class=quoted-message>message-2</div>\n    <br />\n    <br />\n    Hello quoted message\n    <br />\n    I am a response\n    <br />\n    <br />\n";
        Document doc = Jsoup.parse(html);
        handleQuotedMessages(doc.select(".quoted-message"));
    }

    /**
     * Walks forward from the first quoted message and calls
     * {@link #createQuotePost} once per quoted message, passing the nodes found
     * between it and the next quoted message (or the end of the sibling chain).
     *
     * <p>Only the first entry of {@code quotedMessages} is used as a starting
     * point; later quoted messages are discovered among its following siblings.
     *
     * @param quotedMessages the selected quoted-message elements; may be empty,
     *                       in which case nothing happens
     */
    private static void handleQuotedMessages(Elements quotedMessages) {
        Element currentQuotedMessage = quotedMessages.first();
        if (currentQuotedMessage == null) {
            // Empty selection: first() yields null, so there is nothing to post.
            return;
        }

        List<Node> nodesBetween = new ArrayList<Node>();
        // Follow nextSibling() instead of indexing siblingNodes(): that list
        // leaves out the node itself, so starting at index 1 dropped the first
        // following sibling, and it also contained nodes located before the
        // first quoted message.
        for (Node sibling = currentQuotedMessage.nextSibling();
                sibling != null;
                sibling = sibling.nextSibling()) {
            if (isQuotedMessage(sibling)) {
                createQuotePost(currentQuotedMessage, nodesBetween);
                currentQuotedMessage = (Element) sibling;
                // Fresh list per post so an already delivered list is never mutated.
                nodesBetween = new ArrayList<Node>();
            } else {
                nodesBetween.add(sibling);
            }
        }
        // Always post the last quoted message, even when nothing follows it,
        // matching how quotes with an empty response are handled mid-list.
        createQuotePost(currentQuotedMessage, nodesBetween);
    }

    /**
     * Tells whether {@code node} is a {@code div} element carrying the
     * {@code quoted-message} class.
     *
     * @param node the node to inspect
     * @return {@code true} only for a matching element; {@code false} for any
     *         other element and for non-element nodes
     */
    private static boolean isQuotedMessage(Node node) {
        if (!(node instanceof Element)) {
            return false;
        }
        Element el = (Element) node;
        return "div".equals(el.tagName()) && el.hasClass("quoted-message");
    }

    /**
     * Picks the elements with the given tag name out of a node list.
     *
     * <p>Only the nodes in the list itself are examined, not their descendants.
     *
     * @param tagName the tag name to keep
     * @param nodes   the nodes to scan
     * @return a new list holding the matching elements in their original order
     */
    private static List<Element> filterElements(String tagName, List<Node> nodes) {
        List<Element> els = new ArrayList<Element>();
        for (Node n : nodes) {
            if (n instanceof Element) {
                Element el = (Element) n;
                if (el.tagName().equals(tagName)) {
                    els.add(el);
                }
            }
        }
        return els;
    }

    /**
     * Prints a quoted message together with the nodes that follow it.
     *
     * @param quote           the quoted-message element
     * @param elementsBetween the nodes between this quote and the next one;
     *                        may be empty
     */
    private static void createQuotePost(Element quote, List<Node> elementsBetween) {
        System.out.println("createQuotePost: " + quote);
        System.out.println("createQuotePost: " + elementsBetween);
        List<Element> imgs = filterElements("img", elementsBetween);
        // handle imgs
    }
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文