Proximity search with XML fields in Lucene



I have a corpus of documents that look like this:

<doc>
text sample text <x>text</x> words lipsum words words <x>text</x> some other text
</doc>

I would like to be able to search for phrases (in "") that occur within a certain number of tokens from an annotation. How can I index and search like this?


Comments (1)

秋心╮凉 2024-12-16 07:45:15


You could use a custom analyzer to parse your XML stream. I hacked one together that splits on whitespace, '>' and '/', so that an opening tag is identified by its leading '<' (e.g. <x> becomes the single token '<x'):

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

// Targets the Lucene 3.x API (tokenStream/reusableTokenStream, RAMDirectory).
public class SpanQueryTests {
    private IndexSearcher searcher;
    private IndexReader reader;
    private Analyzer analyzer;

    // Splits on whitespace plus '>' and '/', so an opening tag like <x>
    // survives as the single indexed token "<x".
    static class XMLTokenizer extends CharTokenizer {
        public XMLTokenizer(Reader input) {
            super(input);
        }

        final static Set<Character> chars = ImmutableSet.of('/', '>');

        @Override
        protected boolean isTokenChar(char c) {
            return !(Character.isWhitespace(c) || chars.contains(c));
        }
    }

    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                return new XMLTokenizer(reader);
            }

            @Override
            public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
                Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
                if (tokenizer == null) {
                    tokenizer = new XMLTokenizer(reader);
                    setPreviousTokenStream(tokenizer);
                } else {
                    tokenizer.reset(reader);
                }
                return tokenizer;
            }
        };
        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        ImmutableList<String> docs = ImmutableList.of("<doc>text sample text <x>test</x> words lipsum words words " +
                                                              "<x>text</x> some other text </doc>",
                                                             "<foobar>test</foobar> some more text flop");
        int id = 0;
        for (String content : docs) {
            Document doc = new Document();
            doc.add(new Field("id", String.valueOf(id++), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();

        searcher = new IndexSearcher(dir);
        reader = searcher.getIndexReader();
    }

    @After
    public void tearDown() throws Exception {
        searcher.close();
    }

    @Test
    public void testTermNearQuery() throws Exception {
        // dumpSpans is the span-printing helper from Lucene in Action: it
        // re-renders each matching document with the span wrapped in <...>.
        SpanTermQuery tq1 = new SpanTermQuery(new Term("content", "lipsum"));
        dumpSpans(tq1);
        SpanTermQuery tq2 = new SpanTermQuery(new Term("content", "other"));
        dumpSpans(tq2);
        // The annotation itself is just another token ("<x"), so it can take
        // part in span queries like any ordinary word.
        SpanTermQuery tq3 = new SpanTermQuery(new Term("content", "<x"));
        dumpSpans(tq3);
        // "lipsum" within 2 intervening tokens of "<x", in either order.
        SpanNearQuery snq1 = new SpanNearQuery(new SpanQuery[] { tq1, tq3 }, 2, false);
        dumpSpans(snq1);
        SpanNearQuery snq2 = new SpanNearQuery(new SpanQuery[] { tq2, tq3 }, 2, false);
        dumpSpans(snq2);
    }
}

The results are:

query content:lipsum
   <doc text sample text <x test< x words <lipsum> words words <x text< x some other text < doc (0.15467961)

query content:other
   <doc text sample text <x test< x words lipsum words words <x text< x some <other> text < doc (0.15467961)

query content:<x
   <doc text sample text <<x> test< x words lipsum words words <x text< x some other text < doc (0.21875)
   <doc text sample text <x test< x words lipsum words words <<x> text< x some other text < doc (0.21875)

query spanNear([content:lipsum, content:<x], 2, false)
   <doc text sample text <x test< x words <lipsum words words <x> text< x some other text < doc (0.19565594)

query spanNear([content:other, content:<x], 2, false)
    NO spans
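The last query finds no spans because 'other' sits three positions past the nearest '<x' token, which exceeds the slop of 2. The question asks about quoted phrases rather than single terms; span queries compose, so a phrase can be written as an in-order SpanNearQuery with slop 0 and then nested inside the proximity query. A minimal sketch along those lines (the terms are only examples, and it reuses the dumpSpans helper from above):

// Hypothetical example: find the phrase "words lipsum" within 3 tokens
// of an <x> annotation, in either order.
SpanQuery phrase = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term("content", "words")),
        new SpanTermQuery(new Term("content", "lipsum"))
}, 0, true);  // slop 0 + in-order = an exact phrase
SpanQuery annotation = new SpanTermQuery(new Term("content", "<x"));
SpanNearQuery phraseNearAnnotation =
        new SpanNearQuery(new SpanQuery[] { phrase, annotation }, 3, false);
dumpSpans(phraseNearAnnotation);

The same nesting works with SpanFirstQuery, SpanOrQuery, and the rest of the spans family, so arbitrarily shaped "phrase near annotation" constraints can be built up from these pieces.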