Lucene, Highlighting, and NullPointerException

Posted on 2024-10-20 05:23:43

I am trying to highlight some results. I index the body (the text) of my documents in the field "contents", and when I try to highlight it using highlighter.getBestFragment(...) I get a NullPointerException.

However, when I try to highlight the file name, for example, it works properly. I suspect it is because that is the only field I feed from a FileReader (or ParsingReader), so the text is only tokenized, which is different from a file name.

Here is my code; please help me.

package xxxxxx;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParsingReader;

public class Indexer {

    static long start = 0;

    public static void main(String[] args) throws Exception {
        System.out.println("l'index se trouve à " + args[0]);
        System.out.println("le dossier ou s'effectue l'indexation est :" + args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
                    + " <index dir> <data dir>");
        }

        String indexDir = args[0];
        String dataDir = args[1];


        start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());


        } finally {

            indexer.close();
        }

        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }
    private IndexWriter writer;

    public Indexer(String indexDir) throws IOException, InterruptedException {
        Directory dir = FSDirectory.open(new File(indexDir));

        writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
    }

    public void close() throws IOException {
        writer.optimize();
        writer.close();
    }

    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();

        for (File f : files) {

            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) {

                if (!(f.getCanonicalPath().endsWith("~"))) {
                    indexFile(f);
                }
            } else {
                index(f.toString(), filter);
            }
        }
        return writer.numDocs();
    }

    private static class TextFilesFilter implements FileFilter {

        public boolean accept(File path) {
            return true;
        }
    }

    protected Document getDocument(File f) throws Exception {
       // FileReader frf = new FileReader(f);
        Document doc = new Document();
        Reader reader = new ParsingReader(f);

        doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));
        doc.add(new Field("fullpath", f.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        return doc;
    }

    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
        System.out.println(System.currentTimeMillis() - start);
    }
}

-------------------------------------------------------------------



package xxxxxxxxxxxxxxxxxxxx;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException, InvalidTokenOffsetsException {
        System.out.println("endroit ou se situe l'index " + args[0]);
        System.out.println(args[1]);
        if (args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Searcher.class.getName()
                    + " <index dir> <query>");
        }

        String indexDir = args[0];
        String q = args[1];
        search(indexDir, q);
    }


    public static void search(String indexDir, String q) throws IOException, ParseException, InvalidTokenOffsetsException {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexSearcher indexSearcher = new IndexSearcher(dir);
        QueryParser parserC = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30));
  //      QueryParser parserN = new QueryParser(Version.LUCENE_30, "filename", new StandardAnalyzer(Version.LUCENE_30));
        QueryParser parserP = new QueryParser(Version.LUCENE_30, "fullpath", new StandardAnalyzer(Version.LUCENE_30));
        parserC.setDefaultOperator(QueryParser.Operator.OR);
    //    parserN.setDefaultOperator(QueryParser.Operator.OR);
        parserC.setPhraseSlop(10);
      //  parserN.setPhraseSlop(10);
        DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(6);

        Query query = new MultiFieldQueryParser(Version.LUCENE_30, new String[]{"contents", "filename"},
                new CustomAnalyzer()).parse(q);

        Query queryC = parserC.parse(q);
        //Query queryN = parserN.parse(q);
        dmq.add(queryC);
        //dmq.add(queryN);
        //     dmq.add(query)      ;
        QueryScorer scorer = new QueryScorer(dmq, "contents");
        Highlighter highlighter = new Highlighter(scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));


        System.out.println(query.toString());
        long start = System.currentTimeMillis();
        TopDocs hits = indexSearcher.search(dmq, 15);
        System.out.println(hits.totalHits);
        long end = System.currentTimeMillis();
        System.err.println("Found " + hits.totalHits
                + " document(s) (in " + (end - start)
                + " milliseconds) that matched query '"
                + q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {

            Document doc = indexSearcher.doc(scoreDoc.doc);
            System.out.print(scoreDoc.score);
            System.out.println(doc.get("fullpath"));

            String contents = doc.get("contents"); // I am pretty sure the mistake is here, contents is always null
            // But what can I do to make this thing work?
            TokenStream stream =
                    TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(),
                    scoreDoc.doc,
                    "contents",
                    doc,
                    new StandardAnalyzer(Version.LUCENE_30));
            String fragment =
                    highlighter.getBestFragment(stream, contents);
            System.out.println(fragment);
        }
        indexSearcher.close();
    }
}

----------------------------------------------------------------------



Comments (1)

咽泪装欢 2024-10-27 05:23:43

The field needs to be stored if you want to use the highlighter this way. "filename" is stored but "contents" isn't, which is why you see them behaving differently:

    doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED ));