How to set a minimum indexed term length in Lucene.Net

Asked 2025-01-03 20:28:17

How can I restrict Lucene.Net to index only terms whose length is greater than x? I am indexing the document as:

        String indexDirectory = @"C:\Users\user\Desktop\Index";
        String dataDirectory = @"C:\Users\user\Desktop\Data";

        StandardAnalyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDirectory, analyzer);

        Document doc = new Document();

        // "content" is indexed with term vectors so the indexed terms can be
        // read back later; "path" is indexed without them.
        Field fPath = new Field("path", dataDirectory, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO);
        Field fContent = new Field("content", ReadTextFile(dataDirectory), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);

        doc.Add(fPath);
        doc.Add(fContent);

        // The document still has to be added and the writer closed:
        writer.AddDocument(doc);
        writer.Close();

I am using the following code to read the indexed terms back from the Lucene index:

        // Term vectors are only stored for fields indexed with TermVector.YES.
        IndexReader reader = IndexReader.Open(indexDirectory);
        TermFreqVector[] vectors = reader.GetTermFreqVectors(0); // terms of document 0

        foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
        {
            String[] terms = vector.GetTerms();

            foreach (String term in terms)
            {
                // loop through indexed terms
            }
        }

        reader.Close();
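
For reference, the indexed terms can also be enumerated without term vectors through IndexReader.Terms(). This is a minimal sketch against the same classic Lucene.Net 2.x-era API used above; the field name "content" is the one from the snippet above:

        // Walk every term in the index, independent of term vectors.
        IndexReader reader = IndexReader.Open(indexDirectory);
        TermEnum termEnum = reader.Terms();

        while (termEnum.Next())
        {
            Term t = termEnum.Term();
            if (t.Field() == "content")
            {
                Console.WriteLine(t.Text()); // one line per indexed term
            }
        }

        termEnum.Close();
        reader.Close();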



Comments (1)

怪我闹别瞎闹 2025-01-10 20:28:17

You could implement your own Analyzer, or extend the StandardAnalyzer.

Example:

TokenFilter + Analyzer

using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Tokenattributes;

public class MinTermLengthTokenFilter : TokenFilter
{
    private int minTermLength;
    private TermAttribute termAtt;

    public MinTermLengthTokenFilter(int minTermLength, TokenStream input)
        : base(input)
    {
        this.minTermLength = minTermLength;
        termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    }

    public override bool IncrementToken()
    {
        // Consume tokens until one is long enough; returning false ends the stream.
        while (input.IncrementToken())
        {
            if (termAtt.TermLength() >= minTermLength)
            {
                return true;
            }
        }
        return false;
    }
}


public class MinTermLengthAnalyzer : StandardAnalyzer
{
    private int minTermLength;

    public MinTermLengthAnalyzer(int minTermLength)
        : base()
    {
        this.minTermLength = minTermLength;
    }

    // Wrap both token streams produced by StandardAnalyzer with the length filter.
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new MinTermLengthTokenFilter(minTermLength, base.TokenStream(fieldName, reader));
    }

    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        return new MinTermLengthTokenFilter(minTermLength, base.ReusableTokenStream(fieldName, reader));
    }
}
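
Note that classic Lucene.Net also ships a stock LengthFilter(TokenStream input, int min, int max) in the Lucene.Net.Analysis namespace, so you may not need a hand-written filter at all. A minimal sketch under that assumption; the class name LengthRestrictedAnalyzer is just illustrative:

// Same idea using the built-in LengthFilter: keep only terms whose
// length falls inside [minTermLength, int.MaxValue].
public class LengthRestrictedAnalyzer : StandardAnalyzer
{
    private int minTermLength;

    public LengthRestrictedAnalyzer(int minTermLength)
        : base()
    {
        this.minTermLength = minTermLength;
    }

    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new LengthFilter(base.TokenStream(fieldName, reader), minTermLength, int.MaxValue);
    }
}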

Indexing:

 FSDirectory dir = FSDirectory.GetDirectory("C:\\temp\\CFSTEST");
 // With a minimum length of 5, "some" and "text" are dropped by the
 // length filter ("for" is already removed as a stop word).
 IndexWriter writer = new IndexWriter(dir, new MinTermLengthAnalyzer(5));
 Document document = new Document();

 document.Add(new Field(
     "text",
     "some sample text for demonstration",
     Field.Store.YES,
     Field.Index.ANALYZED,
     Field.TermVector.WITH_POSITIONS_OFFSETS));
 writer.AddDocument(document);
 writer.Close();

Searching:

        var indexSearcher = new IndexSearcher(IndexReader.Open("C:\\temp\\CFSTEST"));

        var results = indexSearcher.Search(new TermQuery(new Term("text", "demonstration")), null, 25);

        foreach (var result in results.ScoreDocs)
        {
            TermFreqVector[] vectors = indexSearcher.GetIndexReader().GetTermFreqVectors(result.doc);

            foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
            {
                String[] terms = vector.GetTerms();

                foreach (String term in terms)
                {
                    Console.WriteLine(term);
                }

            }
        }

        indexSearcher.Close();
        // outputs:
        // demonstration
        // sample
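
As a quick sanity check (a hypothetical continuation that would have to run before the indexSearcher.Close() call above), a term that was filtered out at index time can no longer be matched:

        // "some" is only four characters long, so it was never indexed
        // and a TermQuery for it finds nothing.
        var none = indexSearcher.Search(new TermQuery(new Term("text", "some")), null, 25);
        Console.WriteLine(none.ScoreDocs.Length); // 0

Keep in mind that TermQuery bypasses analysis; if you build queries with QueryParser, pass it the same MinTermLengthAnalyzer so short terms are dropped consistently at query time as well.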
