adammatusiak: Lucene large queries efficiency

Reading large data sets from Lucene can still be slow, even after adding a document cache, due to an algorithm that loads only 100 results and runs the query again each time the number of loaded results is smaller than required. The documentation advises using a HitCollector to speed things up, but it is hard to find an example. A big disadvantage of this approach is that sorting is not supported when a HitCollector is used.
Below you can find an example of a LuceneHitCollector that stores its items as LuceneHitCollectorItem objects and uses a LuceneSorter.
LuceneSorter is based on the internal Lucene sort algorithm and therefore uses its dictionaries, which are built when the directory is created. It implements only a subset of Lucene's capabilities:

sorting by rank (when empty string is passed to Sort method)
sorting by string field
reverse order sorting

The last code example shows how to use the HitCollector. It is as easy as using Lucene itself and, again, much faster.

/// <summary>
/// A single hit stored by <see cref="LuceneHitCollector"/>: the document
/// number, the score reported for it, and the document loaded from the reader.
/// </summary>
public class LuceneHitCollectorItem
{
    /// <summary>Document number that was passed to the collector for this hit.</summary>
    public int DocId;

    /// <summary>Score that was passed to the collector for this hit.</summary>
    public float Score;

    /// <summary>Document loaded from the index reader for <see cref="DocId"/>.</summary>
    public Document Doc;
}

/// <summary>
/// Hit collector that loads the document for every reported hit and keeps
/// all hits in an in-memory list, which can afterwards be sorted and read
/// back by position.
/// </summary>
public class LuceneHitCollector : HitCollector
{
    // Reader used both to load documents and to build the sorter.
    private readonly IndexReader reader;

    // Every hit reported through Collect, in the order it was reported
    // (until Sort is called).
    private List<LuceneHitCollectorItem> collected = new List<LuceneHitCollectorItem>();

    /// <summary>Creates a collector that reads documents from <paramref name="reader"/>.</summary>
    /// <param name="reader">Index reader the collected documents are loaded from.</param>
    public LuceneHitCollector(IndexReader reader)
    {
        this.reader = reader;
    }

    /// <summary>
    /// Loads the document for the reported hit and appends it, together
    /// with its number and score, to the collected list.
    /// </summary>
    /// <param name="doc">Document number of the hit.</param>
    /// <param name="score">Score of the hit.</param>
    public override void Collect(int doc, float score)
    {
        Document loaded = reader.Document(doc);

        LuceneHitCollectorItem hit = new LuceneHitCollectorItem();
        hit.DocId = doc;
        hit.Doc = loaded;
        hit.Score = score;

        collected.Add(hit);
    }

    /// <summary>
    /// Sorts the collected hits in place using a <see cref="LuceneSorter"/>
    /// built for the given field and direction.
    /// </summary>
    /// <param name="field">Field to sort by; an empty string sorts by score.</param>
    /// <param name="reverse">True to invert the sort order.</param>
    public void Sort(string field, bool reverse)
    {
        new LuceneSorter(reader, field, reverse).Sort(collected);
    }

    /// <summary>Number of hits collected so far.</summary>
    public int Count
    {
        get { return collected.Count; }
    }

    /// <summary>Returns the collected hit at position <paramref name="n"/>.</summary>
    /// <param name="n">Zero-based position in the collected list.</param>
    public LuceneHitCollectorItem Doc(int n)
    {
        LuceneHitCollectorItem hit = collected[n];
        return hit;
    }
}

/// <summary>
/// Sorts collected hits either by score or by the order of a string field
/// taken from the field cache's string index.
/// </summary>
public class LuceneSorter
{
    private readonly string field;
    private readonly bool reverse;

    // Only populated when a sort field was supplied; stays null for score sorting.
    private readonly StringIndex index;

    /// <summary>
    /// Creates a sorter. When <paramref name="field"/> is null or empty no
    /// string index is loaded and <see cref="Sort"/> orders by score.
    /// </summary>
    /// <param name="reader">Index reader the string index is obtained for.</param>
    /// <param name="field">Field to sort by, or null/empty to sort by score.</param>
    /// <param name="reverse">True to invert the resulting order.</param>
    public LuceneSorter(IndexReader reader, string field, bool reverse)
    {
        // Use the same null-or-empty test as Sort: a null field previously
        // reached String.Intern(null) and threw instead of sorting by score.
        if (!String.IsNullOrEmpty(field))
        {
            String fieldName = String.Intern(field);
            index = FieldCache_Fields.DEFAULT.GetStringIndex(reader, fieldName);
        }
        this.field = field;
        this.reverse = reverse;
    }

    /// <summary>
    /// Sorts <paramref name="docs"/> in place: by score when no field was
    /// given, otherwise by the field's order in the string index.
    /// </summary>
    /// <param name="docs">Hits to sort.</param>
    public void Sort(List<LuceneHitCollectorItem> docs)
    {
        if (String.IsNullOrEmpty(field))
        {
            docs.Sort(CompareScore);
        }
        else
        {
            docs.Sort(CompareField);
        }
    }

    // Compares two hits by their entries in index.order; equal entries are
    // ordered by document number. The whole result is negated when reversing.
    private int CompareField(LuceneHitCollectorItem a, LuceneHitCollectorItem b)
    {
        int result = index.order[a.DocId] - index.order[b.DocId];
        if (result == 0)
        {
            result = a.DocId - b.DocId;
        }
        return (reverse) ? -result : result;
    }

    // Compares two hits by score; equal scores are ordered by document number.
    // A higher score compares as greater, so without reverse the lowest score
    // comes first.
    // NOTE(review): callers wanting best matches first need reverse = true -
    // confirm this is the intended default direction.
    private int CompareScore(LuceneHitCollectorItem a, LuceneHitCollectorItem b)
    {
        int result = a.DocId - b.DocId;
        if (a.Score != b.Score)
        {
            result = (a.Score > b.Score) ? 1 : -1;
        }
        return (reverse) ? -result : result;
    }
}

// Run the query once, gathering every hit in the collector, then sort the
// gathered hits and walk them by position.
LuceneHitCollector collector = new LuceneHitCollector(reader);
searcher.Search(query, collector);
collector.Sort(field, reverse);
// The results live in the collector itself; the original loop referred to
// an undeclared "hits" variable.
for (int index = 0; index < collector.Count; index++)
{
    LuceneHitCollectorItem item = collector.Doc(index);
    Document doc = item.Doc;
    double rank = item.Score;
    //process document
}

adammatusiak

20100423

Lucene large queries efficiency

No comments:

Labels

About Me

LinkedIn

Blog Archive