From dbee92cf5c762535ddf5cb04ae212a33672dc055 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 25 Sep 2023 08:45:34 +0200 Subject: [PATCH] Recommend scoring hits with BM25(k1=0.9,b=0.4). Currently different engines use different parameters for BM25, e.g. Tantivy and Lucene use (k1=1.2,b=0.75) while PISA uses (k1=0.9,b=0.4). Robertson et al. had initially suggested that 1.2/0.75 would make good defaults for BM25 but Trotman et al. later suggested that 0.9/0.4 would make better defaults and this seems to be the consensus nowadays. The ranking function matters because it affects which hits may be skipped via dynamic pruning, which in turn affects search performance. Closes #45 --- CONTRIBUTE.md | 15 +++++++++++++-- README.md | 9 ++++----- engines/lucene-7.2.1/src/main/java/DoQuery.java | 2 ++ engines/lucene-8.0.0/src/main/java/DoQuery.java | 2 ++ engines/lucene-8.10.1/src/main/java/DoQuery.java | 2 ++ engines/lucene-9.6.0/src/main/java/DoQuery.java | 2 ++ 6 files changed, 25 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md index e3162128fc2..470ea96c75a 100644 --- a/CONTRIBUTE.md +++ b/CONTRIBUTE.md @@ -1,6 +1,6 @@ # Adding another engine -Currently only tantivy and lucene are supported, but you can add another search +Currently only Tantivy and Lucene are supported, but you can add another search engine by creating a directory in the engines directory and add a `Makefile` implementing the following commands : @@ -20,7 +20,7 @@ Stemming should be disabled. Tokenization should be something reasonably close t Starts a program that will get `tests` from stdin, and output a result hit count as fast as possible. *If this is not your language's default, -be sure to flush stdout after writing your answer". +be sure to flush stdout after writing your answer*. The tests consist in a command followed by a query. @@ -39,6 +39,17 @@ Queries are expressed in the Lucene query language. If a command is not supported, just print to stdout "UNSUPPORTED". 
+# Recommendations for new engines + +Engines are recommended to follow the below guidelines: + - Indexing is not measured and may be multi-threaded. + - Engines may optimize for read-only access, e.g. by merging multiple segments + down to a single one or performing document reordering. + - Search operations must run in a single thread. + - Hits must be ranked according to the + [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) ranking function with + standard parameters `k1`=0.9 and `b`=0.4. + - Result caches must be disabled. # Adding tests diff --git a/README.md b/README.md index 7be4ca61e40..1efcef09f0c 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ The corpus used is the English wikipedia. Stemming is disabled. Queries have bee from the [AOL query dataset](https://en.wikipedia.org/wiki/AOL_search_data_leak) (but do not contain any personal information). -Out of a random sample of query, we filtered queries that had at least two terms and yield at least 1 hit when searches as -a phrase query. +Out of a random sample of query, we filtered queries that had at least two terms and yield at least 1 hit when searched +as a phrase query. For each of these query, we then run them as : - `intersection` @@ -49,15 +49,14 @@ All tests are run once in order to make sure that - Java's JIT already kicked in. Test are run in a single thread. -Out of 5 runs, we only retain the best score, so Garbage Collection likely does not matter. - +Out of 10 runs, we only retain the best score, so Garbage Collection likely does not matter. ## Engine specific detail ### Lucene - Query cache is disabled. -- GC should not influence the results as we pick the best out of 5 runs. +- GC should not influence the results as we pick the best out of 10 runs. 
- JVM used was openjdk 10.0.1 2018-04-17 ### Tantivy diff --git a/engines/lucene-7.2.1/src/main/java/DoQuery.java b/engines/lucene-7.2.1/src/main/java/DoQuery.java index 2840a98508f..d7cf661d8e1 100644 --- a/engines/lucene-7.2.1/src/main/java/DoQuery.java +++ b/engines/lucene-7.2.1/src/main/java/DoQuery.java @@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import java.io.BufferedReader; @@ -19,6 +20,7 @@ public static void main(String[] args) throws IOException, ParseException { try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) { final IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); + searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in))) { final QueryParser queryParser = new QueryParser("text", new StandardAnalyzer(CharArraySet.EMPTY_SET)); String line; diff --git a/engines/lucene-8.0.0/src/main/java/DoQuery.java b/engines/lucene-8.0.0/src/main/java/DoQuery.java index 3af2935a9f8..f862d3d6b7a 100644 --- a/engines/lucene-8.0.0/src/main/java/DoQuery.java +++ b/engines/lucene-8.0.0/src/main/java/DoQuery.java @@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import java.io.BufferedReader; @@ -19,6 +20,7 @@ public static void main(String[] args) throws IOException, ParseException { try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) { final IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); + searcher.setSimilarity(new 
BM25Similarity(0.9f, 0.4f)); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in))) { final QueryParser queryParser = new QueryParser("text", new StandardAnalyzer(CharArraySet.EMPTY_SET)); String line; diff --git a/engines/lucene-8.10.1/src/main/java/DoQuery.java b/engines/lucene-8.10.1/src/main/java/DoQuery.java index af90df892d4..08c6a23f288 100644 --- a/engines/lucene-8.10.1/src/main/java/DoQuery.java +++ b/engines/lucene-8.10.1/src/main/java/DoQuery.java @@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import java.io.BufferedReader; @@ -19,6 +20,7 @@ public static void main(String[] args) throws IOException, ParseException { try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) { final IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); + searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in))) { final QueryParser queryParser = new QueryParser("text", new StandardAnalyzer(CharArraySet.EMPTY_SET)); String line; diff --git a/engines/lucene-9.6.0/src/main/java/DoQuery.java b/engines/lucene-9.6.0/src/main/java/DoQuery.java index af90df892d4..08c6a23f288 100644 --- a/engines/lucene-9.6.0/src/main/java/DoQuery.java +++ b/engines/lucene-9.6.0/src/main/java/DoQuery.java @@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; +import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; import java.io.BufferedReader; @@ -19,6 +20,7 @@ public static void main(String[] args) throws IOException, ParseException { try 
(IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) { final IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); + searcher.setSimilarity(new BM25Similarity(0.9f, 0.4f)); try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in))) { final QueryParser queryParser = new QueryParser("text", new StandardAnalyzer(CharArraySet.EMPTY_SET)); String line;