From 04bf2a4bc79570d19986967175615a48663bf162 Mon Sep 17 00:00:00 2001
From: zhichao-aws <zhichaog@amazon.com>
Date: Wed, 22 Nov 2023 18:02:41 +0800
Subject: [PATCH] Deprecate max_token_score field of neural_sparse query (#478)

* rm bounded linear feature query

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* deprecate max_token_score

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* add changelog

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* tidy

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* fix ut

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* add ut

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

* add deprecation annotation

Signed-off-by: zhichao-aws <zhichaog@amazon.com>

---------

Signed-off-by: zhichao-aws <zhichaog@amazon.com>
---
 CHANGELOG.md                                  |   1 +
 .../lucene/BoundedLinearFeatureQuery.java     | 237 ------------------
 .../query/NeuralSparseQueryBuilder.java       |  18 +-
 .../query/NeuralSparseQueryBuilderTests.java  |  65 +++--
 .../query/NeuralSparseQueryIT.java            |  11 +-
 5 files changed, 52 insertions(+), 280 deletions(-)
 delete mode 100644 src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10063185c..438c4d65a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,3 +21,4 @@ Fixed exception for case when Hybrid query being wrapped into bool query ([#490]
 ### Documentation
 ### Maintenance
 ### Refactoring
+Deprecate the `max_token_score` field of the `neural_sparse` query clause ([#478](https://github.com/opensearch-project/neural-search/pull/478))
diff --git a/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java b/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java
deleted file mode 100644
index a914f3156..000000000
--- a/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * SPDX-License-Identifier: Apache-2.0
- *
- * The OpenSearch Contributors require contributions made to
- * this file be licensed under the Apache-2.0 license or a
- * compatible open source license.
- */
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Modifications Copyright OpenSearch Contributors. See
- * GitHub history for details.
- */
-
-/*
- * This class is built based on lucene FeatureQuery. We use LinearFuntion to
- * build the query and add an upperbound to it.
- */
-
-package org.apache.lucene;
-
-import java.io.IOException;
-import java.util.Objects;
-
-import org.apache.lucene.index.ImpactsEnum;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.ImpactsDISI;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.MaxScoreCache;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.QueryVisitor;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.Weight;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
-import org.apache.lucene.util.BytesRef;
-
-/**
- * The feature queries of input tokens are wrapped by lucene BooleanQuery, which use WAND algorithm
- * to accelerate the execution. The WAND algorithm leverage the score upper bound of sub-queries to
- * skip non-competitive tokens. However, origin lucene FeatureQuery use Float.MAX_VALUE as the score
- * upper bound, and this invalidates WAND.
- *
- * To mitigate this issue, we rewrite the FeatureQuery to BoundedLinearFeatureQuery. The caller can
- * set the token score upperbound of this query. And according to our use case, we use LinearFunction
- * as the score function.
- *
- * This class combines both <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureQuery.java">FeatureQuery</a>
- * and <a href="https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/document/FeatureField.java">FeatureField</a> together
- * and will be deprecated after OpenSearch upgraded lucene to version 9.8.
- */
-
-public final class BoundedLinearFeatureQuery extends Query {
-
-    private final String fieldName;
-    private final String featureName;
-    private final Float scoreUpperBound;
-
-    public BoundedLinearFeatureQuery(String fieldName, String featureName, Float scoreUpperBound) {
-        this.fieldName = Objects.requireNonNull(fieldName);
-        this.featureName = Objects.requireNonNull(featureName);
-        this.scoreUpperBound = Objects.requireNonNull(scoreUpperBound);
-    }
-
-    @Override
-    public Query rewrite(IndexSearcher indexSearcher) throws IOException {
-        // LinearFunction return same object for rewrite
-        return super.rewrite(indexSearcher);
-    }
-
-    @Override
-    public boolean equals(Object obj) {
-        if (obj == null || getClass() != obj.getClass()) {
-            return false;
-        }
-        BoundedLinearFeatureQuery that = (BoundedLinearFeatureQuery) obj;
-        return Objects.equals(fieldName, that.fieldName)
-            && Objects.equals(featureName, that.featureName)
-            && Objects.equals(scoreUpperBound, that.scoreUpperBound);
-    }
-
-    @Override
-    public int hashCode() {
-        int h = getClass().hashCode();
-        h = 31 * h + fieldName.hashCode();
-        h = 31 * h + featureName.hashCode();
-        h = 31 * h + scoreUpperBound.hashCode();
-        return h;
-    }
-
-    @Override
-    public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-        if (!scoreMode.needsScores()) {
-            // We don't need scores (e.g. for faceting), and since features are stored as terms,
-            // allow TermQuery to optimize in this case
-            TermQuery tq = new TermQuery(new Term(fieldName, featureName));
-            return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost);
-        }
-
-        return new Weight(this) {
-
-            @Override
-            public boolean isCacheable(LeafReaderContext ctx) {
-                return false;
-            }
-
-            @Override
-            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
-                String desc = "weight(" + getQuery() + " in " + doc + ") [\" BoundedLinearFeatureQuery \"]";
-
-                Terms terms = context.reader().terms(fieldName);
-                if (terms == null) {
-                    return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist.");
-                }
-                TermsEnum termsEnum = terms.iterator();
-                if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
-                    return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist.");
-                }
-
-                PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
-                if (postings.advance(doc) != doc) {
-                    return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set.");
-                }
-
-                int freq = postings.freq();
-                float featureValue = decodeFeatureValue(freq);
-                float score = boost * featureValue;
-                return Explanation.match(
-                    score,
-                    "Linear function on the " + fieldName + " field for the " + featureName + " feature, computed as w * S from:",
-                    Explanation.match(boost, "w, weight of this function"),
-                    Explanation.match(featureValue, "S, feature value")
-                );
-            }
-
-            @Override
-            public Scorer scorer(LeafReaderContext context) throws IOException {
-                Terms terms = Terms.getTerms(context.reader(), fieldName);
-                TermsEnum termsEnum = terms.iterator();
-                if (termsEnum.seekExact(new BytesRef(featureName)) == false) {
-                    return null;
-                }
-
-                final SimScorer scorer = new SimScorer() {
-                    @Override
-                    public float score(float freq, long norm) {
-                        return boost * decodeFeatureValue(freq);
-                    }
-                };
-                final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS);
-                MaxScoreCache maxScoreCache = new MaxScoreCache(impacts, scorer);
-                final ImpactsDISI impactsDisi = new ImpactsDISI(impacts, maxScoreCache);
-
-                return new Scorer(this) {
-
-                    @Override
-                    public int docID() {
-                        return impacts.docID();
-                    }
-
-                    @Override
-                    public float score() throws IOException {
-                        return scorer.score(impacts.freq(), 1L);
-                    }
-
-                    @Override
-                    public DocIdSetIterator iterator() {
-                        return impactsDisi;
-                    }
-
-                    @Override
-                    public int advanceShallow(int target) throws IOException {
-                        return impactsDisi.getMaxScoreCache().advanceShallow(target);
-                    }
-
-                    @Override
-                    public float getMaxScore(int upTo) throws IOException {
-                        return impactsDisi.getMaxScoreCache().getMaxScore(upTo);
-                    }
-
-                    @Override
-                    public void setMinCompetitiveScore(float minScore) {
-                        impactsDisi.setMinCompetitiveScore(minScore);
-                    }
-                };
-            }
-        };
-    }
-
-    @Override
-    public void visit(QueryVisitor visitor) {
-        if (visitor.acceptField(fieldName)) {
-            visitor.visitLeaf(this);
-        }
-    }
-
-    @Override
-    public String toString(String field) {
-        return "BoundedLinearFeatureQuery(field=" + fieldName + ", feature=" + featureName + ", scoreUpperBound=" + scoreUpperBound + ")";
-    }
-
-    // the field and decodeFeatureValue are modified from FeatureField.decodeFeatureValue
-    static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
-
-    // Rewriting this function to make scoreUpperBound work.
-    private float decodeFeatureValue(float freq) {
-        if (freq > MAX_FREQ) {
-            return scoreUpperBound;
-        }
-        int tf = (int) freq; // lossless
-        int featureBits = tf << 15;
-        return Math.min(Float.intBitsToFloat(featureBits), scoreUpperBound);
-    }
-}
diff --git a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java
index d883af23d..20eeb2e11 100644
--- a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java
+++ b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java
@@ -21,10 +21,9 @@
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.builder.EqualsBuilder;
 import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.lucene.BoundedLinearFeatureQuery;
+import org.apache.lucene.document.FeatureField;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.Query;
 import org.opensearch.common.SetOnce;
 import org.opensearch.core.ParseField;
@@ -62,8 +61,11 @@ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder<NeuralSparseQ
     static final ParseField QUERY_TEXT_FIELD = new ParseField("query_text");
     @VisibleForTesting
     static final ParseField MODEL_ID_FIELD = new ParseField("model_id");
+    // The max_token_score field helped the WAND scorer prune query clauses in Lucene 9.7. Lucene 9.8 changed the
+    // internal logic, so this field is no longer needed.
     @VisibleForTesting
-    static final ParseField MAX_TOKEN_SCORE_FIELD = new ParseField("max_token_score");
+    @Deprecated
+    static final ParseField MAX_TOKEN_SCORE_FIELD = new ParseField("max_token_score").withAllDeprecated();
 
     private static MLCommonsClientAccessor ML_CLIENT;
 
@@ -164,9 +166,6 @@ public static NeuralSparseQueryBuilder fromXContent(XContentParser parser) throw
             sparseEncodingQueryBuilder.modelId(),
             String.format(Locale.ROOT, "%s field must be provided for [%s] query", MODEL_ID_FIELD.getPreferredName(), NAME)
         );
-        if (sparseEncodingQueryBuilder.maxTokenScore != null && sparseEncodingQueryBuilder.maxTokenScore <= 0) {
-            throw new IllegalArgumentException(MAX_TOKEN_SCORE_FIELD.getPreferredName() + " must be larger than 0.");
-        }
 
         return sparseEncodingQueryBuilder;
     }
@@ -238,14 +237,9 @@ protected Query doToQuery(QueryShardContext context) throws IOException {
         Map<String, Float> queryTokens = queryTokensSupplier.get();
         validateQueryTokens(queryTokens);
 
-        final Float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE;
-
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
-            builder.add(
-                new BoostQuery(new BoundedLinearFeatureQuery(fieldName, entry.getKey(), scoreUpperBound), entry.getValue()),
-                BooleanClause.Occur.SHOULD
-            );
+            builder.add(FeatureField.newLinearQuery(fieldName, entry.getKey(), entry.getValue()), BooleanClause.Occur.SHOULD);
         }
         return builder.build();
     }
diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java
index a50ab4fb8..9d1a1627b 100644
--- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java
+++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java
@@ -26,6 +26,9 @@
 
 import lombok.SneakyThrows;
 
+import org.apache.lucene.document.FeatureField;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
 import org.opensearch.client.Client;
 import org.opensearch.common.SetOnce;
 import org.opensearch.common.io.stream.BytesStreamOutput;
@@ -38,9 +41,11 @@
 import org.opensearch.core.xcontent.ToXContent;
 import org.opensearch.core.xcontent.XContentBuilder;
 import org.opensearch.core.xcontent.XContentParser;
+import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.index.query.MatchAllQueryBuilder;
 import org.opensearch.index.query.QueryBuilder;
 import org.opensearch.index.query.QueryRewriteContext;
+import org.opensearch.index.query.QueryShardContext;
 import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor;
 import org.opensearch.test.OpenSearchTestCase;
 
@@ -88,7 +93,6 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() {
               "VECTOR_FIELD": {
                 "query_text": "string",
                 "model_id": "string",
-                "max_token_score": 123.0,
                 "boost": 10.0,
                 "_name": "something",
               }
@@ -99,7 +103,6 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() {
             .startObject(FIELD_NAME)
             .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT)
             .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID)
-            .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE)
             .field(BOOST_FIELD.getPreferredName(), BOOST)
             .field(NAME_FIELD.getPreferredName(), QUERY_NAME)
             .endObject()
@@ -112,22 +115,19 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() {
         assertEquals(FIELD_NAME, sparseEncodingQueryBuilder.fieldName());
         assertEquals(QUERY_TEXT, sparseEncodingQueryBuilder.queryText());
         assertEquals(MODEL_ID, sparseEncodingQueryBuilder.modelId());
-        assertEquals(MAX_TOKEN_SCORE, sparseEncodingQueryBuilder.maxTokenScore(), 0.0);
         assertEquals(BOOST, sparseEncodingQueryBuilder.boost(), 0.0);
         assertEquals(QUERY_NAME, sparseEncodingQueryBuilder.queryName());
     }
 
     @SneakyThrows
-    public void testFromXContent_whenBuildWithMultipleRootFields_thenFail() {
+    public void testFromXContent_whenBuiltWithMaxTokenScore_thenThrowWarning() {
         /*
           {
               "VECTOR_FIELD": {
                 "query_text": "string",
                 "model_id": "string",
-                "boost": 10.0,
-                "_name": "something",
-              },
-              "invalid": 10
+                "max_token_score": 123.0
+              }
           }
         */
         XContentBuilder xContentBuilder = XContentFactory.jsonBuilder()
@@ -135,46 +135,51 @@ public void testFromXContent_whenBuildWithMultipleRootFields_thenFail() {
             .startObject(FIELD_NAME)
             .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT)
             .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID)
-            .field(BOOST_FIELD.getPreferredName(), BOOST)
-            .field(NAME_FIELD.getPreferredName(), QUERY_NAME)
+            .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE)
             .endObject()
-            .field("invalid", 10)
             .endObject();
 
         XContentParser contentParser = createParser(xContentBuilder);
         contentParser.nextToken();
-        expectThrows(ParsingException.class, () -> NeuralSparseQueryBuilder.fromXContent(contentParser));
+        NeuralSparseQueryBuilder sparseEncodingQueryBuilder = NeuralSparseQueryBuilder.fromXContent(contentParser);
+        assertWarnings("Deprecated field [max_token_score] used, this field is unused and will be removed entirely");
     }
 
     @SneakyThrows
-    public void testFromXContent_whenBuildWithMissingQuery_thenFail() {
+    public void testFromXContent_whenBuildWithMultipleRootFields_thenFail() {
         /*
           {
               "VECTOR_FIELD": {
-                "model_id": "string"
-              }
+                "query_text": "string",
+                "model_id": "string",
+                "boost": 10.0,
+                "_name": "something",
+              },
+              "invalid": 10
           }
         */
         XContentBuilder xContentBuilder = XContentFactory.jsonBuilder()
             .startObject()
             .startObject(FIELD_NAME)
+            .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT)
             .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID)
+            .field(BOOST_FIELD.getPreferredName(), BOOST)
+            .field(NAME_FIELD.getPreferredName(), QUERY_NAME)
             .endObject()
+            .field("invalid", 10)
             .endObject();
 
         XContentParser contentParser = createParser(xContentBuilder);
         contentParser.nextToken();
-        expectThrows(IllegalArgumentException.class, () -> NeuralSparseQueryBuilder.fromXContent(contentParser));
+        expectThrows(ParsingException.class, () -> NeuralSparseQueryBuilder.fromXContent(contentParser));
     }
 
     @SneakyThrows
-    public void testFromXContent_whenBuildWithNegativeMaxTokenScore_thenFail() {
+    public void testFromXContent_whenBuildWithMissingQuery_thenFail() {
         /*
           {
               "VECTOR_FIELD": {
-                "query_text": "string",
-                "model_id": "string",
-                "max_token_score": -1
+                "model_id": "string"
               }
           }
         */
@@ -182,7 +187,6 @@ public void testFromXContent_whenBuildWithNegativeMaxTokenScore_thenFail() {
             .startObject()
             .startObject(FIELD_NAME)
             .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID)
-            .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), -1f)
             .endObject()
             .endObject();
 
@@ -498,4 +502,23 @@ public void testRewrite_whenQueryTokensSupplierSet_thenReturnSelf() {
         queryBuilder = sparseEncodingQueryBuilder.doRewrite(null);
         assertTrue(queryBuilder == sparseEncodingQueryBuilder);
     }
+
+    @SneakyThrows
+    public void testDoToQuery_successfulDoToQuery() {
+        NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder().fieldName(FIELD_NAME)
+            .maxTokenScore(MAX_TOKEN_SCORE)
+            .queryText(QUERY_TEXT)
+            .modelId(MODEL_ID)
+            .queryTokensSupplier(QUERY_TOKENS_SUPPLIER);
+        QueryShardContext mockedQueryShardContext = mock(QueryShardContext.class);
+        MappedFieldType mockedMappedFieldType = mock(MappedFieldType.class);
+        doAnswer(invocation -> "rank_features").when(mockedMappedFieldType).typeName();
+        doAnswer(invocation -> mockedMappedFieldType).when(mockedQueryShardContext).fieldMapper(any());
+        // max_token_score is set above but is deprecated, so the expected query holds plain linear feature clauses.
+        BooleanQuery.Builder targetQueryBuilder = new BooleanQuery.Builder();
+        targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f), BooleanClause.Occur.SHOULD);
+        targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f), BooleanClause.Occur.SHOULD);
+
+        assertEquals(targetQueryBuilder.build(), sparseEncodingQueryBuilder.doToQuery(mockedQueryShardContext));
+    }
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryIT.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryIT.java
index 672ab2940..12bd1c1cb 100644
--- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryIT.java
+++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryIT.java
@@ -106,16 +106,7 @@ public void testBasicQueryWithMaxTokenScore() {
         Map<String, Object> firstInnerHit = getFirstInnerHit(searchResponseAsMap);
 
         assertEquals("1", firstInnerHit.get("_id"));
-        Map<String, Float> queryTokens = runSparseModelInference(modelId, TEST_QUERY_TEXT);
-        float expectedScore = 0f;
-        for (Map.Entry<String, Float> entry : queryTokens.entrySet()) {
-            if (testRankFeaturesDoc.containsKey(entry.getKey())) {
-                expectedScore += entry.getValue() * Math.min(
-                    getFeatureFieldCompressedNumber(testRankFeaturesDoc.get(entry.getKey())),
-                    maxTokenScore
-                );
-            }
-        }
+        float expectedScore = computeExpectedScore(modelId, testRankFeaturesDoc, TEST_QUERY_TEXT);
         assertEquals(expectedScore, objectToFloat(firstInnerHit.get("_score")), DELTA);
     }