WIP Prevent wildcard expressions for stopwords in simple expressions

- add stopwords to registry
- add stopwords_case_insensitive option
- support function for getting the stopwords
- cache to optimize the parsing of stopwords.txt

This transforms the (term* OR term) expression for stopwords, removing the wildcard expression. Such an expression would never match any documents, because Solr won't remove the wildcard term, but the stopword will be missing from the index. This workaround removes the wildcard term with no side effects, as stopwords would be ignored by Solr anyway. Both case-sensitive and case-insensitive stopword processing are supported; which one applies depends on the Solr schema, and the option must be set accordingly.
collective · Dec 28, 2023 · 097a3bf · 097a3bf
1 parent 7d46ec7
commit 097a3bf
Show file tree

Hide file tree

Showing 7 changed files with 362 additions and 20 deletions.
diff --git a/src/collective/solr/interfaces.py b/src/collective/solr/interfaces.py
@@ -11,7 +11,6 @@
 
 
 class ISolrSchema(Interface):
-
     active = Bool(
         title=_("label_active", default="Active"),
         description=_(
@@ -389,6 +388,41 @@ class ISolrSchema(Interface):
         required=False,
     )
 
+    stopwords = Text(
+        title=_("label_stopwords", default="Stopwords in the format of stopwords.txt"),
+        description=_(
+            "help_stopwords",
+            default="Copy the stopwords.txt file here. "
+            "Check Solr configuration to understand the format. - "
+            "Stopwords will not get (word OR word*) simple "
+            "expression, only (word). "
+            "Notes: "
+            "1. This will only work for multi word queries "
+            "when force_simple_expression=True. - "
+            "2. It's still necessary to filter stopwords from "
+            "Solr, this option only causes the "
+            "faulty (stopword*) parts removed from "
+            "the expression ",
+        ),
+        default="",
+        required=False,
+    )
+
+    stopwords_case_insensitive = Bool(
+        title=_(
+            "label_stopwords_case_insensitive", default="Stopwords are case insensitive"
+        ),
+        description=_(
+            "help_stopwords_are_case_insensitive",
+            default="Stopwords are case insensitive "
+            "This depends on your Solr setup. If your stopwords are processed in a case insensitive way, "
+            "this should be checked and it will apply the stopword wildcard removal in a case "
+            "insensitive way.",
+        ),
+        default=False,
+        required=False,
+    )
+
 
 class ISolrConnectionConfig(ISolrSchema):
     """utility to hold the connection configuration for the solr server"""

diff --git a/src/collective/solr/mangler.py b/src/collective/solr/mangler.py
@@ -17,6 +17,8 @@
     removeSpecialCharactersAndOperators,
     splitSimpleSearch,
 )
+from collective.solr.stopword import isStopWord
+
 
 ranges = {
     "min": "[%s TO *]",
@@ -82,7 +84,7 @@ def makeSimpleExpressions(term, levenstein_distance):
 
 def mangleSearchableText(value, config, force_complex_search=False):
     config = config or getConfig()
-    pattern = getattr(config, "search_pattern", u"")
+    pattern = getattr(config, "search_pattern", "")
     force_simple_search = getattr(config, "force_simple_search", False)
     allow_complex_search = getattr(config, "allow_complex_search", False)
     levenstein_distance = getattr(config, "levenshtein_distance", 0)
@@ -108,6 +110,10 @@ def mangleSearchableText(value, config, force_complex_search=False):
 
     for term in splitSimpleSearch(value):
         (term_value, term_base_value) = makeSimpleExpressions(term, levenstein_distance)
+        # If this is a stopword, we never allow a (term* OR term) pattern, just use (term).
+        # Otherwise stopwords won't ever be found, because a (stopword*) search will never succeed.
+        if isStopWord(term, config):
+            term_value = term_base_value
         value_parts.append(term_value)
         base_value_parts.append(term_base_value)
 
@@ -120,6 +126,7 @@ def mangleSearchableText(value, config, force_complex_search=False):
         return set([value])  # add literal query parameter
     if pattern:
         pattern = pattern.encode("utf-8")
+
     return value
 
 
@@ -325,4 +332,4 @@ def optimizeQueryParameters(query, params):
     elif fq:
         params["fq"] = fq
     if not query:
-        query["*"] = u"*:*"  # catch all if no regular query is left...
+        query["*"] = "*:*"  # catch all if no regular query is left...
diff --git a/src/collective/solr/profiles/default/registry.xml b/src/collective/solr/profiles/default/registry.xml
@@ -1,6 +1,8 @@
-<?xml version="1.0"?>
+<?xml version="1.0" encoding="utf-8"?>
 <registry>
-  <records interface="collective.solr.interfaces.ISolrSchema" prefix="collective.solr">
+  <records interface="collective.solr.interfaces.ISolrSchema"
+           prefix="collective.solr"
+  >
     <value key="active">False</value>
     <value key="host">127.0.0.1</value>
     <value key="port">8983</value>
@@ -10,7 +12,8 @@
     <value key="max_results">10000000</value>
     <value key="exclude_user">False
     </value>
-    <value key="search_pattern">+(Title:{value}^5 OR Description:{value}^2 OR SearchableText:{value} OR SearchableText:({base_value}) OR searchwords:({base_value})^1000) -showinsearch:False
+    <value key="search_pattern">+(Title:{value}^5 OR Description:{value}^2 OR SearchableText:{value}
+      OR SearchableText:({base_value}) OR searchwords:({base_value})^1000) -showinsearch:False
     </value>
     <value key="prefix_wildcard">False</value>
     <value key="force_simple_search">False</value>
@@ -25,10 +28,22 @@
     <value key="filter_queries">
       <element>portal_type</element>
     </value>
-    <value key="boost_script"></value>
-    <value key="solr_login"></value>
-    <value key="solr_password"></value>
+    <value key="boost_script" />
+    <value key="solr_login" />
+    <value key="solr_password" />
     <value key="use_tika">False</value>
     <value key="tika_default_field">content</value>
+    <!-- Stopwords will not get the (word OR word*) simple expression, only (word).
+         Notes:
+         1. This will only work for multi-word queries when force_simple_search=True.
+         2. It's still necessary to filter stopwords in Solr; this option only causes the
+            faulty (stopword*) parts to be removed from the expression.
+
+         This removal is case sensitive by default, but it can be made case insensitive by
+         setting stopwords_case_insensitive to True. Use this if you handle your stopwords
+         in a case-insensitive way; this depends on your Solr configuration.
+  -->
+    <value key="stopwords_case_insensitive">False</value>
+    <value key="stopwords" />
   </records>
-</registry>
+</registry>
diff --git a/src/collective/solr/stopword.py b/src/collective/solr/stopword.py
@@ -0,0 +1,45 @@
+import re
+
+from collective.solr.utils import getConfig
+
+reLine = re.compile(r"^([A-Za-zÀ-ÖØ-öø-ÿ]*)")
+
+raw = None
+raw_case_insensitive = None
+cooked = None
+
+
+def parseStopwords(stopwords, stopwords_case_insensitive):
+    return list(
+        map(
+            lambda word: word.lower() if stopwords_case_insensitive else word,
+            filter(
+                lambda word: word,
+                map(lambda line: reLine.match(line).group(1), stopwords.splitlines()),
+            ),
+        )
+    )
+
+
+def getStopWords(config):
+    global raw
+    global cooked
+    global raw_case_insensitive
+    config = config or getConfig()
+    stopwords = getattr(config, "stopwords", "") or ""
+    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
+    if (
+        cooked is None
+        or raw is not stopwords
+        or raw_case_insensitive != stopwords_case_insensitive
+    ):
+        raw = stopwords
+        raw_case_insensitive = stopwords_case_insensitive
+        cooked = parseStopwords(raw, stopwords_case_insensitive)
+    return cooked
+
+
+def isStopWord(term, config):
+    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
+    stopwords = getStopWords(config)
+    return (term.lower() if stopwords_case_insensitive else term) in stopwords
diff --git a/src/collective/solr/testing.py b/src/collective/solr/testing.py
@@ -84,17 +84,16 @@ def tearDown(self):
 
 
 class CollectiveSolrLayer(PloneSandboxLayer):
-
     defaultBases = (PLONE_FIXTURE,)
 
     def __init__(
         self,
         bases=None,
         name="Collective Solr Layer",
         module=None,
-        solr_host=u"localhost",
+        solr_host="localhost",
         solr_port=8983,
-        solr_base=u"/solr/plone",
+        solr_base="/solr/plone",
         solr_active=False,
     ):
         super(PloneSandboxLayer, self).__init__(bases, name, module)
@@ -136,7 +135,7 @@ def setUpPloneSite(self, portal):
     def tearDownPloneSite(self, portal):
         set_registry_record("collective.solr.active", False)
         set_registry_record("collective.solr.port", 8983)
-        set_registry_record("collective.solr.base", u"/solr/plone")
+        set_registry_record("collective.solr.base", "/solr/plone")
         self.solr_layer.tearDown()
 
 
@@ -180,7 +179,7 @@ def activateAndReindex(portal):
 class CollectiveSolrMockRegistry(object):
     def __init__(self):
         self.active = False
-        self.host = u"localhost"
+        self.host = "localhost"
         self.port = None
         self.base = None
         self.async_indexing = False
@@ -198,9 +197,11 @@ def __init__(self):
         self.exclude_user = False
         self.field_list = []
         self.atomic_updates = False
-        self.boost_script = u""
+        self.boost_script = ""
         self.solr_login = None
         self.solr_password = None
+        self.stopwords = ""
+        self.stopwords_case_insensitive = False
 
     def __getitem__(self, name):
         name_parts = name.split(".")
@@ -241,7 +242,7 @@ class CollectiveSolrMockRegistryLayer(Layer):
 
     def setUp(self):
         provideUtility(
-            provides=IRegistry, component=CollectiveSolrMockRegistry(), name=u""
+            provides=IRegistry, component=CollectiveSolrMockRegistry(), name=""
         )
 
     def tearDown(self):

diff --git a/src/collective/solr/tests/test_mangler.py b/src/collective/solr/tests/test_mangler.py
@@ -11,10 +11,12 @@
     mangleQuery,
     optimizeQueryParameters,
     subtractQueryParameters,
+    mangleSearchableText,
 )
 from collective.solr.parser import SolrField, SolrSchema
 from collective.solr.testing import COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE
 from collective.solr.utils import getConfig
+from unittest import mock
 
 
 def mangle(**keywords):
@@ -31,7 +33,6 @@ def __init__(self, query, range=None, operator=None, depth=None):
 
 
 class QueryManglerTests(TestCase):
-
     layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE
 
     def setUp(self):
@@ -282,7 +283,6 @@ def testMultiplePathQuery(self):
 
 
 class QueryParameterTests(TestCase):
-
     layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE
 
     def testSortIndex(self):
@@ -446,7 +446,7 @@ def optimize(**params):
         # now unconfigured...
         config = getConfig()
         self.assertEqual(optimize(), (dict(a="a:23", b="b:42", c="c:(23 42)"), dict()))
-        config.filter_queries = [u"a"]
+        config.filter_queries = ["a"]
         self.assertEqual(optimize(), (dict(b="b:42", c="c:(23 42)"), dict(fq=["a:23"])))
         self.assertEqual(
             optimize(fq="x:13"),
@@ -502,3 +502,111 @@ def testFilterFacetDependencies(self):
         self.assertEqual(params, {"facet.foo": "bar"})
         params = extract(dict(facet_foo=("foo:bar", "bar:foo")))
         self.assertEqual(params, {"facet.foo": ("foo", "bar")})
+
+
+@mock.patch("collective.solr.stopword.raw", None)
+@mock.patch("collective.solr.stopword.cooked", None)
+class QueryManglerStopwordsTests(TestCase):
+    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE
+
+    def setUp(self):
+        self.config = getConfig()
+        self.config.stopwords_case_insensitive = False
+        self.config.stopwords = """
+no
+can
+do
+"""
+        self.config.force_simple_search = True
+        provideUtility(self.config, ISolrConnectionConfig)
+
+    def tearDown(self):
+        gsm = getGlobalSiteManager()
+        gsm.unregisterUtility(self.config, ISolrConnectionConfig)
+
+    def testBase(self):
+        self.assertEqual(
+            mangleSearchableText("beautiful world", self.config),
+            "(beautiful* OR beautiful) (world* OR world)",
+        )
+
+    def testNoSimpleSearch(self):
+        self.config.force_simple_search = False
+        self.assertEqual(
+            mangleSearchableText("beautiful can world", self.config),
+            "(beautiful* OR beautiful) (can) (world* OR world)",
+        )
+
+    def testBaseSingleWord(self):
+        self.assertEqual(
+            mangleSearchableText("beautiful", self.config),
+            "(beautiful* OR beautiful)",
+        )
+
+    def testRemovesWildcardTermSingleWord(self):
+        self.assertEqual(
+            mangleSearchableText("can", self.config),
+            "(can)",
+        )
+
+    def testRemovesWildcardTermMultiWord(self):
+        self.assertEqual(
+            mangleSearchableText("beautiful can world", self.config),
+            "(beautiful* OR beautiful) (can) (world* OR world)",
+        )
+        self.assertEqual(
+            mangleSearchableText("no can do", self.config),
+            "(no) (can) (do)",
+        )
+
+    def testRemovesWildcardCaseSensitiveUppercase(self):
+        self.assertEqual(
+            mangleSearchableText("beautiful Can world", self.config),
+            "(beautiful* OR beautiful) (can* OR Can) (world* OR world)",
+        )
+        self.assertEqual(
+            mangleSearchableText("No CAN dO", self.config),
+            "(no* OR No) (can* OR CAN) (do* OR dO)",
+        )
+
+    def testRemovesWildcardCaseAgnosticUppercase(self):
+        self.config.stopwords_case_insensitive = True
+        self.assertEqual(
+            mangleSearchableText("beautiful Can world", self.config),
+            "(beautiful* OR beautiful) (Can) (world* OR world)",
+        )
+        self.assertEqual(
+            mangleSearchableText("No CAN dO", self.config),
+            "(No) (CAN) (dO)",
+        )
+
+    def testRemovesWildcardCaseSensitive(self):
+        self.config.stopwords = """
+No
+CAN
+dO
+"""
+        self.assertEqual(
+            mangleSearchableText("beautiful Can world", self.config),
+            "(beautiful* OR beautiful) (can* OR Can) (world* OR world)",
+        )
+        self.assertEqual(
+            mangleSearchableText("nO can Do", self.config),
+            "(no* OR nO) (can* OR can) (do* OR Do)",
+        )
+
+    def testRemovesWildcardCaseAgnostic(self):
+        self.config.stopwords_case_insensitive = True
+        self.config.stopwords = """
+No
+CAN
+dO
+"""
+        self.assertEqual(
+            mangleSearchableText("beautiful Can world", self.config),
+            "(beautiful* OR beautiful) (Can) (world* OR world)",
+        )
+        self.assertEqual(
+            mangleSearchableText("nO can Do", self.config),
+            "(nO) (can) (Do)",
+        )