Skip to content

Commit

Permalink
WIP Prevent wildcard expressions for stopwords in simple expressions
Browse files Browse the repository at this point in the history
- add stopwords to registry
- add stopwords_case_insensitive option
- support function for getting the stopwords
- cache to optimize the parsing of stopwords.txt

This transforms the (term* OR term) expression for stopwords, removing
the wildcard expression. Such an expression would never match any
documents, because solr won't remove the wildcard term, but the
stopword will be missing from the index. This workaround does that with
no side effects, as stopwords would be ignored by solr anyway.

Both case sensitive and case insensitive stopword processing is
supported, this depends on the solr schema, and must be set accordingly.
  • Loading branch information
reebalazs committed Dec 28, 2023
1 parent 7d46ec7 commit 097a3bf
Show file tree
Hide file tree
Showing 7 changed files with 362 additions and 20 deletions.
36 changes: 35 additions & 1 deletion src/collective/solr/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@


class ISolrSchema(Interface):

active = Bool(
title=_("label_active", default="Active"),
description=_(
Expand Down Expand Up @@ -389,6 +388,41 @@ class ISolrSchema(Interface):
required=False,
)

stopwords = Text(
title=_("label_stopwords", default="Stopwords in the format of stopwords.txt"),
description=_(
"help_stopwords",
default="Copy the stopwords.txt file here. "
"Check Solr configuration to understand the format. - "
"Stopwords will not get (word OR word*) simple "
"expression, only (word). "
"Notes: "
"1. This will only work for multi word queries "
"when force_simple_expression=True. - "
"2. It's still necessary to filter stopwords from "
"Solr, this option only causes the "
"faulty (stopword*) parts removed from "
"the expression ",
),
default="",
required=False,
)

stopwords_case_insensitive = Bool(
title=_(
"label_stopwords_case_insensitive", default="Stopwords are case insensitive"
),
description=_(
"help_stopwords_are_case_insensitive",
default="Stopwords are case insensitive "
"This depends on your Solr setup. If your stopwords are processed in a case insensitive way, "
"this should be checked and it will apply the stopword wildcard removal in a case "
"insensitive way.",
),
default=False,
required=False,
)


class ISolrConnectionConfig(ISolrSchema):
"""utility to hold the connection configuration for the solr server"""
Expand Down
11 changes: 9 additions & 2 deletions src/collective/solr/mangler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
removeSpecialCharactersAndOperators,
splitSimpleSearch,
)
from collective.solr.stopword import isStopWord


ranges = {
"min": "[%s TO *]",
Expand Down Expand Up @@ -82,7 +84,7 @@ def makeSimpleExpressions(term, levenstein_distance):

def mangleSearchableText(value, config, force_complex_search=False):
config = config or getConfig()
pattern = getattr(config, "search_pattern", u"")
pattern = getattr(config, "search_pattern", "")
force_simple_search = getattr(config, "force_simple_search", False)
allow_complex_search = getattr(config, "allow_complex_search", False)
levenstein_distance = getattr(config, "levenshtein_distance", 0)
Expand All @@ -108,6 +110,10 @@ def mangleSearchableText(value, config, force_complex_search=False):

for term in splitSimpleSearch(value):
(term_value, term_base_value) = makeSimpleExpressions(term, levenstein_distance)
# If this is a stopword, we never allow a (term* OR term) pattern, just use (term).
# Otherwise stopwords won't ever be found, because a (stopword*) search will never succeed.
if isStopWord(term, config):
term_value = term_base_value
value_parts.append(term_value)
base_value_parts.append(term_base_value)

Expand All @@ -120,6 +126,7 @@ def mangleSearchableText(value, config, force_complex_search=False):
return set([value]) # add literal query parameter
if pattern:
pattern = pattern.encode("utf-8")

return value


Expand Down Expand Up @@ -325,4 +332,4 @@ def optimizeQueryParameters(query, params):
elif fq:
params["fq"] = fq
if not query:
query["*"] = u"*:*" # catch all if no regular query is left...
query["*"] = "*:*" # catch all if no regular query is left...
29 changes: 22 additions & 7 deletions src/collective/solr/profiles/default/registry.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
<?xml version="1.0"?>
<?xml version="1.0" encoding="utf-8"?>
<registry>
<records interface="collective.solr.interfaces.ISolrSchema" prefix="collective.solr">
<records interface="collective.solr.interfaces.ISolrSchema"
prefix="collective.solr"
>
<value key="active">False</value>
<value key="host">127.0.0.1</value>
<value key="port">8983</value>
Expand All @@ -10,7 +12,8 @@
<value key="max_results">10000000</value>
<value key="exclude_user">False
</value>
<value key="search_pattern">+(Title:{value}^5 OR Description:{value}^2 OR SearchableText:{value} OR SearchableText:({base_value}) OR searchwords:({base_value})^1000) -showinsearch:False
<value key="search_pattern">+(Title:{value}^5 OR Description:{value}^2 OR SearchableText:{value}
OR SearchableText:({base_value}) OR searchwords:({base_value})^1000) -showinsearch:False
</value>
<value key="prefix_wildcard">False</value>
<value key="force_simple_search">False</value>
Expand All @@ -25,10 +28,22 @@
<value key="filter_queries">
<element>portal_type</element>
</value>
<value key="boost_script"></value>
<value key="solr_login"></value>
<value key="solr_password"></value>
<value key="boost_script" />
<value key="solr_login" />
<value key="solr_password" />
<value key="use_tika">False</value>
<value key="tika_default_field">content</value>
<!-- Stopwords will not get (word OR word*) simple expression, only (word).
Notes:
1. This will only work for multi word queries when force_simple_search=True
2. It's still necessary to filter stopwords from Solr, this option only causes the
faulty (stopword*) parts removed from the expression.
This removal is by default case sensitive, but it can be made case insensitive. Use
this if you handle your stopwords in a case insensitive way. This depends on your
solr configuration.
-->
<value key="stopwords_case_insensitive">False</value>
<value key="stopwords" />
</records>
</registry>
</registry>
45 changes: 45 additions & 0 deletions src/collective/solr/stopword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import re

from collective.solr.utils import getConfig

# Captures the leading run of Latin letters (ASCII plus the Latin-1 accented
# letters, excluding the multiplication and division signs) of a line.
# Anything after that run -- comments, trailing text -- is ignored, and a
# line that does not start with a letter yields an empty match.
reLine = re.compile(r"^([A-Za-zÀ-ÖØ-öø-ÿ]*)")

# Module-level cache of the last parsed stopword list, keyed by the raw
# stopwords text and the case-insensitivity flag it was parsed with.
raw = None
raw_case_insensitive = None
cooked = None


def parseStopwords(stopwords, stopwords_case_insensitive):
    """Parse stopwords.txt-style text into a list of stopwords.

    One word is taken per line: the leading run of letters matched by
    ``reLine``. Leading whitespace is ignored, so an indented entry is
    still recognised. Lines that yield no word (blank lines, lines
    starting with a non-letter such as ``#``) are skipped.

    :param stopwords: the raw text, one stopword per line.
    :param stopwords_case_insensitive: when true, every word is lowercased.
    :returns: list of stopwords in file order (duplicates are kept).
    """
    words = (
        reLine.match(line.lstrip()).group(1) for line in stopwords.splitlines()
    )
    if stopwords_case_insensitive:
        return [word.lower() for word in words if word]
    return [word for word in words if word]


def getStopWords(config):
    """Return the parsed stopword list for ``config``, using the module cache.

    The text is re-parsed only when nothing is cached yet, or when the
    configured stopwords text or the case-insensitivity flag differs from
    what the cache was built from.

    :param config: object with optional ``stopwords`` and
        ``stopwords_case_insensitive`` attributes; a falsy value falls
        back to ``getConfig()``.
    :returns: list of stopwords (lowercased in case-insensitive mode).
    """
    global raw
    global cooked
    global raw_case_insensitive
    config = config or getConfig()
    stopwords = getattr(config, "stopwords", "") or ""
    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
    if (
        cooked is None
        # Compare by value: an equal string that is a different object must
        # not invalidate the cache. Identical objects still compare fast.
        or raw != stopwords
        or raw_case_insensitive != stopwords_case_insensitive
    ):
        raw = stopwords
        raw_case_insensitive = stopwords_case_insensitive
        cooked = parseStopwords(raw, stopwords_case_insensitive)
    return cooked


def isStopWord(term, config):
    """Tell whether ``term`` is one of the configured stopwords.

    :param term: a single search term.
    :param config: same as for ``getStopWords``; a falsy value falls back
        to ``getConfig()``.
    :returns: True if the term is a stopword. In case-insensitive mode the
        term is lowercased before the lookup.
    """
    # Resolve the config once, so the flag read here and the list returned
    # by getStopWords() always come from the same configuration object.
    config = config or getConfig()
    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
    stopwords = getStopWords(config)
    return (term.lower() if stopwords_case_insensitive else term) in stopwords
15 changes: 8 additions & 7 deletions src/collective/solr/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,17 +84,16 @@ def tearDown(self):


class CollectiveSolrLayer(PloneSandboxLayer):

defaultBases = (PLONE_FIXTURE,)

def __init__(
self,
bases=None,
name="Collective Solr Layer",
module=None,
solr_host=u"localhost",
solr_host="localhost",
solr_port=8983,
solr_base=u"/solr/plone",
solr_base="/solr/plone",
solr_active=False,
):
super(PloneSandboxLayer, self).__init__(bases, name, module)
Expand Down Expand Up @@ -136,7 +135,7 @@ def setUpPloneSite(self, portal):
def tearDownPloneSite(self, portal):
set_registry_record("collective.solr.active", False)
set_registry_record("collective.solr.port", 8983)
set_registry_record("collective.solr.base", u"/solr/plone")
set_registry_record("collective.solr.base", "/solr/plone")
self.solr_layer.tearDown()


Expand Down Expand Up @@ -180,7 +179,7 @@ def activateAndReindex(portal):
class CollectiveSolrMockRegistry(object):
def __init__(self):
self.active = False
self.host = u"localhost"
self.host = "localhost"
self.port = None
self.base = None
self.async_indexing = False
Expand All @@ -198,9 +197,11 @@ def __init__(self):
self.exclude_user = False
self.field_list = []
self.atomic_updates = False
self.boost_script = u""
self.boost_script = ""
self.solr_login = None
self.solr_password = None
self.stopwords = ""
self.stopwords_case_insensitive = False

def __getitem__(self, name):
name_parts = name.split(".")
Expand Down Expand Up @@ -241,7 +242,7 @@ class CollectiveSolrMockRegistryLayer(Layer):

def setUp(self):
provideUtility(
provides=IRegistry, component=CollectiveSolrMockRegistry(), name=u""
provides=IRegistry, component=CollectiveSolrMockRegistry(), name=""
)

def tearDown(self):
Expand Down
114 changes: 111 additions & 3 deletions src/collective/solr/tests/test_mangler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
mangleQuery,
optimizeQueryParameters,
subtractQueryParameters,
mangleSearchableText,
)
from collective.solr.parser import SolrField, SolrSchema
from collective.solr.testing import COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE
from collective.solr.utils import getConfig
from unittest import mock


def mangle(**keywords):
Expand All @@ -31,7 +33,6 @@ def __init__(self, query, range=None, operator=None, depth=None):


class QueryManglerTests(TestCase):

layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

def setUp(self):
Expand Down Expand Up @@ -282,7 +283,6 @@ def testMultiplePathQuery(self):


class QueryParameterTests(TestCase):

layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

def testSortIndex(self):
Expand Down Expand Up @@ -446,7 +446,7 @@ def optimize(**params):
# now unconfigured...
config = getConfig()
self.assertEqual(optimize(), (dict(a="a:23", b="b:42", c="c:(23 42)"), dict()))
config.filter_queries = [u"a"]
config.filter_queries = ["a"]
self.assertEqual(optimize(), (dict(b="b:42", c="c:(23 42)"), dict(fq=["a:23"])))
self.assertEqual(
optimize(fq="x:13"),
Expand Down Expand Up @@ -502,3 +502,111 @@ def testFilterFacetDependencies(self):
self.assertEqual(params, {"facet.foo": "bar"})
params = extract(dict(facet_foo=("foo:bar", "bar:foo")))
self.assertEqual(params, {"facet.foo": ("foo", "bar")})


# The patches replace the ``raw`` and ``cooked`` cache globals of
# collective.solr.stopword with None, so the stopword list is parsed again
# from the configuration assigned in the tests.
@mock.patch("collective.solr.stopword.raw", None)
@mock.patch("collective.solr.stopword.cooked", None)
class QueryManglerStopwordsTests(TestCase):
    """Checks how ``mangleSearchableText`` treats configured stopwords.

    A regular term is expected to become ``(term* OR term)``, while a term
    that is a configured stopword is expected to become just ``(term)``.
    """

    layer = COLLECTIVE_SOLR_MOCK_REGISTRY_FIXTURE

    def setUp(self):
        # Default fixture: three lowercase stopwords, case sensitive matching.
        self.config = getConfig()
        self.config.stopwords_case_insensitive = False
        self.config.stopwords = """
no
can
do
"""
        self.config.force_simple_search = True
        provideUtility(self.config, ISolrConnectionConfig)

    def tearDown(self):
        gsm = getGlobalSiteManager()
        gsm.unregisterUtility(self.config, ISolrConnectionConfig)

    def testBase(self):
        # No stopword in the query: every term keeps its wildcard part.
        self.assertEqual(
            mangleSearchableText("beautiful world", self.config),
            "(beautiful* OR beautiful) (world* OR world)",
        )

    def testNoSimpleSearch(self):
        # Same expectation as the multi word case, with force_simple_search off.
        self.config.force_simple_search = False
        self.assertEqual(
            mangleSearchableText("beautiful can world", self.config),
            "(beautiful* OR beautiful) (can) (world* OR world)",
        )

    def testBaseSingleWord(self):
        self.assertEqual(
            mangleSearchableText("beautiful", self.config),
            "(beautiful* OR beautiful)",
        )

    def testRemovesWildcardTermSingleWord(self):
        self.assertEqual(
            mangleSearchableText("can", self.config),
            "(can)",
        )

    def testRemovesWildcardTermMultiWord(self):
        self.assertEqual(
            mangleSearchableText("beautiful can world", self.config),
            "(beautiful* OR beautiful) (can) (world* OR world)",
        )
        self.assertEqual(
            mangleSearchableText("no can do", self.config),
            "(no) (can) (do)",
        )

    def testRemovesWildcardCaseSensitiveUppercase(self):
        # Case sensitive mode: query terms whose case differs from the
        # lowercase stopwords are not treated as stopwords.
        self.assertEqual(
            mangleSearchableText("beautiful Can world", self.config),
            "(beautiful* OR beautiful) (can* OR Can) (world* OR world)",
        )
        self.assertEqual(
            mangleSearchableText("No CAN dO", self.config),
            "(no* OR No) (can* OR CAN) (do* OR dO)",
        )

    def testRemovesWildcardCaseAgnosticUppercase(self):
        # Case insensitive mode: the same terms now count as stopwords and
        # are emitted with their original case.
        self.config.stopwords_case_insensitive = True
        self.assertEqual(
            mangleSearchableText("beautiful Can world", self.config),
            "(beautiful* OR beautiful) (Can) (world* OR world)",
        )
        self.assertEqual(
            mangleSearchableText("No CAN dO", self.config),
            "(No) (CAN) (dO)",
        )

    def testRemovesWildcardCaseSensitive(self):
        # Mixed case stopwords, case sensitive mode: query terms with a
        # different case do not match them.
        self.config.stopwords = """
No
CAN
dO
"""
        self.assertEqual(
            mangleSearchableText("beautiful Can world", self.config),
            "(beautiful* OR beautiful) (can* OR Can) (world* OR world)",
        )
        self.assertEqual(
            mangleSearchableText("nO can Do", self.config),
            "(no* OR nO) (can* OR can) (do* OR Do)",
        )

    def testRemovesWildcardCaseAgnostic(self):
        # Mixed case stopwords, case insensitive mode: terms match
        # regardless of case.
        self.config.stopwords_case_insensitive = True
        self.config.stopwords = """
No
CAN
dO
"""
        self.assertEqual(
            mangleSearchableText("beautiful Can world", self.config),
            "(beautiful* OR beautiful) (Can) (world* OR world)",
        )
        self.assertEqual(
            mangleSearchableText("nO can Do", self.config),
            "(nO) (can) (Do)",
        )
Loading

0 comments on commit 097a3bf

Please sign in to comment.