janeczku · quarz12 · Apr 25, 2023 · Apr 26, 2023 · Apr 26, 2023 · Apr 29, 2023
diff --git a/cps/db.py b/cps/db.py
@@ -49,7 +49,10 @@
 from .pagination import Pagination
 
 from weakref import WeakSet
+from thefuzz.fuzz import partial_token_sort_ratio, ratio
 
+# %-level, 100 means exact match, 75 allows exactly 1 wrong character in a 4 letter word
+FUZZY_SEARCH_ACCURACY = 75
 
 log = logger.create()
 
@@ -381,6 +384,18 @@ def __repr__(self):
                                                                  self.timestamp, self.pubdate, self.series_index,
                                                                  self.last_modified, self.path, self.has_cover)
 
+    def __str__(self):
+        """Return the searchable text of the book as one space separated string.
+
+        The parts are, in this order: title, tag names, series names, author names, publisher names.
+        """
+        title = self.title
+        tag_names = " ".join(tag.name for tag in self.tags)
+        series_names = " ".join(entry.name for entry in self.series)
+        author_names = " ".join(author.name for author in self.authors)
+        publisher_names = " ".join(publisher.name for publisher in self.publishers)
+        return "{0} {1} {2} {3} {4}".format(title, tag_names, series_names, author_names, publisher_names)
+
     @property
     def atom_timestamp(self):
         return self.timestamp.strftime('%Y-%m-%dT%H:%M:%S+00:00') or ''
@@ -883,12 +898,18 @@ def check_exists_book(self, authr, title):
             .filter(and_(Books.authors.any(and_(*q)), func.lower(Books.title).ilike("%" + title + "%"))).first()
 
     def search_query(self, term, config, *join):
+        """Build the fuzzy search query: every word of term with more than 3 letters has to match a tag, series,
+        author, publisher or the title; a hit of the whole term in a text custom column matches as well."""
-        term.strip().lower()
+        term = term.strip().lower()
         self.session.connection().connection.connection.create_function("lower", 1, lcase)
-        q = list()
-        author_terms = re.split("[, ]+", term)
-        for author_term in author_terms:
-            q.append(Books.authors.any(func.lower(Authors.name).ilike("%" + author_term + "%")))
+        self.session.connection().connection.connection.create_function("max_ratio", 2, max_ratio)
+        # split the search term into single words (raw string: "\s" is not a valid escape in a normal string);
+        # words of 3 letters or fewer are dropped, the longest words come first to make queries more efficient
+        words = sorted((w for w in re.split(r"[,\s]+", term) if len(w) > 3), key=len, reverse=True)
+        # no word in search term is longer than 3 letters -> return empty query #TODO give some kind of error message
+        if not words:
+            return self.session.query(Books).filter(False)
+
         query = self.generate_linked_query(config.config_read_column, Books)
         if len(join) == 6:
             query = query.outerjoin(join[0], join[1]).outerjoin(join[2]).outerjoin(join[3], join[4]).outerjoin(join[5])
@@ -899,19 +920,28 @@ def search_query(self, term, config, *join):
         elif len(join) == 1:
             query = query.outerjoin(join[0])
 
+        filter_expression = []
         cc = self.get_cc_columns(config, filter_config_custom_read=True)
-        filter_expression = [Books.tags.any(func.lower(Tags.name).ilike("%" + term + "%")),
-                             Books.series.any(func.lower(Series.name).ilike("%" + term + "%")),
-                             Books.authors.any(and_(*q)),
-                             Books.publishers.any(func.lower(Publishers.name).ilike("%" + term + "%")),
-                             func.lower(Books.title).ilike("%" + term + "%")]
         for c in cc:
             if c.datatype not in ["datetime", "rating", "bool", "int", "float"]:
                 filter_expression.append(
                     getattr(Books,
                             'custom_column_' + str(c.id)).any(
                         func.lower(cc_classes[c.id].value).ilike("%" + term + "%")))
-        return query.filter(self.common_filters(True)).filter(or_(*filter_expression))
+        # every search word has to match fuzzily in tags, series, authors, publishers or the title
+        word_filters = []
+        for word in words:
+            word_filters.append(or_(
+                Books.tags.any(func.max_ratio(func.lower(Tags.name), word) >= FUZZY_SEARCH_ACCURACY),
+                Books.series.any(func.max_ratio(func.lower(Series.name), word) >= FUZZY_SEARCH_ACCURACY),
+                Books.authors.any(func.max_ratio(func.lower(Authors.name), word) >= FUZZY_SEARCH_ACCURACY),
+                Books.publishers.any(func.max_ratio(func.lower(Publishers.name), word) >= FUZZY_SEARCH_ACCURACY),
+                func.max_ratio(func.lower(Books.title), word) >= FUZZY_SEARCH_ACCURACY))
+        # custom column hits stay an alternative way to match, as they were before the fuzzy search; the list
+        # is unpacked because a nested Python list is not a valid clause for and_()/or_()
+        matches = or_(and_(*word_filters), *filter_expression)
+        # filter out multiple languages and archived books
+        return query.filter(self.common_filters(True)).filter(matches)
 
     def get_cc_columns(self, config, filter_config_custom_read=False):
         tmp_cc = self.session.query(CustomColumns).filter(CustomColumns.datatype.notin_(cc_exceptions)).all()
@@ -934,6 +964,7 @@ def get_search_results(self, term, config, offset=None, order=None, limit=None,
         order = order[0] if order else [Books.sort]
         pagination = None
         result = self.search_query(term, config, *join).order_by(*order).all()
+        result = sorted(result,key=lambda query:partial_token_sort_ratio(str(query[0]),term),reverse=True)
         result_count = len(result)
         if offset != None and limit != None:
             offset = int(offset)
@@ -1052,6 +1083,11 @@ def lcase(s):
         return s.lower()
 
 
+def max_ratio(string: str, term):
+    """Return the best ratio() between term and any word of string longer than 3 characters, 0 if there is none."""
+    words = (word.strip(":") for word in (string or "").split())  # a NULL column value arrives as None
+    return max((ratio(word, term) for word in words if len(word) > 3), default=0)  # short words are too generic
+
 class Category:
     name = None
     id = None

diff --git a/cps/templates/search.html b/cps/templates/search.html
@@ -5,6 +5,7 @@
     {% if entries|length < 1 %}
       <h2>{{_('No Results Found')}}</h2>
       <p>{{_('Search Term:')}} {{adv_searchterm}}</p>
+      <p>{{_('Words shorter than 4 letters are not considered')}}</p>
     {% else %}
       <h2>{{result_count}} {{_('Results for:')}} {{adv_searchterm}}</h2>
       {% if current_user.is_authenticated %}

diff --git a/requirements.txt b/requirements.txt
@@ -18,3 +18,6 @@ flask-wtf>=0.14.2,<1.2.0
 chardet>=3.0.0,<4.1.0
 advocate>=1.0.0,<1.1.0
 Flask-Limiter>=2.3.0,<3.4.0
+
+thefuzz~=0.19.0
+Levenshtein~=0.21.0