Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(HelpdeskSearch): Add report to evaluate search and improvements with the same #1979

Merged
merged 10 commits into from
Sep 12, 2024
27 changes: 22 additions & 5 deletions helpdesk/api/article.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,41 @@
import frappe
from textblob import TextBlob
from textblob.exceptions import MissingCorpusError

from helpdesk.search import search as hd_search


def get_nouns(text: str):
blob = TextBlob(text)
def get_nouns(blob: TextBlob):
    """Return the words of ``blob`` whose part-of-speech tag starts with "N".

    Args:
        blob: The parsed text to extract nouns from.

    Returns:
        A list of the matching words, or an empty list when tagging fails
        because the required corpus/tagger data is unavailable.
    """
    try:
        # ``startswith`` also tolerates an empty tag, unlike ``pos[0]``.
        return [word for word, pos in blob.pos_tags if pos.startswith("N")]
    except (LookupError, MissingCorpusError):
        # Missing NLTK data can surface either as a plain LookupError or as
        # TextBlob's MissingCorpusError (not a LookupError subclass), so both
        # are handled, the same way get_noun_phrases does.
        return []


def get_noun_phrases(blob: TextBlob):
    """Return the noun phrases of ``blob``.

    Falls back to an empty list when extraction raises ``LookupError`` or
    ``MissingCorpusError``.
    """
    try:
        phrases = blob.noun_phrases
    except (LookupError, MissingCorpusError):
        phrases = []
    return phrases


@frappe.whitelist()
def search(query: str):
out = hd_search(query, only_articles=True)
if not out: # fallback
if nouns := get_nouns(query):
query = " ".join(nouns)
out = hd_search(query, only_articles=True)
blob = TextBlob(query)
if noun_phrases := get_noun_phrases(blob):
and_query = " ".join(noun_phrases)
or_query = "|".join(noun_phrases)
out = hd_search(and_query, only_articles=True) or hd_search(
or_query, only_articles=True
)
if not out and (nouns := get_nouns(blob)):
and_query = " ".join(nouns)
or_query = "|".join(nouns)
out = hd_search(and_query, only_articles=True) or hd_search(
or_query, only_articles=True
)
if not out:
return []
return out[0].get("items", [])
6 changes: 6 additions & 0 deletions helpdesk/helpdesk/doctype/hd_settings/hd_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@ def on_update(self):
room = get_website_room()

frappe.publish_realtime(event, room=room, after_commit=True)

@property
def hd_search(self):
    """Expose the article search function from ``helpdesk.api.article``."""
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import -- TODO confirm).
    from helpdesk.api.article import search as article_search

    return article_search
Empty file.
8 changes: 8 additions & 0 deletions helpdesk/helpdesk/doctype/hd_stopword/hd_stopword.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Copyright (c) 2024, Frappe Technologies and contributors
// For license information, please see license.txt

// frappe.ui.form.on("HD Stopword", {
// refresh(frm) {

// },
// });
56 changes: 56 additions & 0 deletions helpdesk/helpdesk/doctype/hd_stopword/hd_stopword.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"actions": [],
"allow_rename": 1,
"autoname": "field:word",
"creation": "2024-09-12 16:08:27.022111",
"doctype": "DocType",
"engine": "InnoDB",
"field_order": [
"word",
"column_break_jyol",
"enabled"
],
"fields": [
{
"fieldname": "word",
"fieldtype": "Data",
"label": "Word",
"unique": 1
},
{
"default": "1",
"fieldname": "enabled",
"fieldtype": "Check",
"label": "Enabled"
},
{
"fieldname": "column_break_jyol",
"fieldtype": "Column Break"
}
],
"index_web_pages_for_search": 1,
"links": [],
"modified": "2024-09-12 16:10:05.577545",
"modified_by": "Administrator",
"module": "Helpdesk",
"name": "HD Stopword",
"naming_rule": "By fieldname",
"owner": "Administrator",
"permissions": [
{
"create": 1,
"delete": 1,
"email": 1,
"export": 1,
"print": 1,
"read": 1,
"report": 1,
"role": "System Manager",
"share": 1,
"write": 1
}
],
"sort_field": "creation",
"sort_order": "DESC",
"states": []
}
9 changes: 9 additions & 0 deletions helpdesk/helpdesk/doctype/hd_stopword/hd_stopword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2024, Frappe Technologies and contributors
# For license information, please see license.txt

# import frappe
from frappe.model.document import Document


class HDStopword(Document):
    """Document class for HD Stopword; adds no behaviour beyond ``Document``."""
9 changes: 9 additions & 0 deletions helpdesk/helpdesk/doctype/hd_stopword/test_hd_stopword.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2024, Frappe Technologies and Contributors
# See license.txt

# import frappe
from frappe.tests.utils import FrappeTestCase


class TestHDStopword(FrappeTestCase):
    """Placeholder test case for HD Stopword; no tests are defined yet."""
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Copyright (c) 2024, Frappe Technologies and contributors
// For license information, please see license.txt

// Client-side definition of the "Ticket-Search Analysis" query report.
frappe.query_reports["Ticket-Search Analysis"] = {
// No filters are defined; the commented-out entry below is a template
// showing the shape of a filter definition.
filters: [
// {
// "fieldname": "my_filter",
// "label": __("My Filter"),
// "fieldtype": "Data",
// "reqd": 1,
// },
],
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"add_total_row": 0,
"columns": [],
"creation": "2024-09-11 22:31:05.837918",
"disabled": 0,
"docstatus": 0,
"doctype": "Report",
"filters": [],
"idx": 0,
"is_standard": "Yes",
"letterhead": null,
"modified": "2024-09-11 22:31:05.837918",
"modified_by": "Administrator",
"module": "Helpdesk",
"name": "Ticket-Search Analysis",
"owner": "Administrator",
"prepared_report": 0,
"ref_doctype": "HD Ticket",
"report_name": "Ticket-Search Analysis",
"report_type": "Script Report",
"roles": [
{
"role": "System Manager"
},
{
"role": "Agent"
},
{
"role": "Support Team"
},
{
"role": "Supported Site User"
},
{
"role": "External Agent"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) 2024, Frappe Technologies and contributors
# For license information, please see license.txt

import frappe
from frappe import _

from helpdesk.api.article import search


def execute(filters: dict | None = None):
    """Build the Ticket-Search Analysis report.

    This is the report's entry point; the framework calls it whenever the
    report is refreshed or a filter changes. ``filters`` is accepted for that
    calling convention but is not used.

    Returns:
        A ``(columns, data)`` tuple.
    """
    return get_columns(), get_data()


def get_columns() -> list[dict]:
    """Return the column definitions for the report.

    Each column is a dict with ``label``, ``fieldname`` and ``fieldtype``
    keys, just like a DocType field definition.
    """
    # (label, fieldname, fieldtype) per column, in display order.
    column_specs = (
        (_("Subject"), "subject", "Data"),
        (_("Top Result"), "top_res", "Text"),
        (_("Search Score"), "score", "Float"),
    )
    return [
        {"label": label, "fieldname": fieldname, "fieldtype": fieldtype}
        for label, fieldname, fieldtype in column_specs
    ]


def get_top_res(search_term: str) -> tuple[str, float]:
    """Summarise the article search results for ``search_term``.

    Args:
        search_term: The text to search for.

    Returns:
        A ``(headings, score)`` tuple. ``headings`` holds one line per result
        (the result's headings, or its subject when the headings are empty),
        each terminated by a newline. ``score`` is the sum of the scores of
        all results. With no results this is ``("", 0.0)``.
    """
    results = search(search_term)
    lines = []
    total_score = 0.0
    for item in results:
        # Use ``.get`` so a result lacking one of the fields does not abort
        # the whole report with a KeyError.
        title = item.get("headings") or item.get("subject") or ""
        lines.append(f"{title}\n")
        total_score += item.get("score") or 0
    return "".join(lines), total_score


def get_data() -> list[list]:
    """Return the rows of the report.

    Fetches up to 100 HD Ticket records whose ``agent_group`` matches
    ``%FC%`` and runs each ticket's subject through ``get_top_res``.

    Returns:
        One ``[subject, top_res, score]`` list per ticket.
    """
    tickets = frappe.get_all(
        "HD Ticket",
        {"agent_group": ["like", "%FC%"]},
        ["name", "subject"],
        limit=100,
    )
    rows = []
    for ticket in tickets:
        subject = ticket["subject"]
        top_res, score = get_top_res(subject)
        rows.append([subject, top_res, score])
    return rows
29 changes: 22 additions & 7 deletions helpdesk/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import frappe
from bs4 import BeautifulSoup, PageElement
from frappe.utils import cstr, strip_html_tags, update_progress_bar
from frappe.utils.caching import redis_cache
from frappe.utils.synchronization import filelock
from redis.commands.search.field import TagField, TextField
from redis.commands.search.indexDefinition import IndexDefinition
Expand Down Expand Up @@ -62,11 +63,20 @@
"you",
"me",
"do",
"has",
"been",
"urgent",
"want",
]


@redis_cache(3600 * 24)
def get_stopwords():
    """Return the built-in STOPWORDS plus the names of enabled HD Stopword records.

    The result is cached through ``redis_cache(3600 * 24)``
    (presumably a 24-hour TTL -- confirm against the decorator's signature).
    """
    custom_words = frappe.get_all("HD Stopword", {"enabled": True}, pluck="name")
    return STOPWORDS + custom_words


class Search:
unsafe_chars = re.compile(r"[\[\]{}<>+]")
unsafe_chars = re.compile(r"[\[\]{}<>+!-]")

def __init__(self, index_name, prefix, schema) -> None:
self.redis = frappe.cache()
Expand Down Expand Up @@ -95,7 +105,7 @@ def create_index(self):
self.redis.ft(self.index_name).create_index(
schema,
definition=index_def,
stopwords=STOPWORDS,
stopwords=get_stopwords(),
)
self._index_exists = True

Expand Down Expand Up @@ -128,6 +138,7 @@ def search(

query.summarize(fields=["description"])
query.scorer("DISMAX")
query.with_scores()

try:
result = self.redis.ft(self.index_name).search(query)
Expand All @@ -147,8 +158,7 @@ def search(
def clean_query(self, query):
query = query.strip().replace("-*", "*")
query = self.unsafe_chars.sub(" ", query)
query.strip()
return query
return query.strip().lower()

def spellcheck(self, query, **kwargs):
return self.redis.ft(self.index_name).spellcheck(query, **kwargs)
Expand Down Expand Up @@ -292,7 +302,10 @@ def get_count(self, doctype):

def get_records(self, doctype):
records = []
for d in frappe.db.get_all(doctype, fields=self.DOCTYPE_FIELDS[doctype]):
filters = {"published": True} if doctype == "HD Article" else {}
for d in frappe.db.get_all(
doctype, filters=filters, fields=self.DOCTYPE_FIELDS[doctype]
):
d.doctype = doctype
if doctype == "HD Article":
for heading, section in self.get_sections(d.content):
Expand All @@ -311,9 +324,9 @@ def get_records(self, doctype):
def search(query, only_articles=False):
search = HelpdeskSearch()
query = search.clean_query(query)
query_parts = query.split(" ")
query_parts = query.split()
query = " ".join(
[f"%{q}%" for q in query_parts if q not in STOPWORDS]
[f"{q}*" for q in query_parts if q not in get_stopwords()]
) # for stopwords to be ignored
result = search.search(query, start=0, highlight=True)
groups = {}
Expand Down Expand Up @@ -360,6 +373,8 @@ def download_corpus():
try:
data.find("taggers/averaged_perceptron_tagger_eng.zip")
data.find("tokenizers/punkt_tab.zip")
data.find("corpora/brown.zip")
except LookupError:
download("averaged_perceptron_tagger_eng")
download("punkt_tab")
download("brown")
Loading