Skip to content

Commit

Permalink
optimize wiki name search in search api
Browse files Browse the repository at this point in the history
  • Loading branch information
cir9no committed Aug 27, 2024
1 parent 87a857a commit 254f4eb
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 68 deletions.
12 changes: 6 additions & 6 deletions repo_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,17 +90,17 @@ def _get_repo_name_mtime_size(self, repo_id):
finally:
session.close()

def _get_wiki_id_commit_id_name(self, start, count):
def _get_wiki_id_commit_id(self, start, count):
session = self.db_session()
try:
cmd = """SELECT RepoInfo.repo_id, Branch.commit_id, RepoInfo.name
cmd = """SELECT RepoInfo.repo_id, Branch.commit_id
FROM RepoInfo
INNER JOIN Branch ON RepoInfo.repo_id = Branch.repo_id
WHERE RepoInfo.type = 'wiki'
limit :start, :count"""

res = [
(r[0], r[1], r[2])
(r[0], r[1])
for r in session.execute(text(cmd), {'start': start, 'count': count})
]

Expand Down Expand Up @@ -145,11 +145,11 @@ def get_repo_head_commit(self, repo_id):
logger.error(e)
return self._get_repo_head_commit(repo_id)

def get_wiki_id_commit_id_name(self, start, count):
def get_wiki_id_commit_id(self, start, count):
try:
return self._get_wiki_id_commit_id_name(start, count)
return self._get_wiki_id_commit_id(start, count)
except Exception as e:
logger.error(e)
return self._get_wiki_id_commit_id_name(start, count)
return self._get_wiki_id_commit_id(start, count)

repo_data = RepoData()
1 change: 0 additions & 1 deletion seafevent_server/request_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ def search_wikis():

if not query:
return {'error_msg': 'query invalid.'}, 400

if not wikis:
return {'error_msg': 'wikis invalid.'}, 400

Expand Down
6 changes: 3 additions & 3 deletions seasearch/index_store/index_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def keyword_search(self, query, repos, repo_filename_index, count, suffixes=None
def wiki_search(self, query, wikis, wiki_index, count):
# Thin delegate: forwards the search to wiki_index.search_wikis, passing 0 as
# the third argument and `count` as the fourth, and returns its result unchanged.
# NOTE(review): if wiki_index is the object whose search_wikis(wikis, keyword,
# start=0, size=10) appears later in this commit, 0 is the start offset and
# `count` is the size — confirm against the caller.
return wiki_index.search_wikis(wikis, query, 0, count)

def update_wiki_index(self, wiki_id, commit_id, wiki_name, wiki_index, wiki_status_index):
def update_wiki_index(self, wiki_id, commit_id, wiki_index, wiki_status_index):
try:
new_commit_id = commit_id
index_name = WIKI_INDEX_PREFIX + wiki_id
Expand All @@ -82,11 +82,11 @@ def update_wiki_index(self, wiki_id, commit_id, wiki_name, wiki_index, wiki_stat

if wiki_status.need_recovery():
logger.warning('%s: wiki index inrecovery', wiki_id)
wiki_index.update(index_name, wiki_id, commit_id, wiki_name, to_commit)
wiki_index.update(index_name, wiki_id, commit_id, to_commit)
commit_id = to_commit
time.sleep(1)
wiki_status_index.begin_update_repo(wiki_id, commit_id, new_commit_id)
wiki_index.update(index_name, wiki_id, commit_id, wiki_name, new_commit_id)
wiki_index.update(index_name, wiki_id, commit_id, new_commit_id)
wiki_status_index.finish_update_repo(wiki_id, new_commit_id)

logger.info('wiki: %s, update wiki index success', wiki_id)
Expand Down
89 changes: 41 additions & 48 deletions seasearch/index_store/wiki_index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import posixpath
import json
import re

from seafevents.seasearch.utils import get_library_diff_files, is_wiki, extract_sdoc_text
from seafevents.seasearch.utils.constants import ZERO_OBJ_ID, WIKI_INDEX_PREFIX
Expand Down Expand Up @@ -167,17 +166,9 @@ def extract_ids_from_navigation(navigation_items, navigation_ids):
non_navigation_doc_uuids = [page['docUuid'] for page in config['pages'] if page['id'] not in navigation_ids]
return non_navigation_doc_uuids

def add_files(self, index_name, wiki_id, wiki_name, files, doc_uuids):
def add_files(self, index_name, wiki_id, files, doc_uuids):
bulk_add_params = []
index_info = {'index': {'_index': index_name}}
# Add wiki name in bulk_add_params
wiki_info = {
'content': wiki_name,
'type': 'name',
'wiki_id': wiki_id,
}
bulk_add_params.append(index_info)
bulk_add_params.append(wiki_info)

for path, obj_id, mtime, size in files:
if not is_wiki(path):
Expand Down Expand Up @@ -236,7 +227,7 @@ def normal_search(self, index_name, dsl):

return doc_item['hits']['hits'], total

def update(self, index_name, wiki_id, old_commit_id, wiki_name, new_commit_id):
def update(self, index_name, wiki_id, old_commit_id, new_commit_id):
added_files, deleted_files, modified_files, _ , _ = \
get_library_diff_files(wiki_id, old_commit_id, new_commit_id)

Expand All @@ -251,12 +242,13 @@ def update(self, index_name, wiki_id, old_commit_id, wiki_name, new_commit_id):
need_added_files = added_files + modified_files
# deleting files is to prevent duplicate insertions when the last execution failed
self.delete_files(index_name, added_files, deleted_doc_uuids)
self.add_files(index_name, wiki_id, wiki_name, need_added_files, doc_uuids)
self.add_files(index_name, wiki_id, need_added_files, doc_uuids)

def search_wikis(self, wikis, keyword, start=0, size=10):
bulk_search_params = []
# Format: [(doc_uuid, name, wiki_id)]
uuid_name_list = []

title_info = []
name_info = []
for wiki_id in wikis:
query_map = {'bool': {'should': [], 'minimum_should_match': 1}}
searches = self._make_query_searches(keyword)
Expand All @@ -282,14 +274,17 @@ def search_wikis(self, wikis, keyword, start=0, size=10):
# Get wiki title
conf = self.get_wiki_conf(wiki_id)
doc_uuids = self.extract_doc_uuids(conf)

for page in conf['pages']:
page_uuid = page['path'].split('/')[2]
if page_uuid in doc_uuids:
uuid_name_list.append((page_uuid, page["name"], wiki_id))
title_info.append((page_uuid, page["name"], wiki_id))

# Get wiki name
wiki = seafile_api.get_repo(wiki_id)
name_info.append((wiki.repo_name, wiki_id))

results = self.seasearch_api.m_search(bulk_search_params)
res_wikis = []
content_match = []
for result in results.get('responses'):
hits = result.get('hits', {}).get('hits', [])

Expand All @@ -301,44 +296,42 @@ def search_wikis(self, wikis, keyword, start=0, size=10):
score = hit.get('_score')
_id = hit.get('_id')
hit_type = source['type']
if hit_type == 'name':
title = re.sub(
r"</?mark>", "", hit.get('highlight').get('content', [None])[0]
)
r = {
'wiki_id': source['wiki_id'],
'hit_type': f'wiki_{hit_type}',
'content': title,
'_id': _id,
'score': score,
}
else:
r = {
'doc_uuid': source['doc_uuid'],
'wiki_id': source['wiki_id'],
'score': score,
'_id': _id,
'hit_type': f'wiki_{hit_type}'
}
if highlight_content := hit.get('highlight').get('content', [None])[0]:
r.update(content=highlight_content)
res_wikis.append(r)
res_wikis = sorted(res_wikis, key=lambda row: row['score'], reverse=True)[:size]

# Determine whether the inquiry is a title
query_match = []
for doc_uuid, title, wiki_id in uuid_name_list:

r = {
'doc_uuid': source['doc_uuid'],
'wiki_id': source['wiki_id'],
'score': score,
'_id': _id,
'hit_type': f'wiki_{hit_type}'
}
if highlight_content := hit.get('highlight').get('content', [None])[0]:
r.update(content=highlight_content)
content_match.append(r)
content_match = sorted(content_match, key=lambda row: row['score'], reverse=True)[:size]

# Search in wiki title
title_match = []
for doc_uuid, title, wiki_id in title_info:
if keyword in title:
r_t = {
'doc_uuid': doc_uuid,
'wiki_id': wiki_id,
'title': title,
'hit_type': 'wiki_title'
}
query_match.append(r_t)

query_match.extend(res_wikis)
return query_match
title_match.append(r_t)

# Search in wiki name
name_match = []
for name, wiki_id in name_info:
if keyword in name:
r_n = {
'content': name,
'hit_type': 'wiki_name',
'wiki_id': wiki_id,
}
name_match.append(r_n)
return name_match + title_match + content_match

def delete_index_by_index_name(self, index_name):
self.seasearch_api.delete_index_by_name(index_name)
6 changes: 3 additions & 3 deletions seasearch/index_task/wiki_index_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,18 @@ def update_wiki_indexes(wiki_status_index, wiki_index, index_manager, repo_data)
all_wikis = []
while True:
try:
wiki_commits = repo_data.get_wiki_id_commit_id_name(start, count)
wiki_commits = repo_data.get_wiki_id_commit_id(start, count)
except Exception as e:
logger.error("Error: %s" % e)
return
start += 1000
if len(wiki_commits) == 0:
break

for wiki_id, commit_id, wiki_name in wiki_commits:
for wiki_id, commit_id in wiki_commits:
all_wikis.append(wiki_id)

index_manager.update_wiki_index(wiki_id, commit_id, wiki_name, wiki_index, wiki_status_index)
index_manager.update_wiki_index(wiki_id, commit_id, wiki_index, wiki_status_index)

logger.info("Finish update wiki index")

Expand Down
14 changes: 7 additions & 7 deletions seasearch/script/wiki_index_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def run(self):
while True:
global NO_TASKS
try:
wiki_commits = self.repo_data.get_wiki_id_commit_id_name(start, per_size)
wiki_commits = self.repo_data.get_wiki_id_commit_id(start, per_size)
except Exception as e:
logger.error("Error: %s" % e)
NO_TASKS = True
Expand All @@ -67,8 +67,8 @@ def run(self):
if len(wiki_commits) == 0:
NO_TASKS = True
break
for wiki_id, commit_id, wiki_name in wiki_commits:
wikis_queue.put((wiki_id, commit_id, wiki_name))
for wiki_id, commit_id in wiki_commits:
wikis_queue.put((wiki_id, commit_id))
wikis[wiki_id] = commit_id
start += per_size

Expand Down Expand Up @@ -96,9 +96,9 @@ def thread_task(self, wikis_queue):
else:
wiki_id = queue_data[0]
commit_id = queue_data[1]
wiki_name = queue_data[2]

try:
self.index_manager.update_wiki_index(wiki_id, commit_id, wiki_name, self.wiki_index, self.wiki_status_index)
self.index_manager.update_wiki_index(wiki_id, commit_id, self.wiki_index, self.wiki_status_index)
except Exception as e:
logger.exception('Wiki index error: %s, wiki_id: %s' % (e, wiki_id), exc_info=True)
self.incr_error()
Expand Down Expand Up @@ -184,7 +184,7 @@ def delete_indices():
start, count = 0, 1000
while True:
try:
wiki_commits = repo_data.get_wiki_id_commit_id_name(start, count)
wiki_commits = repo_data.get_wiki_id_commit_id(start, count)
except Exception as e:
logger.error("Error: %s" % e)
return
Expand All @@ -193,7 +193,7 @@ def delete_indices():
if len(wiki_commits) == 0:
break

for wiki_id, commit_id, wiki_name in wiki_commits:
for wiki_id, commit_id, in wiki_commits:
wiki_index_name = WIKI_INDEX_PREFIX + wiki_id
wiki_index.delete_index_by_index_name(wiki_index_name)

Expand Down

0 comments on commit 254f4eb

Please sign in to comment.