feat: index library collections in studio meilisearch index (#35324)
navinkarkera authored and rediris committed Aug 27, 2024
1 parent 09c78fd commit 2c66cf0
Showing 9 changed files with 189 additions and 31 deletions.
95 changes: 71 additions & 24 deletions openedx/core/djangoapps/content/search/api.py
@@ -19,6 +19,7 @@
 from meilisearch.models.task import TaskInfo
 from opaque_keys.edx.keys import UsageKey
 from opaque_keys.edx.locator import LibraryLocatorV2
+from openedx_learning.api import authoring as authoring_api
 from common.djangoapps.student.roles import GlobalStaff
 from rest_framework.request import Request
 from common.djangoapps.student.role_helpers import get_course_roles
@@ -31,8 +32,9 @@
     Fields,
     meili_id_from_opaque_key,
     searchable_doc_for_course_block,
+    searchable_doc_for_collection,
     searchable_doc_for_library_block,
-    searchable_doc_tags
+    searchable_doc_tags,
 )
 
 log = logging.getLogger(__name__)
@@ -294,12 +296,16 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
     status_cb("Counting courses...")
     num_courses = CourseOverview.objects.count()
 
+    # Get the list of collections
+    status_cb("Counting collections...")
+    num_collections = authoring_api.get_collections().count()
+
     # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries
+    num_contexts = num_courses + num_libraries + num_collections
     num_contexts_done = 0  # How many courses/libraries we've indexed
     num_blocks_done = 0  # How many individual components/XBlocks we've indexed
 
-    status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
+    status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
     with _using_temp_index(status_cb) as temp_index_name:
         ############## Configure the index ##############
 
@@ -332,6 +338,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
         Fields.block_id,
         Fields.content,
         Fields.tags,
+        Fields.description,
         # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
         # are searchable only if at least one document in the index has a value. If we didn't list them here and,
         # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
@@ -363,8 +370,8 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
 
         ############## Libraries ##############
         status_cb("Indexing libraries...")
-        for lib_key in lib_keys:
-            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+
+        def index_library(lib_key: str) -> list:
             docs = []
             for component in lib_api.get_library_components(lib_key):
                 try:
@@ -375,48 +382,88 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
                     docs.append(doc)
                 except Exception as err:  # pylint: disable=broad-except
                     status_cb(f"Error indexing library component {component}: {err}")
-                finally:
-                    num_blocks_done += 1
             if docs:
-                # Add all the docs in this library at once (usually faster than adding one at a time):
-                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                try:
+                    # Add all the docs in this library at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing library {lib_key}: {err}")
+            return docs
+
+        for lib_key in lib_keys:
+            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+            lib_docs = index_library(lib_key)
+            num_blocks_done += len(lib_docs)
             num_contexts_done += 1
 
         ############## Courses ##############
         status_cb("Indexing courses...")
         # To reduce memory usage on large instances, split up the CourseOverviews into pages of 1,000 courses:
+
+        def index_course(course: CourseOverview) -> list:
+            docs = []
+            # Pre-fetch the course with all of its children:
+            course = store.get_course(course.id, depth=None)
+
+            def add_with_children(block):
+                """ Recursively index the given XBlock/component """
+                doc = searchable_doc_for_course_block(block)
+                doc.update(searchable_doc_tags(block.usage_key))
+                docs.append(doc)  # pylint: disable=cell-var-from-loop
+                _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
+
+            # Index course children
+            _recurse_children(course, add_with_children)
+
+            if docs:
+                # Add all the docs in this course at once (usually faster than adding one at a time):
+                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+            return docs
+
         paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
         for p in paginator.page_range:
             for course in paginator.page(p).object_list:
                 status_cb(
                     f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
                 )
-                docs = []
-
-                # Pre-fetch the course with all of its children:
-                course = store.get_course(course.id, depth=None)
-
-                def add_with_children(block):
-                    """ Recursively index the given XBlock/component """
-                    doc = searchable_doc_for_course_block(block)
-                    doc.update(searchable_doc_tags(block.usage_key))
-                    docs.append(doc)  # pylint: disable=cell-var-from-loop
-                    _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
-
-                # Index course children
-                _recurse_children(course, add_with_children)
-
-                if docs:
-                    # Add all the docs in this course at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-                num_contexts_done += 1
-                num_blocks_done += len(docs)
+                course_docs = index_course(course)
+                num_contexts_done += 1
+                num_blocks_done += len(course_docs)
+
+        ############## Collections ##############
+        status_cb("Indexing collections...")
+
+        def index_collection_batch(batch, num_contexts_done) -> int:
+            docs = []
+            for collection in batch:
+                status_cb(
+                    f"{num_contexts_done + 1}/{num_contexts}. "
+                    f"Now indexing collection {collection.title} ({collection.id})"
+                )
+                try:
+                    doc = searchable_doc_for_collection(collection)
+                    # Uncomment below line once collections are tagged.
+                    # doc.update(searchable_doc_tags(collection.id))
+                    docs.append(doc)
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+                finally:
+                    num_contexts_done += 1
+
+            if docs:
+                try:
+                    # Add docs in batch of 100 at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing collection batch {p}: {err}")
+            return num_contexts_done
+
+        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
+        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
+        for p in paginator.page_range:
+            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done)
 
-    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.")
+    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")
 
 
 def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
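The refactor above extracts per-context indexing into helpers (`index_library`, `index_course`, `index_collection_batch`) that each build a list of documents and push it to Meilisearch in a single `add_documents` call per batch. A standalone sketch of that pattern, assuming a local Meilisearch instance and a recent `meilisearch` Python client; `build_doc` and the index name are hypothetical stand-ins, not code from this commit:

```python
# Sketch only: connection details, index name, and build_doc() are assumptions;
# Paginator and the meilisearch client calls are real APIs.
import meilisearch
from django.core.paginator import Paginator


def build_doc(obj) -> dict:
    # Hypothetical stand-in for the searchable_doc_for_* helpers.
    return {"id": obj.id, "display_name": str(obj)}


def index_in_batches(client: meilisearch.Client, index_name: str, queryset, batch_size: int = 100):
    """Build docs one page at a time and push each page as a single batch."""
    paginator = Paginator(queryset, batch_size)  # keeps memory use bounded on large instances
    for page_number in paginator.page_range:
        docs = [build_doc(obj) for obj in paginator.page(page_number).object_list]
        if docs:
            # One add_documents call per page is usually faster than one per document.
            task_info = client.index(index_name).add_documents(docs)
            client.wait_for_task(task_info.task_uid)  # block until Meilisearch finishes


# Usage (URL, key, and index name assumed):
# client = meilisearch.Client("http://localhost:7700", "masterKey")
# index_in_batches(client, "studio_content_temp", MyModel.objects.all())
```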
38 changes: 38 additions & 0 deletions openedx/core/djangoapps/content/search/documents.py
@@ -13,6 +13,7 @@
 from openedx.core.djangoapps.content_libraries import api as lib_api
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangoapps.xblock import api as xblock_api
+from openedx_learning.api.authoring_models import LearningPackage
 
 log = logging.getLogger(__name__)
 
@@ -27,6 +28,7 @@ class Fields:
     type = "type"  # DocType.course_block or DocType.library_block (see below)
     block_id = "block_id"  # The block_id part of the usage key. Sometimes human-readable, sometimes a random hex ID
     display_name = "display_name"
+    description = "description"
     modified = "modified"
     created = "created"
     last_published = "last_published"
@@ -66,6 +68,7 @@ class DocType:
     """
     course_block = "course_block"
     library_block = "library_block"
+    collection = "collection"
 
 
 def meili_id_from_opaque_key(usage_key: UsageKey) -> str:
@@ -276,3 +279,38 @@ def searchable_doc_for_course_block(block) -> dict:
     doc.update(_fields_from_block(block))
 
     return doc
+
+
+def searchable_doc_for_collection(collection) -> dict:
+    """
+    Generate a dictionary document suitable for ingestion into a search engine
+    like Meilisearch or Elasticsearch, so that the given collection can be
+    found using faceted search.
+    """
+    doc = {
+        Fields.id: collection.id,
+        Fields.type: DocType.collection,
+        Fields.display_name: collection.title,
+        Fields.description: collection.description,
+        Fields.created: collection.created.timestamp(),
+        Fields.modified: collection.modified.timestamp(),
+        # Add related learning_package.key as context_key by default.
+        # If related contentlibrary is found, it will override this value below.
+        # Mostly contentlibrary.library_key == learning_package.key
+        Fields.context_key: collection.learning_package.key,
+    }
+    # Just in case learning_package is not related to a library
+    try:
+        context_key = collection.learning_package.contentlibrary.library_key
+        org = str(context_key.org)
+        doc.update({
+            Fields.context_key: str(context_key),
+            Fields.org: org,
+        })
+    except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
+        log.warning(f"Related library not found for {collection}")
+    doc[Fields.access_id] = _meili_access_id_from_context_key(doc[Fields.context_key])
+    # Add the breadcrumbs.
+    doc[Fields.breadcrumbs] = [{"display_name": collection.learning_package.title}]
+
+    return doc
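The `try/except LearningPackage.contentlibrary.RelatedObjectDoesNotExist` above relies on a general Django mechanism: reading the reverse side of a `OneToOneField` raises a `RelatedObjectDoesNotExist` exception, exposed on the descriptor itself, when no related row exists. A minimal sketch with hypothetical models (not the real openedx-learning models):

```python
# Hypothetical models mirroring the pattern in searchable_doc_for_collection().
from django.db import models


class LearningPackage(models.Model):
    key = models.CharField(max_length=255)


class ContentLibrary(models.Model):
    learning_package = models.OneToOneField(LearningPackage, on_delete=models.CASCADE)
    library_key = models.CharField(max_length=255)


def context_key_for(package: LearningPackage) -> str:
    try:
        # Reverse one-to-one access; raises if no ContentLibrary row points here.
        return package.contentlibrary.library_key
    except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
        # Fall back to the package's own key, as the new code does.
        return package.key
```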
42 changes: 41 additions & 1 deletion openedx/core/djangoapps/content/search/tests/test_api.py
@@ -6,12 +6,13 @@
 import copy
 
 from datetime import datetime, timezone
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, Mock, call, patch
 from opaque_keys.edx.keys import UsageKey
 
 import ddt
 from django.test import override_settings
 from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
 from organizations.tests.factories import OrganizationFactory
 
 from common.djangoapps.student.tests.factories import UserFactory
@@ -174,6 +175,28 @@ def setUp(self):
         tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
         tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")
 
+        # Create a collection:
+        self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
+        self.collection_dict = {
+            'id': 1,
+            'type': 'collection',
+            'display_name': 'my_collection',
+            'description': 'my collection description',
+            'context_key': 'lib:org1:lib',
+            'org': 'org1',
+            'created': created_date.timestamp(),
+            'modified': created_date.timestamp(),
+            "access_id": lib_access.id,
+            'breadcrumbs': [{'display_name': 'Library'}]
+        }
+        with freeze_time(created_date):
+            self.collection = authoring_api.create_collection(
+                learning_package_id=self.learning_package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+
     @override_settings(MEILISEARCH_ENABLED=False)
     def test_reindex_meilisearch_disabled(self, mock_meilisearch):
         with self.assertRaises(RuntimeError):
@@ -199,10 +222,27 @@ def test_reindex_meilisearch(self, mock_meilisearch):
             [
                 call([doc_sequential, doc_vertical]),
                 call([doc_problem1, doc_problem2]),
+                call([self.collection_dict]),
             ],
             any_order=True,
         )
 
+    @override_settings(MEILISEARCH_ENABLED=True)
+    @patch(
+        "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",
+        Mock(side_effect=Exception("Failed to generate document")),
+    )
+    def test_reindex_meilisearch_collection_error(self, mock_meilisearch):
+
+        mock_logger = Mock()
+        api.rebuild_index(mock_logger)
+        assert call(
+            [self.collection_dict]
+        ) not in mock_meilisearch.return_value.index.return_value.add_documents.mock_calls
+        mock_logger.assert_any_call(
+            f"Error indexing collection {self.collection}: Failed to generate document"
+        )
+
     @override_settings(MEILISEARCH_ENABLED=True)
     def test_reindex_meilisearch_library_block_error(self, mock_meilisearch):
 
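The new error-path test works because `Mock(side_effect=Exception(...))` makes every call to the patched `searchable_doc_for_collection` raise, which `rebuild_index` catches and reports through its `status_cb`. The same technique in isolation, with illustrative names that are not from the commit:

```python
# Standalone illustration of patching a helper with a raising Mock.
from unittest.mock import Mock, patch


def generate_doc(collection):
    return {"id": collection}


def index_all(collections, status_cb):
    for collection in collections:
        try:
            generate_doc(collection)
        except Exception as err:  # broad except, mirroring api.py
            status_cb(f"Error indexing collection {collection}: {err}")


logged = []
with patch(f"{__name__}.generate_doc", Mock(side_effect=Exception("boom"))):
    index_all(["c1"], logged.append)  # the patched helper raises on every call

assert logged == ["Error indexing collection c1: boom"]
```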
35 changes: 34 additions & 1 deletion openedx/core/djangoapps/content/search/tests/test_documents.py
@@ -1,8 +1,12 @@
 """
 Tests for the Studio content search documents (what gets stored in the index)
 """
+from datetime import datetime, timezone
 from organizations.models import Organization
 
+from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
+
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangolib.testing.utils import skip_unless_cms
 from xmodule.modulestore.django import modulestore
@@ -11,10 +15,12 @@
 
 try:
     # This import errors in the lms because content.search is not an installed app there.
-    from ..documents import searchable_doc_for_course_block, searchable_doc_tags
+    from ..documents import searchable_doc_for_course_block, searchable_doc_tags, searchable_doc_for_collection
     from ..models import SearchAccess
 except RuntimeError:
     searchable_doc_for_course_block = lambda x: x
     searchable_doc_tags = lambda x: x
+    searchable_doc_for_collection = lambda x: x
     SearchAccess = {}
 
 
@@ -198,3 +204,30 @@ def test_video_block_untagged(self):
             "content": {},
             # This video has no tags.
         }
+
+    def test_collection_with_no_library(self):
+        created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)
+        with freeze_time(created_date):
+            learning_package = authoring_api.create_learning_package(
+                key="course-v1:edX+toy+2012_Fall",
+                title="some learning_package",
+                description="some description",
+            )
+            collection = authoring_api.create_collection(
+                learning_package_id=learning_package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+        doc = searchable_doc_for_collection(collection)
+        assert doc == {
+            "id": collection.id,
+            "type": "collection",
+            "display_name": collection.title,
+            "description": collection.description,
+            "context_key": learning_package.key,
+            "access_id": self.toy_course_access_id,
+            "breadcrumbs": [{"display_name": learning_package.title}],
+            "created": created_date.timestamp(),
+            "modified": created_date.timestamp(),
+        }
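Both new tests rely on `freezegun` to make the auto-set `created`/`modified` timestamps deterministic: inside the `freeze_time` block, `datetime.now()` returns the frozen instant, so the expected `.timestamp()` values can be computed up front. A tiny self-contained illustration, using the same date as the tests:

```python
# Minimal freezegun illustration.
from datetime import datetime, timezone

from freezegun import freeze_time

created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)

with freeze_time(created_date):
    now = datetime.now(timezone.utc)  # frozen, not the real clock

assert now == created_date
assert now.timestamp() == created_date.timestamp()  # the value stored in the doc
```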
2 changes: 1 addition & 1 deletion requirements/constraints.txt
@@ -93,7 +93,7 @@ libsass==0.10.0
 click==8.1.6
 
 # pinning this version to avoid updates while the library is being developed
-openedx-learning==0.11.1
+openedx-learning==0.11.2
 
 # Open AI version 1.0.0 dropped support for openai.ChatCompletion which is currently in use in enterprise.
 openai<=0.28.1
2 changes: 1 addition & 1 deletion requirements/edx/base.txt
@@ -823,7 +823,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/kernel.in
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
     # via
     # -c requirements/edx/../constraints.txt
     # -r requirements/edx/kernel.in
2 changes: 1 addition & 1 deletion requirements/edx/development.txt
@@ -1372,7 +1372,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/testing.txt
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
     # via
     # -c requirements/edx/../constraints.txt
     # -r requirements/edx/doc.txt
2 changes: 1 addition & 1 deletion requirements/edx/doc.txt
@@ -982,7 +982,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/base.txt
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    # -c requirements/edx/../constraints.txt
    # -r requirements/edx/base.txt
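Once the new documents are in the index, clients can select them via the new `type` value. A hedged sketch of such a query — the index name and connection details are assumptions, and filtering on `type` presumes it is among the index's filterable attributes:

```python
# Sketch: searching only collection documents in the Studio index.
import meilisearch

client = meilisearch.Client("http://localhost:7700", "masterKey")  # assumed
results = client.index("studio_content").search(  # index name assumed
    "my_collection",
    {"filter": 'type = "collection"'},
)
for hit in results["hits"]:
    print(hit["display_name"], hit.get("description"))
```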