feat: index library collections in studio meilisearch index (#35324)
navinkarkera authored and rediris committed Aug 27, 2024
1 parent 09c78fd commit 2c66cf0
Showing 9 changed files with 189 additions and 31 deletions.
95 changes: 71 additions & 24 deletions openedx/core/djangoapps/content/search/api.py
@@ -19,6 +19,7 @@
 from meilisearch.models.task import TaskInfo
 from opaque_keys.edx.keys import UsageKey
 from opaque_keys.edx.locator import LibraryLocatorV2
+from openedx_learning.api import authoring as authoring_api
 from common.djangoapps.student.roles import GlobalStaff
 from rest_framework.request import Request
 from common.djangoapps.student.role_helpers import get_course_roles
@@ -31,8 +32,9 @@
     Fields,
     meili_id_from_opaque_key,
     searchable_doc_for_course_block,
+    searchable_doc_for_collection,
     searchable_doc_for_library_block,
-    searchable_doc_tags
+    searchable_doc_tags,
 )
 
 log = logging.getLogger(__name__)
@@ -294,12 +296,16 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
     status_cb("Counting courses...")
     num_courses = CourseOverview.objects.count()
 
+    # Get the list of collections
+    status_cb("Counting collections...")
+    num_collections = authoring_api.get_collections().count()
+
     # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries
+    num_contexts = num_courses + num_libraries + num_collections
     num_contexts_done = 0  # How many courses/libraries we've indexed
     num_blocks_done = 0  # How many individual components/XBlocks we've indexed
 
-    status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
+    status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
     with _using_temp_index(status_cb) as temp_index_name:
         ############## Configure the index ##############
 
@@ -332,6 +338,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
         Fields.block_id,
         Fields.content,
         Fields.tags,
+        Fields.description,
         # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
         # are searchable only if at least one document in the index has a value. If we didn't list them here and,
         # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
@@ -363,8 +370,8 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
 
         ############## Libraries ##############
         status_cb("Indexing libraries...")
-        for lib_key in lib_keys:
-            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+
+        def index_library(lib_key: str) -> list:
             docs = []
             for component in lib_api.get_library_components(lib_key):
                 try:
@@ -375,48 +382,88 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
                     docs.append(doc)
                 except Exception as err:  # pylint: disable=broad-except
                     status_cb(f"Error indexing library component {component}: {err}")
-                finally:
-                    num_blocks_done += 1
             if docs:
-                # Add all the docs in this library at once (usually faster than adding one at a time):
-                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                try:
+                    # Add all the docs in this library at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing library {lib_key}: {err}")
+            return docs
+
+        for lib_key in lib_keys:
+            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+            lib_docs = index_library(lib_key)
+            num_blocks_done += len(lib_docs)
             num_contexts_done += 1
 
         ############## Courses ##############
         status_cb("Indexing courses...")
         # To reduce memory usage on large instances, split up the CourseOverviews into pages of 1,000 courses:
+
+        def index_course(course: CourseOverview) -> list:
+            docs = []
+            # Pre-fetch the course with all of its children:
+            course = store.get_course(course.id, depth=None)
+
+            def add_with_children(block):
+                """ Recursively index the given XBlock/component """
+                doc = searchable_doc_for_course_block(block)
+                doc.update(searchable_doc_tags(block.usage_key))
+                docs.append(doc)  # pylint: disable=cell-var-from-loop
+                _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
+
+            # Index course children
+            _recurse_children(course, add_with_children)
+
+            if docs:
+                # Add all the docs in this course at once (usually faster than adding one at a time):
+                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+            return docs
+
         paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
         for p in paginator.page_range:
             for course in paginator.page(p).object_list:
                 status_cb(
                     f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
                 )
-                docs = []
-
-                # Pre-fetch the course with all of its children:
-                course = store.get_course(course.id, depth=None)
-
-                def add_with_children(block):
-                    """ Recursively index the given XBlock/component """
-                    doc = searchable_doc_for_course_block(block)
-                    doc.update(searchable_doc_tags(block.usage_key))
-                    docs.append(doc)  # pylint: disable=cell-var-from-loop
-                    _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
-
-                # Index course children
-                _recurse_children(course, add_with_children)
-
-                if docs:
-                    # Add all the docs in this course at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-                num_contexts_done += 1
-                num_blocks_done += len(docs)
+                course_docs = index_course(course)
+                num_contexts_done += 1
+                num_blocks_done += len(course_docs)
+
+        ############## Collections ##############
+        status_cb("Indexing collections...")
+
+        def index_collection_batch(batch, num_contexts_done) -> int:
+            docs = []
+            for collection in batch:
+                status_cb(
+                    f"{num_contexts_done + 1}/{num_contexts}. "
+                    f"Now indexing collection {collection.title} ({collection.id})"
+                )
+                try:
+                    doc = searchable_doc_for_collection(collection)
+                    # Uncomment below line once collections are tagged.
+                    # doc.update(searchable_doc_tags(collection.id))
+                    docs.append(doc)
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+                finally:
+                    num_contexts_done += 1
+
+            if docs:
+                try:
+                    # Add docs in batch of 100 at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing collection batch {p}: {err}")
+            return num_contexts_done
+
+        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
+        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
+        for p in paginator.page_range:
+            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done)
 
-    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.")
+    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")
 
 
 def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
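The refactor above extracts per-context indexing into helpers (`index_library`, `index_course`, `index_collection_batch`) that each build a list of documents and push it to Meilisearch in a single `add_documents` call per batch. A standalone sketch of that pattern, assuming a local Meilisearch instance and a recent `meilisearch` Python client; `build_doc` and the index name are hypothetical stand-ins, not code from this commit:

```python
# Sketch only: connection details, index name, and build_doc() are assumptions;
# Paginator and the meilisearch client calls are real APIs.
import meilisearch
from django.core.paginator import Paginator


def build_doc(obj) -> dict:
    # Hypothetical stand-in for the searchable_doc_for_* helpers.
    return {"id": obj.id, "display_name": str(obj)}


def index_in_batches(client: meilisearch.Client, index_name: str, queryset, batch_size: int = 100):
    """Build docs one page at a time and push each page as a single batch."""
    paginator = Paginator(queryset, batch_size)  # keeps memory use bounded on large instances
    for page_number in paginator.page_range:
        docs = [build_doc(obj) for obj in paginator.page(page_number).object_list]
        if docs:
            # One add_documents call per page is usually faster than one per document.
            task_info = client.index(index_name).add_documents(docs)
            client.wait_for_task(task_info.task_uid)  # block until Meilisearch finishes


# Usage (URL, key, and index name assumed):
# client = meilisearch.Client("http://localhost:7700", "masterKey")
# index_in_batches(client, "studio_content_temp", MyModel.objects.all())
```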
38 changes: 38 additions & 0 deletions openedx/core/djangoapps/content/search/documents.py
@@ -13,6 +13,7 @@
 from openedx.core.djangoapps.content_libraries import api as lib_api
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangoapps.xblock import api as xblock_api
+from openedx_learning.api.authoring_models import LearningPackage
 
 log = logging.getLogger(__name__)
 
@@ -27,6 +28,7 @@ class Fields:
     type = "type"  # DocType.course_block or DocType.library_block (see below)
     block_id = "block_id"  # The block_id part of the usage key. Sometimes human-readable, sometimes a random hex ID
     display_name = "display_name"
+    description = "description"
     modified = "modified"
     created = "created"
     last_published = "last_published"
@@ -66,6 +68,7 @@ class DocType:
     """
     course_block = "course_block"
     library_block = "library_block"
+    collection = "collection"
 
 
 def meili_id_from_opaque_key(usage_key: UsageKey) -> str:
@@ -276,3 +279,38 @@ def searchable_doc_for_course_block(block) -> dict:
     doc.update(_fields_from_block(block))
 
     return doc
+
+
+def searchable_doc_for_collection(collection) -> dict:
+    """
+    Generate a dictionary document suitable for ingestion into a search engine
+    like Meilisearch or Elasticsearch, so that the given collection can be
+    found using faceted search.
+    """
+    doc = {
+        Fields.id: collection.id,
+        Fields.type: DocType.collection,
+        Fields.display_name: collection.title,
+        Fields.description: collection.description,
+        Fields.created: collection.created.timestamp(),
+        Fields.modified: collection.modified.timestamp(),
+        # Add related learning_package.key as context_key by default.
+        # If related contentlibrary is found, it will override this value below.
+        # Mostly contentlibrary.library_key == learning_package.key
+        Fields.context_key: collection.learning_package.key,
+    }
+    # Just in case learning_package is not related to a library
+    try:
+        context_key = collection.learning_package.contentlibrary.library_key
+        org = str(context_key.org)
+        doc.update({
+            Fields.context_key: str(context_key),
+            Fields.org: org,
+        })
+    except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
+        log.warning(f"Related library not found for {collection}")
+    doc[Fields.access_id] = _meili_access_id_from_context_key(doc[Fields.context_key])
+    # Add the breadcrumbs.
+    doc[Fields.breadcrumbs] = [{"display_name": collection.learning_package.title}]
+
+    return doc
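The `try/except LearningPackage.contentlibrary.RelatedObjectDoesNotExist` above relies on a general Django mechanism: reading the reverse side of a `OneToOneField` raises a `RelatedObjectDoesNotExist` exception, exposed on the descriptor itself, when no related row exists. A minimal sketch with hypothetical models (not the real openedx-learning models):

```python
# Hypothetical models mirroring the pattern in searchable_doc_for_collection().
from django.db import models


class LearningPackage(models.Model):
    key = models.CharField(max_length=255)


class ContentLibrary(models.Model):
    learning_package = models.OneToOneField(LearningPackage, on_delete=models.CASCADE)
    library_key = models.CharField(max_length=255)


def context_key_for(package: LearningPackage) -> str:
    try:
        # Reverse one-to-one access; raises if no ContentLibrary row points here.
        return package.contentlibrary.library_key
    except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
        # Fall back to the package's own key, as the new code does.
        return package.key
```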
42 changes: 41 additions & 1 deletion openedx/core/djangoapps/content/search/tests/test_api.py
@@ -6,12 +6,13 @@
 import copy
 
 from datetime import datetime, timezone
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, Mock, call, patch
 from opaque_keys.edx.keys import UsageKey
 
 import ddt
 from django.test import override_settings
 from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
 from organizations.tests.factories import OrganizationFactory
 
 from common.djangoapps.student.tests.factories import UserFactory
@@ -174,6 +175,28 @@ def setUp(self):
         tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
         tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")
 
+        # Create a collection:
+        self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
+        self.collection_dict = {
+            'id': 1,
+            'type': 'collection',
+            'display_name': 'my_collection',
+            'description': 'my collection description',
+            'context_key': 'lib:org1:lib',
+            'org': 'org1',
+            'created': created_date.timestamp(),
+            'modified': created_date.timestamp(),
+            "access_id": lib_access.id,
+            'breadcrumbs': [{'display_name': 'Library'}]
+        }
+        with freeze_time(created_date):
+            self.collection = authoring_api.create_collection(
+                learning_package_id=self.learning_package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+
     @override_settings(MEILISEARCH_ENABLED=False)
     def test_reindex_meilisearch_disabled(self, mock_meilisearch):
         with self.assertRaises(RuntimeError):
@@ -199,10 +222,27 @@ def test_reindex_meilisearch(self, mock_meilisearch):
             [
                 call([doc_sequential, doc_vertical]),
                 call([doc_problem1, doc_problem2]),
+                call([self.collection_dict]),
             ],
             any_order=True,
         )
 
+    @override_settings(MEILISEARCH_ENABLED=True)
+    @patch(
+        "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",
+        Mock(side_effect=Exception("Failed to generate document")),
+    )
+    def test_reindex_meilisearch_collection_error(self, mock_meilisearch):
+
+        mock_logger = Mock()
+        api.rebuild_index(mock_logger)
+        assert call(
+            [self.collection_dict]
+        ) not in mock_meilisearch.return_value.index.return_value.add_documents.mock_calls
+        mock_logger.assert_any_call(
+            f"Error indexing collection {self.collection}: Failed to generate document"
+        )
+
     @override_settings(MEILISEARCH_ENABLED=True)
     def test_reindex_meilisearch_library_block_error(self, mock_meilisearch):
 
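The new error-path test works because `Mock(side_effect=Exception(...))` makes every call to the patched `searchable_doc_for_collection` raise, which `rebuild_index` catches and reports through its `status_cb`. The same technique in isolation, with illustrative names that are not from the commit:

```python
# Standalone illustration of patching a helper with a raising Mock.
from unittest.mock import Mock, patch


def generate_doc(collection):
    return {"id": collection}


def index_all(collections, status_cb):
    for collection in collections:
        try:
            generate_doc(collection)
        except Exception as err:  # broad except, mirroring api.py
            status_cb(f"Error indexing collection {collection}: {err}")


logged = []
with patch(f"{__name__}.generate_doc", Mock(side_effect=Exception("boom"))):
    index_all(["c1"], logged.append)  # the patched helper raises on every call

assert logged == ["Error indexing collection c1: boom"]
```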
35 changes: 34 additions & 1 deletion openedx/core/djangoapps/content/search/tests/test_documents.py
@@ -1,8 +1,12 @@
 """
 Tests for the Studio content search documents (what gets stored in the index)
 """
+from datetime import datetime, timezone
 from organizations.models import Organization
 
+from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
+
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangolib.testing.utils import skip_unless_cms
 from xmodule.modulestore.django import modulestore
@@ -11,10 +15,12 @@
 
 try:
     # This import errors in the lms because content.search is not an installed app there.
-    from ..documents import searchable_doc_for_course_block, searchable_doc_tags
+    from ..documents import searchable_doc_for_course_block, searchable_doc_tags, searchable_doc_for_collection
     from ..models import SearchAccess
 except RuntimeError:
     searchable_doc_for_course_block = lambda x: x
     searchable_doc_tags = lambda x: x
+    searchable_doc_for_collection = lambda x: x
     SearchAccess = {}
 
 
@@ -198,3 +204,30 @@ def test_video_block_untagged(self):
             "content": {},
             # This video has no tags.
         }
+
+    def test_collection_with_no_library(self):
+        created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)
+        with freeze_time(created_date):
+            learning_package = authoring_api.create_learning_package(
+                key="course-v1:edX+toy+2012_Fall",
+                title="some learning_package",
+                description="some description",
+            )
+            collection = authoring_api.create_collection(
+                learning_package_id=learning_package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+        doc = searchable_doc_for_collection(collection)
+        assert doc == {
+            "id": collection.id,
+            "type": "collection",
+            "display_name": collection.title,
+            "description": collection.description,
+            "context_key": learning_package.key,
+            "access_id": self.toy_course_access_id,
+            "breadcrumbs": [{"display_name": learning_package.title}],
+            "created": created_date.timestamp(),
+            "modified": created_date.timestamp(),
+        }
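Both new tests rely on `freezegun` to make the auto-set `created`/`modified` timestamps deterministic: inside the `freeze_time` block, `datetime.now()` returns the frozen instant, so the expected `.timestamp()` values can be computed up front. A tiny self-contained illustration, using the same date as the tests:

```python
# Minimal freezegun illustration.
from datetime import datetime, timezone

from freezegun import freeze_time

created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)

with freeze_time(created_date):
    now = datetime.now(timezone.utc)  # frozen, not the real clock

assert now == created_date
assert now.timestamp() == created_date.timestamp()  # the value stored in the doc
```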
2 changes: 1 addition & 1 deletion requirements/constraints.txt
@@ -93,7 +93,7 @@ libsass==0.10.0
 click==8.1.6
 
 # pinning this version to avoid updates while the library is being developed
-openedx-learning==0.11.1
+openedx-learning==0.11.2
 
 # Open AI version 1.0.0 dropped support for openai.ChatCompletion which is currently in use in enterprise.
 openai<=0.28.1
2 changes: 1 addition & 1 deletion requirements/edx/base.txt
@@ -823,7 +823,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/kernel.in
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
     # via
     # -c requirements/edx/../constraints.txt
     # -r requirements/edx/kernel.in
2 changes: 1 addition & 1 deletion requirements/edx/development.txt
@@ -1372,7 +1372,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/testing.txt
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
     # via
     # -c requirements/edx/../constraints.txt
     # -r requirements/edx/doc.txt
2 changes: 1 addition & 1 deletion requirements/edx/doc.txt
@@ -982,7 +982,7 @@ openedx-filters==1.9.0
     # -r requirements/edx/base.txt
     # lti-consumer-xblock
     # ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    # -c requirements/edx/../constraints.txt
    # -r requirements/edx/base.txt
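Once the new documents are in the index, clients can select them via the new `type` value. A hedged sketch of such a query — the index name and connection details are assumptions, and filtering on `type` presumes it is among the index's filterable attributes:

```python
# Sketch: searching only collection documents in the Studio index.
import meilisearch

client = meilisearch.Client("http://localhost:7700", "masterKey")  # assumed
results = client.index("studio_content").search(  # index name assumed
    "my_collection",
    {"filter": 'type = "collection"'},
)
for hit in results["hits"]:
    print(hit["display_name"], hit.get("description"))
```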