feat: index library collections in studio meilisearch index (#35324)

2024-08-27 23:15:46 +05:30
parent 9124e7bca4
commit c65478e487
9 changed files with 192 additions and 34 deletions
--- a/openedx/core/djangoapps/content/search/api.py
+++ b/openedx/core/djangoapps/content/search/api.py
@@ -19,6 +19,7 @@ from meilisearch.errors import MeilisearchError
 from meilisearch.models.task import TaskInfo
 from opaque_keys.edx.keys import UsageKey
 from opaque_keys.edx.locator import LibraryLocatorV2
+from openedx_learning.api import authoring as authoring_api
 from common.djangoapps.student.roles import GlobalStaff
 from rest_framework.request import Request
 from common.djangoapps.student.role_helpers import get_course_roles
@@ -31,8 +32,9 @@ from .documents import (
    Fields,
    meili_id_from_opaque_key,
    searchable_doc_for_course_block,
+    searchable_doc_for_collection,
    searchable_doc_for_library_block,
-    searchable_doc_tags
+    searchable_doc_tags,
 )

 log = logging.getLogger(__name__)
@@ -294,12 +296,16 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
    status_cb("Counting courses...")
    num_courses = CourseOverview.objects.count()

+    # Count only enabled collections: those are the only ones indexed in the Collections section below
+    status_cb("Counting collections...")
+    num_collections = authoring_api.get_collections(enabled=True).count()
+
    # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries
+    num_contexts = num_courses + num_libraries + num_collections
    num_contexts_done = 0  # How many courses/libraries we've indexed
    num_blocks_done = 0  # How many individual components/XBlocks we've indexed

-    status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
+    status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
    with _using_temp_index(status_cb) as temp_index_name:
        ############## Configure the index ##############

@@ -332,6 +338,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
            Fields.block_id,
            Fields.content,
            Fields.tags,
+            Fields.description,
            # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
            # are searchable only if at least one document in the index has a value. If we didn't list them here and,
            # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
@@ -363,8 +370,8 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:

        ############## Libraries ##############
        status_cb("Indexing libraries...")
-        for lib_key in lib_keys:
-            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+
+        def index_library(lib_key: str) -> list:
            docs = []
            for component in lib_api.get_library_components(lib_key):
                try:
@@ -375,48 +382,88 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
                    docs.append(doc)
                except Exception as err:  # pylint: disable=broad-except
                    status_cb(f"Error indexing library component {component}: {err}")
-                finally:
-                    num_blocks_done += 1
            if docs:
                try:
                    # Add all the docs in this library at once (usually faster than adding one at a time):
                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
                except (TypeError, KeyError, MeilisearchError) as err:
                    status_cb(f"Error indexing library {lib_key}: {err}")
+            return docs

+        for lib_key in lib_keys:
+            status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
+            lib_docs = index_library(lib_key)
+            num_blocks_done += len(lib_docs)
            num_contexts_done += 1

        ############## Courses ##############
        status_cb("Indexing courses...")
        # To reduce memory usage on large instances, split up the CourseOverviews into pages of 1,000 courses:
+
+        def index_course(course: CourseOverview) -> list:
+            """Index the blocks found under one course into the temp index and return the generated docs."""
+            docs = []
+            # Pre-fetch the course with all of its children:
+            course = store.get_course(course.id, depth=None)
+
+            def add_with_children(block):
+                """ Recursively index the given XBlock/component """
+                doc = searchable_doc_for_course_block(block)
+                doc.update(searchable_doc_tags(block.usage_key))
+                docs.append(doc)
+                _recurse_children(block, add_with_children)
+
+            # Index course children
+            _recurse_children(course, add_with_children)
+            if docs:
+                # Add all the docs in this course at once (usually faster than adding one at a time):
+                _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+            return docs
+
        paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
        for p in paginator.page_range:
            for course in paginator.page(p).object_list:
                status_cb(
                    f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
                )
-                docs = []
-
-                # Pre-fetch the course with all of its children:
-                course = store.get_course(course.id, depth=None)
-
-                def add_with_children(block):
-                    """ Recursively index the given XBlock/component """
-                    doc = searchable_doc_for_course_block(block)
-                    doc.update(searchable_doc_tags(block.usage_key))
-                    docs.append(doc)  # pylint: disable=cell-var-from-loop
-                    _recurse_children(block, add_with_children)  # pylint: disable=cell-var-from-loop
-
-                # Index course children
-                _recurse_children(course, add_with_children)
-
-                if docs:
-                    # Add all the docs in this course at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                course_docs = index_course(course)
                num_contexts_done += 1
-                num_blocks_done += len(docs)
+                num_blocks_done += len(course_docs)

-    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.")
+        ############## Collections ##############
+        status_cb("Indexing collections...")
+
+        def index_collection_batch(batch, num_done, page_num) -> int:
+            """Index one page of collections; return the updated count of contexts processed so far."""
+            docs = []
+            for collection in batch:
+                status_cb(
+                    f"{num_done + 1}/{num_contexts}. "
+                    f"Now indexing collection {collection.title} ({collection.id})"
+                )
+                try:
+                    # TODO: also merge in searchable_doc_tags() once collections can be tagged.
+                    docs.append(searchable_doc_for_collection(collection))
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+                finally:
+                    num_done += 1
+
+            if docs:
+                try:
+                    # Add the whole page of docs at once (usually faster than adding one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    # page_num is passed in explicitly rather than read from the caller's loop variable.
+                    status_cb(f"Error indexing collection batch {page_num}: {err}")
+            return num_done
+
+        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
+        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
+        for p in paginator.page_range:
+            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done, p)
+
+    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")


 def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
--- a/openedx/core/djangoapps/content/search/documents.py
+++ b/openedx/core/djangoapps/content/search/documents.py
@@ -13,6 +13,7 @@ from openedx.core.djangoapps.content.search.models import SearchAccess
 from openedx.core.djangoapps.content_libraries import api as lib_api
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangoapps.xblock import api as xblock_api
+from openedx_learning.api.authoring_models import LearningPackage

 log = logging.getLogger(__name__)

@@ -27,6 +28,7 @@ class Fields:
    type = "type"  # DocType.course_block or DocType.library_block (see below)
    block_id = "block_id"  # The block_id part of the usage key. Sometimes human-readable, sometimes a random hex ID
    display_name = "display_name"
+    description = "description"
    modified = "modified"
    created = "created"
    last_published = "last_published"
@@ -66,6 +68,7 @@ class DocType:
    """
    course_block = "course_block"
    library_block = "library_block"
+    collection = "collection"


 def meili_id_from_opaque_key(usage_key: UsageKey) -> str:
@@ -276,3 +279,38 @@ def searchable_doc_for_course_block(block) -> dict:
    doc.update(_fields_from_block(block))

    return doc
+
+
+def searchable_doc_for_collection(collection) -> dict:
+    """
+    Generate a dictionary document suitable for ingestion into a search engine
+    like Meilisearch or Elasticsearch, so that the given collection can be
+    found using faceted search.
+
+    The context key defaults to the collection's learning_package.key; when that
+    package is linked to a content library, the library key and org are used instead.
+    """
+    doc = {
+        Fields.id: collection.id,
+        Fields.type: DocType.collection,
+        Fields.display_name: collection.title,
+        Fields.description: collection.description,
+        Fields.created: collection.created.timestamp(),
+        Fields.modified: collection.modified.timestamp(),
+        # Mostly contentlibrary.library_key == learning_package.key
+        Fields.context_key: collection.learning_package.key,
+    }
+    # Just in case learning_package is not related to a library
+    try:
+        library_key = collection.learning_package.contentlibrary.library_key
+    except LearningPackage.contentlibrary.RelatedObjectDoesNotExist:
+        # Lazy %-style args: the message is only rendered if the record is actually emitted.
+        log.warning("Related library not found for %s", collection)
+    else:
+        doc[Fields.context_key] = str(library_key)
+        doc[Fields.org] = str(library_key.org)
+    doc[Fields.access_id] = _meili_access_id_from_context_key(doc[Fields.context_key])
+    # Add the breadcrumbs.
+    doc[Fields.breadcrumbs] = [{"display_name": collection.learning_package.title}]
+
+    return doc
--- a/openedx/core/djangoapps/content/search/tests/test_api.py
+++ b/openedx/core/djangoapps/content/search/tests/test_api.py
@@ -6,12 +6,13 @@ from __future__ import annotations
 import copy

 from datetime import datetime, timezone
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, Mock, call, patch
 from opaque_keys.edx.keys import UsageKey

 import ddt
 from django.test import override_settings
 from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
 from organizations.tests.factories import OrganizationFactory

 from common.djangoapps.student.tests.factories import UserFactory
@@ -174,6 +175,28 @@ class TestSearchApi(ModuleStoreTestCase):
        tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
        tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")

+        # Create a collection:
+        self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
+        self.collection_dict = {
+            'id': 1,
+            'type': 'collection',
+            'display_name': 'my_collection',
+            'description': 'my collection description',
+            'context_key': 'lib:org1:lib',
+            'org': 'org1',
+            'created': created_date.timestamp(),
+            'modified': created_date.timestamp(),
+            "access_id": lib_access.id,
+            'breadcrumbs': [{'display_name': 'Library'}]
+        }
+        with freeze_time(created_date):
+            self.collection = authoring_api.create_collection(
+                learning_package_id=self.learning_package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+
    @override_settings(MEILISEARCH_ENABLED=False)
    def test_reindex_meilisearch_disabled(self, mock_meilisearch):
        with self.assertRaises(RuntimeError):
@@ -199,10 +222,27 @@ class TestSearchApi(ModuleStoreTestCase):
            [
                call([doc_sequential, doc_vertical]),
                call([doc_problem1, doc_problem2]),
+                call([self.collection_dict]),
            ],
            any_order=True,
        )

+    @override_settings(MEILISEARCH_ENABLED=True)
+    @patch(
+        "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",
+        Mock(side_effect=Exception("Failed to generate document")),
+    )
+    def test_reindex_meilisearch_collection_error(self, mock_meilisearch):
+        """A failure while building a collection doc is reported to the callback and the doc is not indexed."""
+        mock_logger = Mock()
+        api.rebuild_index(mock_logger)
+        assert call(
+            [self.collection_dict]
+        ) not in mock_meilisearch.return_value.index.return_value.add_documents.mock_calls
+        mock_logger.assert_any_call(
+            f"Error indexing collection {self.collection}: Failed to generate document"
+        )
+
    @override_settings(MEILISEARCH_ENABLED=True)
    def test_reindex_meilisearch_library_block_error(self, mock_meilisearch):

--- a/openedx/core/djangoapps/content/search/tests/test_documents.py
+++ b/openedx/core/djangoapps/content/search/tests/test_documents.py
@@ -1,8 +1,12 @@
 """
 Tests for the Studio content search documents (what gets stored in the index)
 """
+from datetime import datetime, timezone
 from organizations.models import Organization

+from freezegun import freeze_time
+from openedx_learning.api import authoring as authoring_api
+
 from openedx.core.djangoapps.content_tagging import api as tagging_api
 from openedx.core.djangolib.testing.utils import skip_unless_cms
 from xmodule.modulestore.django import modulestore
@@ -11,10 +15,12 @@ from xmodule.modulestore.tests.factories import BlockFactory, ToyCourseFactory

 try:
    # This import errors in the lms because content.search is not an installed app there.
-    from ..documents import searchable_doc_for_course_block, searchable_doc_tags
+    from ..documents import searchable_doc_for_course_block, searchable_doc_tags, searchable_doc_for_collection
    from ..models import SearchAccess
 except RuntimeError:
    searchable_doc_for_course_block = lambda x: x
+    searchable_doc_tags = lambda x: x
+    searchable_doc_for_collection = lambda x: x
    SearchAccess = {}


@@ -198,3 +204,30 @@ class StudioDocumentsTest(SharedModuleStoreTestCase):
            "content": {},
            # This video has no tags.
        }
+
+    def test_collection_with_no_library(self):
+        """Without a related library, the doc falls back to the learning package key and carries no org."""
+        frozen_at = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)
+        with freeze_time(frozen_at):
+            package = authoring_api.create_learning_package(
+                key="course-v1:edX+toy+2012_Fall", title="some learning_package", description="some description",
+            )
+            collection = authoring_api.create_collection(
+                learning_package_id=package.id,
+                title="my_collection",
+                created_by=None,
+                description="my collection description"
+            )
+        timestamp = frozen_at.timestamp()
+        doc = searchable_doc_for_collection(collection)
+        assert doc == {
+            "id": collection.id,
+            "type": "collection",
+            "display_name": collection.title,
+            "description": collection.description,
+            "context_key": package.key,
+            "access_id": self.toy_course_access_id,
+            "breadcrumbs": [{"display_name": package.title}],
+            "created": timestamp,
+            "modified": timestamp,
+        }
--- a/requirements/constraints.txt
+++ b/requirements/constraints.txt
@@ -93,7 +93,7 @@ libsass==0.10.0
 click==8.1.6

 # pinning this version to avoid updates while the library is being developed
-openedx-learning==0.11.1
+openedx-learning==0.11.2

 # Open AI version 1.0.0 dropped support for openai.ChatCompletion which is currently in use in enterprise.
 openai<=0.28.1
--- a/requirements/edx/base.txt
+++ b/requirements/edx/base.txt
@@ -823,7 +823,7 @@ openedx-filters==1.9.0
    #   -r requirements/edx/kernel.in
    #   lti-consumer-xblock
    #   ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    #   -c requirements/edx/../constraints.txt
    #   -r requirements/edx/kernel.in
--- a/requirements/edx/development.txt
+++ b/requirements/edx/development.txt
@@ -1372,7 +1372,7 @@ openedx-filters==1.9.0
    #   -r requirements/edx/testing.txt
    #   lti-consumer-xblock
    #   ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    #   -c requirements/edx/../constraints.txt
    #   -r requirements/edx/doc.txt
--- a/requirements/edx/doc.txt
+++ b/requirements/edx/doc.txt
@@ -982,7 +982,7 @@ openedx-filters==1.9.0
    #   -r requirements/edx/base.txt
    #   lti-consumer-xblock
    #   ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    #   -c requirements/edx/../constraints.txt
    #   -r requirements/edx/base.txt
--- a/requirements/edx/testing.txt
+++ b/requirements/edx/testing.txt
@@ -1033,7 +1033,7 @@ openedx-filters==1.9.0
    #   -r requirements/edx/base.txt
    #   lti-consumer-xblock
    #   ora2
-openedx-learning==0.11.1
+openedx-learning==0.11.2
    # via
    #   -c requirements/edx/../constraints.txt
    #   -r requirements/edx/base.txt