feat: incremental reindex_studio management command (#35864)

This allows large instances to run an (interruptible, resumable) reindex task that can cover thousands of courses.
This commit is contained in:
Daniel Valenzuela
2024-12-06 17:30:38 -03:00
committed by GitHub
parent 3196ceb4a0
commit b07464ba2d
6 changed files with 342 additions and 84 deletions

View File

@@ -5,7 +5,7 @@ from __future__ import annotations
import logging
import time
from contextlib import contextmanager
from contextlib import contextmanager, nullcontext
from datetime import datetime, timedelta, timezone
from functools import wraps
from typing import Callable, Generator
@@ -24,7 +24,14 @@ from common.djangoapps.student.roles import GlobalStaff
from rest_framework.request import Request
from common.djangoapps.student.role_helpers import get_course_roles
from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
from openedx.core.djangoapps.content.search.models import get_access_ids_for_request
from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted
from openedx.core.djangoapps.content.search.index_config import (
INDEX_DISTINCT_ATTRIBUTE,
INDEX_FILTERABLE_ATTRIBUTES,
INDEX_SEARCHABLE_ATTRIBUTES,
INDEX_SORTABLE_ATTRIBUTES,
INDEX_RANKING_RULES,
)
from openedx.core.djangoapps.content_libraries import api as lib_api
from xmodule.modulestore.django import modulestore
@@ -217,6 +224,42 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat
_wait_for_meili_task(client.delete_index(temp_index_name))
def _index_is_empty(index_name: str) -> bool:
    """
    Return True if the given Meilisearch index contains no documents.

    Args:
        index_name (str): The name of the index to check
    """
    stats = _get_meilisearch_client().get_index(index_name).get_stats()
    return not stats.number_of_documents
def _configure_index(index_name):
    """
    Apply the standard settings to the given index.

    These settings are best changed while the index is empty; applying them to
    a populated index makes Meilisearch "re-index all documents in the index",
    which can take some time.

    Args:
        index_name (str): The name of the index to configure
    """
    index = _get_meilisearch_client().index(index_name)
    # usage_key is not the primary key for the index, but it must nevertheless be unique:
    index.update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE)
    # Attributes available for filtering/faceted search:
    index.update_filterable_attributes(INDEX_FILTERABLE_ATTRIBUTES)
    # Attributes scanned by keyword search, most important first:
    index.update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES)
    # Attributes that search results may be sorted by:
    index.update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES)
    # Ranking rules that let the (optional) "sort" parameter take precedence over keyword relevance.
    # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
    index.update_ranking_rules(INDEX_RANKING_RULES)
def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None:
"""
Recurse the children of an XBlock and call the given function for each
@@ -279,8 +322,75 @@ def is_meilisearch_enabled() -> bool:
return False
# pylint: disable=too-many-statements
def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
def reset_index(status_cb: Callable[[str], None] | None = None) -> None:
    """
    Reset the Meilisearch index, deleting all documents and reconfiguring it.

    Args:
        status_cb: optional callable that receives progress messages;
            defaults to logging at INFO level.
    """
    report = log.info if status_cb is None else status_cb

    report("Creating new empty index...")
    # Build the replacement index empty, configure it, then swap it in.
    with _using_temp_index(report) as temp_index_name:
        _configure_index(temp_index_name)
        report("Index recreated!")
    report("Index reset complete.")
def _is_index_configured(index_name: str) -> bool:
    """
    Check if an index has every expected setting applied.

    Args:
        index_name (str): The name of the index to check
    """
    client = _get_meilisearch_client()
    current_settings = client.get_index(index_name).get_settings()
    expected_settings = {
        "distinctAttribute": INDEX_DISTINCT_ATTRIBUTE,
        "filterableAttributes": INDEX_FILTERABLE_ATTRIBUTES,
        "searchableAttributes": INDEX_SEARCHABLE_ATTRIBUTES,
        "sortableAttributes": INDEX_SORTABLE_ATTRIBUTES,
        "rankingRules": INDEX_RANKING_RULES,
    }
    for setting_name, expected in expected_settings.items():
        actual = current_settings.get(setting_name, [])
        if isinstance(expected, list):
            # List-valued settings are compared as sets (ordering differences
            # are tolerated by this check).
            if set(actual) != set(expected):
                return False
        elif actual != expected:
            return False
    return True
def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None:
    """
    Initialize the Meilisearch index, creating it and configuring it if it doesn't exist.

    Args:
        status_cb: optional callable for progress messages (defaults to log.info).
        warn_cb: optional callable for warnings (defaults to log.warning).
    """
    if status_cb is None:
        status_cb = log.info
    if warn_cb is None:
        warn_cb = log.warning

    # No index yet: create and configure one from scratch.
    if not _index_exists(STUDIO_INDEX_NAME):
        reset_index(status_cb)
        return

    # An existing-but-empty index still needs an initial reindex run.
    if _index_is_empty(STUDIO_INDEX_NAME):
        warn_cb(
            "The studio search index is empty. Please run ./manage.py cms reindex_studio"
            " --experimental [--incremental]"
        )
        return
    # An existing index with stale settings needs a full rebuild.
    if not _is_index_configured(STUDIO_INDEX_NAME):
        warn_cb(
            "A rebuild of the index is required. Please run ./manage.py cms reindex_studio"
            " --experimental [--incremental]"
        )
        return
    status_cb("Index already exists and is configured.")
def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: # lint-amnesty, pylint: disable=too-many-statements
"""
Rebuild the Meilisearch index from scratch
"""
@@ -292,7 +402,14 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
# Get the lists of libraries
status_cb("Counting libraries...")
lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
keys_indexed = []
if incremental:
keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True))
lib_keys = [
lib.library_key
for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id")
if lib.library_key not in keys_indexed
]
num_libraries = len(lib_keys)
# Get the list of courses
@@ -300,88 +417,25 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
num_courses = CourseOverview.objects.count()
# Some counters so we can track our progress as indexing progresses:
num_contexts = num_courses + num_libraries
num_contexts_done = 0 # How many courses/libraries we've indexed
num_libs_skipped = len(keys_indexed)
num_contexts = num_courses + num_libraries + num_libs_skipped
num_contexts_done = 0 + num_libs_skipped # How many courses/libraries we've indexed
num_blocks_done = 0 # How many individual components/XBlocks we've indexed
status_cb(f"Found {num_courses} courses, {num_libraries} libraries.")
with _using_temp_index(status_cb) as temp_index_name:
with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name:
############## Configure the index ##############
# The following index settings are best changed on an empty index.
# Changing them on a populated index will "re-index all documents in the index, which can take some time"
# The index settings are best changed on an empty index.
# Changing them on a populated index will "re-index all documents in the index", which can take some time
# and use more RAM. Instead, we configure an empty index then populate it one course/library at a time.
# Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique):
client.index(temp_index_name).update_distinct_attribute(Fields.usage_key)
# Mark which attributes can be used for filtering/faceted search:
client.index(temp_index_name).update_filterable_attributes([
# Get specific block/collection using combination of block_id and context_key
Fields.block_id,
Fields.block_type,
Fields.context_key,
Fields.usage_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.type,
Fields.access_id,
Fields.last_published,
Fields.content + "." + Fields.problem_types,
])
# Mark which attributes are used for keyword search, in order of importance:
client.index(temp_index_name).update_searchable_attributes([
# Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
Fields.display_name,
Fields.block_id,
Fields.content,
Fields.description,
Fields.tags,
Fields.collections,
# If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.published + "." + Fields.display_name,
Fields.published + "." + Fields.published_description,
])
# Mark which attributes can be used for sorting search results:
client.index(temp_index_name).update_sortable_attributes([
Fields.display_name,
Fields.created,
Fields.modified,
Fields.last_published,
])
# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
# cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy
client.index(temp_index_name).update_ranking_rules([
"sort",
"words",
"typo",
"proximity",
"attribute",
"exactness",
])
if not incremental:
_configure_index(index_name)
############## Libraries ##############
status_cb("Indexing libraries...")
def index_library(lib_key: str) -> list:
def index_library(lib_key: LibraryLocatorV2) -> list:
docs = []
for component in lib_api.get_library_components(lib_key):
try:
@@ -396,7 +450,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
if docs:
try:
# Add all the docs in this library at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
except (TypeError, KeyError, MeilisearchError) as err:
status_cb(f"Error indexing library {lib_key}: {err}")
return docs
@@ -416,7 +470,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
if docs:
try:
# Add docs in batch of 100 at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
except (TypeError, KeyError, MeilisearchError) as err:
status_cb(f"Error indexing collection batch {p}: {err}")
return num_done
@@ -439,6 +493,8 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
num_collections_done,
lib_key,
)
if incremental:
IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key)
status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}")
num_contexts_done += 1
@@ -464,7 +520,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
if docs:
# Add all the docs in this course at once (usually faster than adding one at a time):
_wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
_wait_for_meili_task(client.index(index_name).add_documents(docs))
return docs
paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000)
@@ -473,10 +529,16 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
status_cb(
f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})"
)
if course.id in keys_indexed:
num_contexts_done += 1
continue
course_docs = index_course(course)
if incremental:
IncrementalIndexCompleted.objects.get_or_create(context_key=course.id)
num_contexts_done += 1
num_blocks_done += len(course_docs)
IncrementalIndexCompleted.objects.all().delete()
status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")

View File

@@ -0,0 +1,70 @@
"""Configuration for the search index."""
from .documents import Fields
INDEX_DISTINCT_ATTRIBUTE = "usage_key"
# Mark which attributes can be used for filtering/faceted search:
INDEX_FILTERABLE_ATTRIBUTES = [
# Get specific block/collection using combination of block_id and context_key
Fields.block_id,
Fields.block_type,
Fields.context_key,
Fields.usage_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.type,
Fields.access_id,
Fields.last_published,
Fields.content + "." + Fields.problem_types,
]
# Mark which attributes are used for keyword search, in order of importance:
INDEX_SEARCHABLE_ATTRIBUTES = [
# Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields.
Fields.display_name,
Fields.block_id,
Fields.content,
Fields.description,
Fields.tags,
Fields.collections,
# If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections + "." + Fields.collections_display_name,
Fields.collections + "." + Fields.collections_key,
Fields.published + "." + Fields.display_name,
Fields.published + "." + Fields.published_description,
]
# Mark which attributes can be used for sorting search results:
INDEX_SORTABLE_ATTRIBUTES = [
Fields.display_name,
Fields.created,
Fields.modified,
Fields.last_published,
]
# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance.
INDEX_RANKING_RULES = [
"sort",
"words",
"typo",
"proximity",
"attribute",
"exactness",
]

View File

@@ -18,8 +18,11 @@ class Command(BaseCommand):
"""
def add_arguments(self, parser):
parser.add_argument('--experimental', action='store_true')
parser.set_defaults(experimental=False)
parser.add_argument("--experimental", action="store_true")
parser.add_argument("--reset", action="store_true")
parser.add_argument("--init", action="store_true")
parser.add_argument("--incremental", action="store_true")
parser.set_defaults(experimental=False, reset=False, init=False, incremental=False)
def handle(self, *args, **options):
"""
@@ -34,4 +37,11 @@ class Command(BaseCommand):
"Use the --experimental argument to acknowledge and run it."
)
api.rebuild_index(self.stdout.write)
if options["reset"]:
api.reset_index(self.stdout.write)
elif options["init"]:
api.init_index(self.stdout.write, self.stderr.write)
elif options["incremental"]:
api.rebuild_index(self.stdout.write, incremental=True)
else:
api.rebuild_index(self.stdout.write)

View File

@@ -0,0 +1,21 @@
# Generated by Django 4.2.16 on 2024-11-15 12:40
from django.db import migrations, models
import opaque_keys.edx.django.models
class Migration(migrations.Migration):
    """
    Create the IncrementalIndexCompleted model, which records the context keys
    of courses/libraries already indexed during an incremental reindex run.
    """

    dependencies = [
        ('search', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='IncrementalIndexCompleted',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # unique=True: each course/library context is recorded at most once.
                ('context_key', opaque_keys.edx.django.models.LearningContextKeyField(max_length=255, unique=True)),
            ],
        ),
    ]

View File

@@ -65,3 +65,15 @@ def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) ->
course_clause | library_clause
).order_by('-id').values_list("id", flat=True)
)
class IncrementalIndexCompleted(models.Model):
    """
    Stores the context keys of already indexed courses and libraries for incremental indexing.
    """

    # One row per fully-indexed course/library; unique so a context is never
    # recorded twice.
    context_key = LearningContextKeyField(
        max_length=255,
        unique=True,
        null=False,
    )

View File

@@ -10,8 +10,10 @@ from unittest.mock import MagicMock, Mock, call, patch
from opaque_keys.edx.keys import UsageKey
import ddt
import pytest
from django.test import override_settings
from freezegun import freeze_time
from meilisearch.errors import MeilisearchApiError
from openedx_learning.api import authoring as authoring_api
from organizations.tests.factories import OrganizationFactory
@@ -26,7 +28,7 @@ from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE,
try:
# This import errors in the lms because content.search is not an installed app there.
from .. import api
from ..models import SearchAccess
from ..models import SearchAccess, IncrementalIndexCompleted
except RuntimeError:
SearchAccess = {}
@@ -239,6 +241,87 @@ class TestSearchApi(ModuleStoreTestCase):
any_order=True,
)
@override_settings(MEILISEARCH_ENABLED=True)
def test_reindex_meilisearch_incremental(self, mock_meilisearch):
    """
    An incremental reindex can be interrupted and resumed, and its
    IncrementalIndexCompleted bookkeeping is cleared once a run completes.
    """
    # Add tags field to each doc, since the reindex call includes tags.
    doc_sequential = copy.deepcopy(self.doc_sequential)
    doc_sequential["tags"] = {}
    doc_vertical = copy.deepcopy(self.doc_vertical)
    doc_vertical["tags"] = {}
    doc_problem1 = copy.deepcopy(self.doc_problem1)
    doc_problem1["tags"] = {}
    doc_problem1["collections"] = {"display_name": [], "key": []}
    doc_problem2 = copy.deepcopy(self.doc_problem2)
    doc_problem2["tags"] = {}
    doc_problem2["collections"] = {"display_name": [], "key": []}
    doc_collection = copy.deepcopy(self.collection_dict)
    doc_collection["tags"] = {}

    # A full incremental run adds documents three times: course blocks,
    # library blocks, and the collection.
    api.rebuild_index(incremental=True)
    assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 3
    mock_meilisearch.return_value.index.return_value.add_documents.assert_has_calls(
        [
            call([doc_sequential, doc_vertical]),
            call([doc_problem1, doc_problem2]),
            call([doc_collection]),
        ],
        any_order=True,
    )

    # Now we simulate interruption by passing this function to the status_cb argument
    def simulated_interruption(message):
        # this exception prevents courses from being indexed
        if "Indexing courses" in message:
            raise Exception("Simulated interruption")

    with pytest.raises(Exception, match="Simulated interruption"):
        api.rebuild_index(simulated_interruption, incremental=True)
    # two more calls due to collections
    assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 5
    # The interrupted run leaves one completed context recorded, so the next
    # run can skip it and resume where it left off.
    assert IncrementalIndexCompleted.objects.all().count() == 1
    api.rebuild_index(incremental=True)
    # A run that finishes clears the bookkeeping table.
    assert IncrementalIndexCompleted.objects.all().count() == 0
    # one missing course indexed
    assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 6
@override_settings(MEILISEARCH_ENABLED=True)
def test_reset_meilisearch_index(self, mock_meilisearch):
    """
    Test that reset_index rebuilds the index via a temp-index create/swap/delete cycle.
    """
    api.reset_index()
    mock_meilisearch.return_value.swap_indexes.assert_called_once()
    mock_meilisearch.return_value.create_index.assert_called_once()
    # Bug fix: these two checks were written as assignments
    # (`...delete_index.call_count = 2`), which silently overwrote the mock's
    # counter instead of verifying it. They must be assertions.
    assert mock_meilisearch.return_value.delete_index.call_count == 2
    api.reset_index()
    assert mock_meilisearch.return_value.delete_index.call_count == 4
@override_settings(MEILISEARCH_ENABLED=True)
def test_init_meilisearch_index(self, mock_meilisearch):
    """
    Test that init_index only rebuilds the index when it is missing, and
    otherwise leaves an existing index untouched.
    """
    # Test index already exists
    api.init_index()
    mock_meilisearch.return_value.swap_indexes.assert_not_called()
    mock_meilisearch.return_value.create_index.assert_not_called()
    mock_meilisearch.return_value.delete_index.assert_not_called()

    # Test index already exists and has no documents
    # NOTE(review): this sets get_stats on the client mock, but the code under
    # test reads get_index(...).get_stats().number_of_documents — confirm this
    # stub actually takes effect.
    mock_meilisearch.return_value.get_stats.return_value = 0
    api.init_index()
    mock_meilisearch.return_value.swap_indexes.assert_not_called()
    mock_meilisearch.return_value.create_index.assert_not_called()
    mock_meilisearch.return_value.delete_index.assert_not_called()

    # Simulate a missing index: the first get_index calls raise
    # "index_not_found", so init_index must fall back to a full reset.
    mock_meilisearch.return_value.get_index.side_effect = [
        MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')),
        MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')),
        Mock(created_at=1),
        Mock(created_at=1),
        Mock(created_at=1),
    ]
    api.init_index()
    mock_meilisearch.return_value.swap_indexes.assert_called_once()
    mock_meilisearch.return_value.create_index.assert_called_once()
    # Bug fix: this was an assignment (`...call_count = 2`), which silently
    # overwrote the mock's counter instead of verifying it.
    assert mock_meilisearch.return_value.delete_index.call_count == 2
@override_settings(MEILISEARCH_ENABLED=True)
@patch(
"openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",