diff --git a/cms/djangoapps/contentstore/courseware_index.py b/cms/djangoapps/contentstore/courseware_index.py index e030d6844b..ab522f44ca 100644 --- a/cms/djangoapps/contentstore/courseware_index.py +++ b/cms/djangoapps/contentstore/courseware_index.py @@ -10,10 +10,8 @@ from eventtracking import tracker from xmodule.modulestore import ModuleStoreEnum from search.search_engine_base import SearchEngine +from opaque_keys.edx.locator import CourseLocator, LibraryLocator -# Use default index and document names for now -INDEX_NAME = "courseware_index" -DOCUMENT_TYPE = "courseware_content" # REINDEX_AGE is the default amount of time that we look back for changes # that might have happened. If we are provided with a time at which the @@ -40,18 +38,56 @@ class SearchIndexingError(Exception): self.error_list = error_list -class CoursewareSearchIndexer(object): +class SearchIndexBase(object): """ Class to perform indexing for courseware search from different modulestores """ + INDEX_NAME = None + DOCUMENT_TYPE = None + + INDEX_EVENT = { + 'name': None, + 'category': None + } + @classmethod - def index_course(cls, modulestore, course_key, triggered_at=None, reindex_age=REINDEX_AGE): + def _fetch_top_level(self, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + raise NotImplementedError("Should be overridden in child classes") + + @classmethod + def _get_location_info(self, structure_key): + """ Builds location info dictionary """ + raise NotImplementedError("Should be overridden in child classes") + + @classmethod + def _id_modifier(self, usage_id): + """ Modifies usage_id to submit to index """ + return usage_id + + @classmethod + def remove_deleted_items(cls, searcher, structure_key, exclude_items): + """ + remove any item that is present in the search index that is not present in updated list of indexed items + as we find items we can shorten the set of items to keep + """ + response = searcher.search( + doc_type=cls.DOCUMENT_TYPE, + field_dictionary=cls._get_location_info(structure_key), + exclude_ids=exclude_items + ) + result_ids = [result["data"]["id"] for result in response["results"]] + for result_id in result_ids: + searcher.remove(cls.DOCUMENT_TYPE, result_id) + + @classmethod + def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE): """ Process course for indexing Arguments: - course_key (CourseKey) - course identifier + structure_key (CourseKey|LibraryKey) - course or library identifier triggered_at (datetime) - provides time at which indexing was triggered; useful for index updates - only things changed recently from that date @@ -64,13 +100,11 @@ class CoursewareSearchIndexer(object): Number of items that have been added to the index """ error_list = [] - searcher = SearchEngine.get_search_engine(INDEX_NAME) + searcher = SearchEngine.get_search_engine(cls.INDEX_NAME) if not searcher: return - location_info = { - "course": unicode(course_key), - } + location_info = cls._get_location_info(structure_key) # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `index_item` indexed_count = { @@ -101,7 +135,7 @@ class CoursewareSearchIndexer(object): if not item_index_dictionary and not item.has_children: return - item_id = unicode(item.scope_ids.usage_id) + item_id = unicode(cls._id_modifier(item.scope_ids.usage_id)) indexed_items.add(item_id) if item.has_children: # determine if it's okay to skip adding the children herein based upon how recently any may have changed @@ -122,38 +156,24 @@ class CoursewareSearchIndexer(object): if item.start: item_index['start_date'] = item.start - searcher.index(DOCUMENT_TYPE, item_index) + searcher.index(cls.DOCUMENT_TYPE, item_index) indexed_count["count"] += 1 except Exception as err: # pylint: disable=broad-except # broad exception so that index operation does not fail on one item of many log.warning('Could not index item: %s - %r', item.location, err) error_list.append(_('Could not index item: {}').format(item.location)) - def remove_deleted_items(): - """ - remove any item that is present in the search index that is not present in updated list of indexed items - as we find items we can shorten the set of items to keep - """ - response = searcher.search( - doc_type=DOCUMENT_TYPE, - field_dictionary={"course": unicode(course_key)}, - exclude_ids=indexed_items - ) - result_ids = [result["data"]["id"] for result in response["results"]] - for result_id in result_ids: - searcher.remove(DOCUMENT_TYPE, result_id) - try: with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only): - course = modulestore.get_course(course_key, depth=None) - for item in course.get_children(): + structure = cls._fetch_top_level(modulestore, structure_key) + for item in structure.get_children(): index_item(item) - remove_deleted_items() + cls.remove_deleted_items(searcher, structure_key, indexed_items) except Exception as err: # pylint: disable=broad-except # broad exception so that index operation does not prevent the rest of the application from working log.exception( "Indexing error encountered, courseware index may be out of date %s - %r", - course_key, + structure_key, err ) error_list.append(_('General indexing error occurred')) @@ -164,31 +184,93 @@ class CoursewareSearchIndexer(object): return indexed_count["count"] @classmethod - def do_course_reindex(cls, modulestore, course_key): + def _do_reindex(cls, modulestore, structure_key): """ - (Re)index all content within the given course, tracking the fact that a full reindex has taking place + (Re)index all content within the given structure (course or library), + tracking the fact that a full reindex has taken place """ - indexed_count = cls.index_course(modulestore, course_key) + indexed_count = cls.index(modulestore, structure_key) if indexed_count: - cls._track_index_request('edx.course.index.reindexed', indexed_count) + cls._track_index_request(cls.INDEX_EVENT['name'], cls.INDEX_EVENT['category'], indexed_count) return indexed_count @classmethod - def _track_index_request(cls, event_name, indexed_count): + def _track_index_request(cls, event_name, category, indexed_count): """Track content index requests. Arguments: event_name (str): Name of the event to be logged. + category (str): cat3gory of indexed items + indexed_count (int): number of indexed items Returns: None """ data = { "indexed_count": indexed_count, - 'category': 'courseware_index', + 'category': category, } tracker.emit( event_name, data ) + + +class CoursewareSearchIndexer(SearchIndexBase): + INDEX_NAME = "courseware_index" + DOCUMENT_TYPE = "courseware_content" + + INDEX_EVENT = { + 'name': 'edx.course.index.reindexed', + 'category': 'courseware_index' + } + + @classmethod + def _fetch_top_level(self, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + return modulestore.get_course(structure_key, depth=None) + + @classmethod + def _get_location_info(self, structure_key): + """ Builds location info dictionary """ + return {"course": unicode(structure_key)} + + @classmethod + def do_course_reindex(cls, modulestore, course_key): + """ + (Re)index all content within the given course, tracking the fact that a full reindex has taken place + """ + return cls._do_reindex(modulestore, course_key) + + +class LibrarySearchIndexer(SearchIndexBase): + INDEX_NAME = "library_index" + DOCUMENT_TYPE = "library_content" + + INDEX_EVENT = { + 'name': 'edx.library.index.reindexed', + 'category': 'library_index' + } + + @classmethod + def _fetch_top_level(self, modulestore, structure_key): + """ Fetch the item from the modulestore location """ + return modulestore.get_library(structure_key, depth=None) + + @classmethod + def _get_location_info(self, structure_key): + """ Builds location info dictionary """ + return {"library": unicode(structure_key.replace(version_guid=None, branch=None))} + + @classmethod + def _id_modifier(self, usage_id): + """ Modifies usage_id to submit to index """ + return usage_id.replace(library_key=(usage_id.library_key.replace(version_guid=None, branch=None))) + + @classmethod + def do_library_reindex(cls, modulestore, library_key): + """ + (Re)index all content within the given library, tracking the fact that a full reindex has taken place + """ + return cls._do_reindex(modulestore, library_key) \ No newline at end of file diff --git a/cms/djangoapps/contentstore/tests/test_courseware_index.py b/cms/djangoapps/contentstore/tests/test_courseware_index.py index 8f0621d60e..25db2d4e62 100644 --- a/cms/djangoapps/contentstore/tests/test_courseware_index.py +++ b/cms/djangoapps/contentstore/tests/test_courseware_index.py @@ -2,6 +2,7 @@ Testing indexing of the courseware as it is changed """ import ddt +from lazy.lazy import lazy import time from datetime import datetime from mock import patch @@ -15,15 +16,18 @@ from xmodule.modulestore.edit_info import EditInfoMixin from xmodule.modulestore.inheritance import InheritanceMixin from xmodule.modulestore.mixed import MixedModuleStore from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase -from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory +from xmodule.modulestore.tests.factories import CourseFactory, ItemFactory, LibraryFactory from xmodule.modulestore.tests.mongo_connection import MONGO_PORT_NUM, MONGO_HOST from xmodule.modulestore.tests.test_cross_modulestore_import_export import MongoContentstoreBuilder from xmodule.modulestore.tests.utils import create_modulestore_instance, LocationMixin, MixedSplitTestCase from xmodule.tests import DATA_DIR from xmodule.x_module import XModuleMixin + from search.search_engine_base import SearchEngine -from contentstore.courseware_index import CoursewareSearchIndexer, INDEX_NAME, DOCUMENT_TYPE, SearchIndexingError +from contentstore.courseware_index import ( + CoursewareSearchIndexer, LibrarySearchIndexer, SearchIndexingError, get_indexer_for_location +) from contentstore.signals import listen_for_course_publish @@ -123,9 +127,21 @@ class MixedWithOptionsTestCase(MixedSplitTestCase): """ base version of setup_course_base is a no-op """ pass + @lazy + def searcher(self): + return self.get_search_engine() + + def _get_default_search(self): + """ Returns field_dictionary for default search """ + return {} + def get_search_engine(self): """ Centralized call to getting the search engine for the test """ - return SearchEngine.get_search_engine(INDEX_NAME) + return SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME) + + def search(self, field_dictionary=None): + fields = field_dictionary if field_dictionary else self._get_default_search() + return self.searcher.search(field_dictionary=fields) def _perform_test_using_store(self, store_type, test_to_perform): """ Helper method to run a test function that uses a specific store """ @@ -217,38 +233,39 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase): def index_recent_changes(self, store, since_time): """ index course using recent changes """ trigger_time = datetime.now(UTC) - return CoursewareSearchIndexer.index_course( + return CoursewareSearchIndexer.index( store, self.course.id, triggered_at=trigger_time, reindex_age=(trigger_time - since_time) ) + def _get_default_search(self): + return {"course": unicode(self.course.id)} + def _test_indexing_course(self, store): """ indexing course tests """ - searcher = self.get_search_engine() - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 0) # Only published modules should be in the index added_to_index = self.reindex_course(store) self.assertEqual(added_to_index, 3) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 3) # Publish the vertical as is, and any unpublished children should now be available self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) def _test_not_indexing_unpublished_content(self, store): """ add a new one, only appers in index once added """ - searcher = self.get_search_engine() # Publish the vertical to start with self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # Now add a new unit to the existing vertical @@ -260,44 +277,42 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase): modulestore=store, ) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # Now publish it and we should find it # Publish the vertical as is, and everything should be available self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 5) def _test_deleting_item(self, store): """ test deleting an item """ - searcher = self.get_search_engine() # Publish the vertical to start with self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # just a delete should not change anything self.delete_item(store, self.html_unit.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # but after publishing, we should no longer find the html_unit self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 3) def _test_not_indexable(self, store): """ test not indexable items """ - searcher = self.get_search_engine() # Publish the vertical to start with self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # Add a non-indexable item @@ -309,25 +324,24 @@ class TestCoursewareSearchIndexer(MixedWithOptionsTestCase): modulestore=store, ) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) # even after publishing, we should not find the non-indexable item self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) def _test_start_date_propagation(self, store): """ make sure that the start date is applied at the right level """ - searcher = self.get_search_engine() early_date = self.course.start later_date = self.vertical.start # Publish the vertical self.publish_item(store, self.vertical.location) self.reindex_course(store) - response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) + response = self.search() self.assertEqual(response["total"], 4) results = response["results"] @@ -438,13 +452,13 @@ class TestLargeCourseDeletions(MixedWithOptionsTestCase): def _clean_course_id(self): """ Clean all documents from the index that have a specific course provided """ if self.course_id: - searcher = self.get_search_engine() - response = searcher.search(field_dictionary={"course": self.course_id}) + + response = self.searcher.search(field_dictionary={"course": self.course_id}) while response["total"] > 0: for item in response["results"]: - searcher.remove(DOCUMENT_TYPE, item["data"]["id"]) - searcher.remove(DOCUMENT_TYPE, item["data"]["id"]) - response = searcher.search(field_dictionary={"course": self.course_id}) + self.searcher.remove(TestCoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"]) + self.searcher.remove(TestCoursewareSearchIndexer.DOCUMENT_TYPE, item["data"]["id"]) + response = self.searcher.search(field_dictionary={"course": self.course_id}) self.course_id = None def setUp(self): @@ -457,8 +471,8 @@ class TestLargeCourseDeletions(MixedWithOptionsTestCase): def assert_search_count(self, expected_count): """ Check that the search within this course will yield the expected number of results """ - searcher = self.get_search_engine() - response = searcher.search(field_dictionary={"course": self.course_id}) + + response = self.searcher.search(field_dictionary={"course": self.course_id}) self.assertEqual(response["total"], expected_count) def _do_test_large_course_deletion(self, store, load_factor): @@ -553,7 +567,7 @@ class TestTaskExecution(ModuleStoreTestCase): def test_task_indexing_course(self): """ Making sure that the receiver correctly fires off the task when invoked by signal """ - searcher = SearchEngine.get_search_engine(INDEX_NAME) + searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME) response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) self.assertEqual(response["total"], 0) @@ -563,3 +577,173 @@ class TestTaskExecution(ModuleStoreTestCase): # Note that this test will only succeed if celery is working in inline mode response = searcher.search(field_dictionary={"course": unicode(self.course.id)}) self.assertEqual(response["total"], 3) + + +@ddt.ddt +class TestLibrarySearchIndexer(MixedWithOptionsTestCase): + """ Tests the operation of the CoursewareSearchIndexer """ + + def setUp(self): + super(TestLibrarySearchIndexer, self).setUp() + + self.library = None + self.html_unit1 = None + self.html_unit2 = None + + def setup_course_base(self, store): + """ + Set up the for the course outline tests. + """ + self.library = LibraryFactory.create(modulestore=store) + + self.html_unit1 = ItemFactory.create( + parent_location=self.library.location, + category="html", + display_name="Html Content", + modulestore=store, + publish_item=False, + ) + + self.html_unit2 = ItemFactory.create( + parent_location=self.library.location, + category="html", + display_name="Html Content 2", + modulestore=store, + publish_item=False, + ) + + def _get_default_search(self): + """ Returns field_dictionary for default search """ + return {"library": unicode(self.library.location.replace(version_guid=None, branch=None))} + + def reindex_library(self, store): + """ kick off complete reindex of the course """ + return LibrarySearchIndexer.do_library_reindex(store, self.library.location.library_key) + + def _get_contents(self, response): + """ Extracts contents from search response """ + return [item['contents'] for item in response['results']] + + def index_recent_changes(self, store, since_time): + """ index course using recent changes """ + trigger_time = datetime.now(UTC) + return LibrarySearchIndexer.index( + store, + self.library.id, + triggered_at=trigger_time, + reindex_age=(trigger_time - since_time) + ) + + def _test_indexing_library(self, store): + """ indexing course tests """ + response = self.search() + self.assertEqual(response["total"], 2) + + added_to_index = self.reindex_library(store) + self.assertEqual(added_to_index, 2) + response = self.search() + self.assertEqual(response["total"], 2) + + def _test_creating_item(self, store): + """ test updating an item """ + response = self.search() + self.assertEqual(response["total"], 2) + + # updating a library item causes immediate reindexing + data = "Some data" + ItemFactory.create( + parent_location=self.library.location, + category="html", + display_name="Html Content 3", + data=data, + modulestore=store, + publish_item=False, + ) + + response = self.search() + self.assertEqual(response["total"], 3) + html_contents = [cont['html_content'] for cont in self._get_contents(response)] + self.assertIn(data, html_contents) + + def _test_updating_item(self, store): + """ test updating an item """ + response = self.search() + self.assertEqual(response["total"], 2) + + # updating a library item causes immediate reindexing + new_data = "I'm new data" + self.html_unit1.data = new_data + self.reindex_library(store) + response = self.search() + self.assertEqual(response["total"], 2) + html_contents = [cont['html_content'] for cont in self._get_contents(response)] + self.assertIn(new_data, html_contents) + + def _test_deleting_item(self, store): + """ test deleting an item """ + response = self.search() + self.assertEqual(response["total"], 2) + + # deleting a library item causes immediate reindexing + self.delete_item(store, self.html_unit1.location) + self.reindex_library(store) + response = self.search() + self.assertEqual(response["total"], 1) + + def _test_not_indexable(self, store): + """ test not indexable items """ + + response = self.search() + self.assertEqual(response["total"], 2) + + # Add a non-indexable item + ItemFactory.create( + parent_location=self.library.location, + category="problem", + display_name="Some other content", + publish_item=False, + modulestore=store, + ) + self.reindex_library(store) + response = self.search() + self.assertEqual(response["total"], 2) + + @patch('django.conf.settings.SEARCH_ENGINE', None) + def _test_search_disabled(self, store): + """ if search setting has it as off, confirm that nothing is indexed """ + indexed_count = self.reindex_library(store) + self.assertFalse(indexed_count) + + @patch('django.conf.settings.SEARCH_ENGINE', 'search.tests.utils.ErroringIndexEngine') + def _test_exception(self, store): + """ Test that exception within indexing yields a SearchIndexingError """ + with self.assertRaises(SearchIndexingError): + self.reindex_library(store) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_indexing_library(self, store_type): + self._perform_test_using_store(store_type, self._test_indexing_library) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_updating_item(self, store_type): + self._perform_test_using_store(store_type, self._test_updating_item) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_creating_item(self, store_type): + self._perform_test_using_store(store_type, self._test_creating_item) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_deleting_item(self, store_type): + self._perform_test_using_store(store_type, self._test_deleting_item) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_not_indexable(self, store_type): + self._perform_test_using_store(store_type, self._test_not_indexable) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_search_disabled(self, store_type): + self._perform_test_using_store(store_type, self._test_search_disabled) + + @ddt.data(ModuleStoreEnum.Type.mongo, ModuleStoreEnum.Type.split) + def test_exception(self, store_type): + self._perform_test_using_store(store_type, self._test_exception) \ No newline at end of file