Merge pull request #7027 from edx/jeskew/defn_loading_perf_problems

Rollback override of lazy definition loading for large courses.
2015-02-23 11:15:44 -05:00
parent 906f3474bb 4e7ed93b3e
commit d3e2cb0fac
4 changed files with 97 additions and 54 deletions
--- a/common/lib/xmodule/xmodule/modulestore/split_mongo/split.py
+++ b/common/lib/xmodule/xmodule/modulestore/split_mongo/split.py
@@ -423,11 +423,12 @@ class SplitBulkWriteMixin(BulkOperationsMixin):
                    ids.remove(definition_id)
                    definitions.append(definition)

-        # Query the db for the definitions.
-        defs_from_db = self.db_connection.get_definitions(list(ids))
-        # Add the retrieved definitions to the cache.
-        bulk_write_record.definitions.update({d.get('_id'): d for d in defs_from_db})
-        definitions.extend(defs_from_db)
+        if len(ids):
+            # Query the db for the definitions.
+            defs_from_db = self.db_connection.get_definitions(list(ids))
+            # Add the retrieved definitions to the cache.
+            bulk_write_record.definitions.update({d.get('_id'): d for d in defs_from_db})
+            definitions.extend(defs_from_db)
        return definitions

    def update_definition(self, course_key, definition):
@@ -683,7 +684,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        connection.close()

    def cache_items(self, system, base_block_ids, course_key, depth=0, lazy=True):
-        '''
+        """
        Handles caching of items once inheritance and any other one time
        per course per fetch operations are done.

@@ -692,8 +693,8 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
            base_block_ids: list of BlockIds to fetch
            course_key: the destination course providing the context
            depth: how deep below these to prefetch
-            lazy: whether to fetch definitions or use placeholders
-        '''
+            lazy: whether to load definitions now or later
+        """
        with self.bulk_operations(course_key, emit_signals=False):
            new_module_data = {}
            for block_id in base_block_ids:
@@ -704,13 +705,9 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
                    new_module_data
                )

-            # This code supports lazy loading, where the descendent definitions aren't loaded
+            # This method supports lazy loading, where the descendent definitions aren't loaded
            # until they're actually needed.
-            # However, assume that depth == 0 means no depth is specified and depth != 0 means
-            # a depth *is* specified. If a non-zero depth is specified, force non-lazy definition
-            # loading in order to populate the definition cache for later access.
-            load_definitions_now = depth != 0 or not lazy
-            if load_definitions_now:
+            if not lazy:
                # Non-lazy loading: Load all descendants by id.
                descendent_definitions = self.get_definitions(
                    course_key,
@@ -734,15 +731,16 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
            return system.module_data

    @contract(course_entry=CourseEnvelope, block_keys="list(BlockKey)", depth="int | None")
-    def _load_items(self, course_entry, block_keys, depth=0, lazy=True, **kwargs):
-        '''
+    def _load_items(self, course_entry, block_keys, depth=0, **kwargs):
+        """
        Load & cache the given blocks from the course. May return the blocks in any order.

-        Load the definitions into each block if lazy is False;
-        otherwise, use the lazy definition placeholder.
-        '''
+        Load the definitions into each block if lazy is in kwargs and is False;
+        otherwise, do not load the definitions - they'll be loaded later when needed.
+        """
        runtime = self._get_cache(course_entry.structure['_id'])
        if runtime is None:
+            lazy = kwargs.pop('lazy', True)
            runtime = self.create_runtime(course_entry, lazy)
            self._add_cache(course_entry.structure['_id'], runtime)
            self.cache_items(runtime, block_keys, course_entry.course_key, depth, lazy)
@@ -786,7 +784,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
            self.request_cache.data['course_cache'] = {}

    def _lookup_course(self, course_key):
-        '''
+        """
        Decode the locator into the right series of db access. Does not
        return the CourseDescriptor! It returns the actual db json from
        structures.
@@ -797,7 +795,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        reference)

        :param course_key: any subclass of CourseLocator
-        '''
+        """
        if course_key.org and course_key.course and course_key.run:
            if course_key.branch is None:
                raise InsufficientSpecificationError(course_key)
@@ -864,7 +862,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
            locator = locator_factory(structure_info, branch)
            envelope = CourseEnvelope(locator, entry)
            root = entry['root']
-            structures_list = self._load_items(envelope, [root], 0, lazy=True, **kwargs)
+            structures_list = self._load_items(envelope, [root], depth=0, **kwargs)
            if not isinstance(structures_list[0], ErrorDescriptor):
                result.append(structures_list[0])
        return result
@@ -929,13 +927,13 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        """
        structure_entry = self._lookup_course(structure_id)
        root = structure_entry.structure['root']
-        result = self._load_items(structure_entry, [root], depth, lazy=True, **kwargs)
+        result = self._load_items(structure_entry, [root], depth, **kwargs)
        return result[0]

    def get_course(self, course_id, depth=0, **kwargs):
-        '''
+        """
        Gets the course descriptor for the course identified by the locator
-        '''
+        """
        if not isinstance(course_id, CourseLocator) or course_id.deprecated:
            # The supplied CourseKey is of the wrong type, so it can't possibly be stored in this modulestore.
            raise ItemNotFoundError(course_id)
@@ -951,14 +949,14 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        return self._get_structure(library_id, depth, **kwargs)

    def has_course(self, course_id, ignore_case=False, **kwargs):
-        '''
+        """
        Does this course exist in this modulestore. This method does not verify that the branch &/or
        version in the course_id exists. Use get_course_index_info to check that.

        Returns the course_id of the course if it was found, else None
        Note: we return the course_id instead of a boolean here since the found course may have
           a different id than the given course_id when ignore_case is True.
-        '''
+        """
        if not isinstance(course_id, CourseLocator) or course_id.deprecated:
            # The supplied CourseKey is of the wrong type, so it can't possibly be stored in this modulestore.
            return False
@@ -967,12 +965,12 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        return CourseLocator(course_index['org'], course_index['course'], course_index['run'], course_id.branch) if course_index else None

    def has_library(self, library_id, ignore_case=False, **kwargs):
-        '''
+        """
        Does this library exist in this modulestore. This method does not verify that the branch &/or
        version in the library_id exists.

        Returns the library_id of the course if it was found, else None.
-        '''
+        """
        if not isinstance(library_id, LibraryLocator):
            return None

@@ -1017,7 +1015,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):

        with self.bulk_operations(usage_key.course_key):
            course = self._lookup_course(usage_key.course_key)
-            items = self._load_items(course, [BlockKey.from_usage_key(usage_key)], depth, lazy=True, **kwargs)
+            items = self._load_items(course, [BlockKey.from_usage_key(usage_key)], depth, **kwargs)
            if len(items) == 0:
                raise ItemNotFoundError(usage_key)
            elif len(items) > 1:
@@ -1078,7 +1076,7 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
                if block_name == block_id.id and _block_matches_all(block):
                    block_ids.append(block_id)

-            return self._load_items(course, block_ids, lazy=True, **kwargs)
+            return self._load_items(course, block_ids, **kwargs)

        if 'category' in qualifiers:
            qualifiers['block_type'] = qualifiers.pop('category')
@@ -1091,18 +1089,18 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
                items.append(block_id)

        if len(items) > 0:
-            return self._load_items(course, items, 0, lazy=True, **kwargs)
+            return self._load_items(course, items, depth=0, **kwargs)
        else:
            return []

    def get_parent_location(self, locator, **kwargs):
-        '''
+        """
        Return the location (Locators w/ block_ids) for the parent of this location in this
        course. Could use get_items(location, {'children': block_id}) but this is slightly faster.
        NOTE: the locator must contain the block_id, and this code does not actually ensure block_id exists

        :param locator: BlockUsageLocator restricting search scope
-        '''
+        """
        if not isinstance(locator, BlockUsageLocator) or locator.deprecated:
            # The supplied locator is of the wrong type, so it can't possibly be stored in this modulestore.
            raise ItemNotFoundError(locator)
@@ -1208,12 +1206,12 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        return definition['edit_info']

    def get_course_successors(self, course_locator, version_history_depth=1):
-        '''
+        """
        Find the version_history_depth next versions of this course. Return as a VersionTree
        Mostly makes sense when course_locator uses a version_guid, but because it finds all relevant
        next versions, these do include those created for other courses.
        :param course_locator:
-        '''
+        """
        if not isinstance(course_locator, CourseLocator) or course_locator.deprecated:
            # The supplied CourseKey is of the wrong type, so it can't possibly be stored in this modulestore.
            raise ItemNotFoundError(course_locator)
@@ -1244,14 +1242,14 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        return VersionTree(course_locator, result)

    def get_block_generations(self, block_locator):
-        '''
+        """
        Find the history of this block. Return as a VersionTree of each place the block changed (except
        deletion).

        The block's history tracks its explicit changes but not the changes in its children starting
        from when the block was created.

-        '''
+        """
        # course_agnostic means we don't care if the head and version don't align, trust the version
        course_struct = self._lookup_course(block_locator.course_key.course_agnostic()).structure
        block_key = BlockKey.from_usage_key(block_locator)
@@ -1296,9 +1294,9 @@ class SplitMongoModuleStore(SplitBulkWriteMixin, ModuleStoreWriteBase):
        )

    def get_definition_successors(self, definition_locator, version_history_depth=1):
-        '''
+        """
        Find the version_history_depth next versions of this definition. Return as a VersionTree
-        '''
+        """
        # TODO implement
        pass

--- a/common/lib/xmodule/xmodule/modulestore/tests/test_mongo_call_count.py
+++ b/common/lib/xmodule/xmodule/modulestore/tests/test_mongo_call_count.py
@@ -87,14 +87,39 @@ class CountMongoCallsCourseTraversal(TestCase):
    to the leaf nodes.
    """

+    # Suppose you want to traverse a course - maybe accessing the fields of each XBlock in the course,
+    # maybe not. What parameters should one use for get_course() in order to minimize the number of
+    # mongo calls? The tests below both ensure that code changes don't increase the number of mongo calls
+    # during traversal -and- demonstrate how to minimize the number of calls.
    @ddt.data(
-        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, None, 189),  # The way this traversal *should* be done.
-        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, 0, 387),     # The pathological case - do *not* query a course this way!
-        (MIXED_SPLIT_MODULESTORE_BUILDER, None, 7),  # The way this traversal *should* be done.
-        (MIXED_SPLIT_MODULESTORE_BUILDER, 0, 145)    # The pathological case - do *not* query a course this way!
+        # These two lines show the way this traversal *should* be done
+        # (if you'll eventually access all the fields and load all the definitions anyway).
+        # 'lazy' does not matter in old Mongo.
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, None, False, True, 189),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, None, True, True, 189),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, 0, False, True, 387),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, 0, True, True, 387),
+        # As shown in these two lines: whether or not the XBlock fields are accessed,
+        # the same number of mongo calls are made in old Mongo for depth=None.
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, None, False, False, 189),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, None, True, False, 189),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, 0, False, False, 387),
+        (MIXED_OLD_MONGO_MODULESTORE_BUILDER, 0, True, False, 387),
+        # The line below shows the way this traversal *should* be done
+        # (if you'll eventually access all the fields and load all the definitions anyway).
+        (MIXED_SPLIT_MODULESTORE_BUILDER, None, False, True, 4),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, None, True, True, 143),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, 0, False, True, 143),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, 0, True, True, 143),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, None, False, False, 4),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, None, True, False, 4),
+        # TODO: The call count below seems like a bug - should be 4?
+        # Seems to be related to using self.lazy in CachingDescriptorSystem.get_module_data().
+        (MIXED_SPLIT_MODULESTORE_BUILDER, 0, False, False, 143),
+        (MIXED_SPLIT_MODULESTORE_BUILDER, 0, True, False, 4)
    )
    @ddt.unpack
-    def test_number_mongo_calls(self, store, depth, num_mongo_calls):
+    def test_number_mongo_calls(self, store, depth, lazy, access_all_block_fields, num_mongo_calls):
        with store.build() as (source_content, source_store):

            source_course_key = source_store.make_course_key('a', 'course', 'course')
@@ -116,10 +141,20 @@ class CountMongoCallsCourseTraversal(TestCase):
            # Starting at the root course block, do a breadth-first traversal using
            # get_children() to retrieve each block's children.
            with check_mongo_calls(num_mongo_calls):
-                start_block = source_store.get_course(source_course_key, depth=depth)
-                stack = [start_block]
-                while stack:
-                    curr_block = stack.pop()
-                    if curr_block.has_children:
-                        for block in reversed(curr_block.get_children()):
-                            stack.append(block)
+                with source_store.bulk_operations(source_course_key):
+                    start_block = source_store.get_course(source_course_key, depth=depth, lazy=lazy)
+                    all_blocks = []
+                    stack = [start_block]
+                    while stack:
+                        curr_block = stack.pop()
+                        all_blocks.append(curr_block)
+                        if curr_block.has_children:
+                            for block in reversed(curr_block.get_children()):
+                                stack.append(block)
+
+                    if access_all_block_fields:
+                        # Read the fields on each block in order to ensure each block and its definition is loaded.
+                        for xblock in all_blocks:
+                            for __, field in xblock.fields.iteritems():
+                                if field.is_set_on(xblock):
+                                    __ = field.read_from(xblock)
--- a/common/lib/xmodule/xmodule/modulestore/tests/test_split_modulestore_bulk_operations.py
+++ b/common/lib/xmodule/xmodule/modulestore/tests/test_split_modulestore_bulk_operations.py
@@ -405,7 +405,12 @@ class TestBulkWriteMixinFindMethods(TestBulkWriteMixin):

        self.conn.get_definitions.return_value = db_definitions
        results = self.bulk.get_definitions(self.course_key, search_ids)
-        self.conn.get_definitions.assert_called_once_with(list(set(search_ids) - set(active_ids)))
+        definitions_gotten = list(set(search_ids) - set(active_ids))
+        if len(definitions_gotten) > 0:
+            self.conn.get_definitions.assert_called_once_with(definitions_gotten)
+        else:
+            # If no definitions to get, then get_definitions() should *not* have been called.
+            self.assertEquals(self.conn.get_definitions.call_count, 0)
        for _id in active_ids:
            if _id in search_ids:
                self.assertIn(active_definition(_id), results)
--- a/common/lib/xmodule/xmodule/modulestore/xml_exporter.py
+++ b/common/lib/xmodule/xmodule/modulestore/xml_exporter.py
@@ -41,7 +41,12 @@ def export_to_xml(modulestore, contentstore, course_key, root_dir, course_dir):

    with modulestore.bulk_operations(course_key):

-        course = modulestore.get_course(course_key, depth=None)  # None means infinite
+        # depth = None: Traverses down the entire course structure.
+        # lazy = False: Loads and caches all block definitions during traversal for fast access later
+        #               -and- to eliminate many round-trips to read individual definitions.
+        # Why these parameters? Because a course export needs to access all the course block information
+        # eventually. Accessing it all now at the beginning increases performance of the export.
+        course = modulestore.get_course(course_key, depth=None, lazy=False)
        fsm = OSFS(root_dir)
        export_fs = course.runtime.export_fs = fsm.makeopendir(course_dir)
        root_course_dir = root_dir + '/' + course_dir