# edx-platform/scripts/structures_pruning/utils/splitmongo.py

"""
This module provides logic to clean up old, unused course content data for the
DraftVersioningModuleStore modulestore, more commonly referred to as the "Split
Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
SplitMongoModuleStore). All courses and assets that have newer style locator
keys use DraftVersioningModuleStore. These keys start with "course-v1:",
"ccx-v1:", or "block-v1:".

The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This
code does not address that modulestore in any way. That modulestore handles
courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as
well as assets starting with "i4x://".

"Split" gets its name from the fact that it separates the Structure of a course
from the content in the leaf nodes. In theory, the Structure is an outline of
the course that contains all the parent/child relations for different content
blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as
small, commonly inherited metadata like due dates. More detailed information
about any particular block of content is stored in a separate collection as
Definitions.

Both Structures and Definitions are immutable in Split. When a course is edited,
a new Structure is created, and the Active Versions entry for a course is
updated to point to that new Structure. In that way, we never get a partially
applied edit -- it either succeeds or fails atomically. The Active Versions
entry for a Course has pointers to "published" and "draft" Structures. There is
also a special "library" pointer that is only used by Content Libraries. We do
not need to distinguish between these for the purposes of cleanup.

The problem is that Structure documents have become far larger than they were
intended to be, and we never created code to properly clean them up. As such, it
is not uncommon for the majority of Mongo storage space to be used by old
Structure documents that are completely unused (and are unreachable) by LMS or
Studio.

This module provides cleanup functionality with various tweakable options for
how much history to preserve. For simplicity, it reads all Structure IDs into
memory instead of working on subsets of the data. As a practical matter, this
means that it will work for databases with up to about 10 million Structures
before RAM usage starts to become a problem.
"""
from collections import deque, namedtuple
from itertools import count, takewhile
import json
import logging
import os
import sys
import time

from bson.objectid import ObjectId
from pymongo import MongoClient, UpdateOne
from opaque_keys.edx.locator import CourseLocator, LibraryLocator

# Module-wide logger. It is registered under the fixed name 'structures'
# rather than under this module's dotted import path.
LOG = logging.getLogger('structures')


class StructuresGraph(namedtuple('DatabaseSummary', 'branches structures')):
    """
    A snapshot of every Structure relationship found in a database.

    Every Structure is one saved state of a Course or Content Library. Within
    a branch ("published", "draft", or "library") the Structures form a chain
    that begins at an Original and finishes at the Active Structure::

      Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

    `branches` is a list of ActiveVersionBranch objects describing what the
    LMS and Studio are serving right now. The Active Structures they point at
    must never be removed, or the site breaks for users.

    `structures` is a dict mapping Structure IDs (Strings) to Structure
    objects. A Structure only records the *IDs* of its parent and original,
    not references to the objects themselves, in part because the vast
    majority of the graph never needs to be walked. See `ChangePlan` for why.
    """
    def traverse_ids(self, start_id, limit=None, include_start=False):
        """
        Walk the previous_id chain backwards from `start_id`, yielding IDs.

        `start_id` itself is yielded first only when `include_start` is True;
        it does not count against `limit`. After that, at most `limit`
        ancestor IDs are yielded; with `limit=None` the walk continues until a
        Structure with no previous_id (the Original) has been passed.

        An ancestor ID that is not a key of `structures` is still yielded, and
        the walk then stops because its own parent cannot be looked up.
        """
        if include_start:
            yield start_id

        known = self.structures
        node_id = start_id
        for hops in count():
            # Cannot step back from an ID we hold no Structure for.
            if node_id not in known:
                return
            if limit is not None and hops >= limit:
                return

            node_id = known[node_id].previous_id
            if node_id is None:
                # Just stepped past the Original; nothing further back.
                return

            yield node_id


class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
    """
    One branch of an Active Version document.

    A single Active Version document can point to several branches (e.g.
    "published", "draft"); each instance of this class represents one of them.

    `branch` is "draft-branch", "published-branch", or "library". All Courses
    have both a draft-branch and a published-branch, while Content Libraries
    only have a "library" branch.

    `key` is the Opaque Key of the Course or Library. It exists mostly for
    debugging and is not part of the plan file.

    `edited_on` is a timestamp for the last modification of the Active Version
    document. For a Course, that is whenever *either* the published-branch or
    the draft-branch most recently changed. It plays no part in pruning and is
    carried along purely as debug information.
    """
    def __str__(self):
        """Return a one-line, human-readable description of this branch."""
        edited_on_text = self.edited_on.strftime('%Y-%m-%d %H:%M:%S')
        template = "Active Version {} [{}] {} for {}"
        return template.format(self.id, edited_on_text, self.branch, self.key)


class Structure(namedtuple('Structure', 'id original_id previous_id')):
    """
    The parts of a SplitMongo Structure document that we care about, namely the
    ID (str'd version of the ObjectID), and the IDs of the Original and Previous
    structure documents. The previous_id may be None, which marks the Structure
    as an Original (see `is_original`).

    We use a namedtuple for this specifically because it's more space efficient
    than a dict, and we can have millions of Structures.
    """
    # Without an empty __slots__, every instance of a namedtuple *subclass*
    # gets its own __dict__, which would throw away the memory savings this
    # class exists for.
    __slots__ = ()

    def is_original(self):
        """Is this Structure an original (i.e. should never be deleted)?"""
        return self.previous_id is None


class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
    """
    Summary of the pruning actions we want a Backend to take.

    The idea of having this data structure and being able to serialize it is so
    that we can save our plan of action somewhere for debugging, failure
    recovery, and batching updates.

    `delete` is a list of Structure IDs we want to delete.

    `update_parents` is a list of (structure_id, new_previous_id) tuples
    representing the previous_id updates we need to make.

    A ChangePlan is purely declarative. It is the responsibility of the
    Backend to figure out how to implement a ChangePlan safely and efficiently
    in order to do the actual updates.
    """
    def dump(self, file_obj):
        """
        Serialize this ChangePlan to `file_obj` as JSON and log a summary.

        `file_obj` is any writable text stream. The document written has two
        keys, "delete" and "update_parents", holding the corresponding fields.

        The destination is only used for the log message. Streams whose `name`
        is not a path (in-memory buffers have none; streams opened from a file
        descriptor have an integer) are logged by their repr instead of
        failing after the plan has already been written.
        """
        payload = {
            "delete": self.delete,
            "update_parents": self.update_parents,
        }
        json.dump(payload, file_obj, indent=2)

        stream_name = getattr(file_obj, "name", None)
        if isinstance(stream_name, (str, bytes, os.PathLike)):
            destination = os.path.realpath(stream_name)
        else:
            destination = repr(file_obj)

        LOG.info(
            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
            destination,
            len(self.delete),
            len(self.update_parents)
        )

    @classmethod
    def load(cls, file_obj):
        """
        Build a ChangePlan from JSON read out of the file object `file_obj`.

        The document must contain "delete" and "update_parents" keys; their
        values are used as-is for the fields of the same name.
        """
        document = json.load(file_obj)
        structure_ids_to_delete = document["delete"]
        parent_updates = document["update_parents"]
        return cls(delete=structure_ids_to_delete, update_parents=parent_updates)

    @classmethod
    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
        """
        Given a StructuresGraph and a target number for intermediate Structures
        to preserve, return a ChangePlan that represents the changes needed to
        prune the database. The overall strategy is to iterate through all
        Active Structures, walk back through the ancestors, and add all the
        Structure IDs we should save to a set. After we have our save set, we
        know that we can delete all other structures without worrying about
        whether those Structures are reachable or knowing what their
        relationships are. This keeps things simpler, and means that we should
        be more resilient to failures when pruning.

        Structure documents exist in chains of parent/child relationships,
        starting with an Original Structure, having some number of Intermediate
        Structures, and ending in an Active Structure::

          Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

        Pruning Rules:

        1. All Active Structures must be preserved, as those are being used by
           the LMS and Studio to serve course content.

        2. All Original Structures should be preserved, since those are used by
           the LMS and Studio to determine common shared ancestry between
           Structures.

        3. Up to `num_intermediate_structures` Intermediate Structures will be
           kept. These Structures are not actually used in edx-platform code,
           but they are sometimes used by developers to allow emergency reverts
           in course team support situations (e.g. someone accidentally wiped
           out their course with a bad import).

        4. The oldest preserved Intermediate Structure will be modified so that
           its `previous_id` is updated to point to the Original Structure. That
           way, we're not preserving references to the IDs of Structures that
           have been pruned.

        Arguments:

        `structures_graph` unpacks into `(branches, structures)` and provides
        `traverse_ids()`.

        `ignore_missing` controls what happens when an ID we want to save is
        not a key of `structures`. If True, such IDs are dropped from the save
        set and diagnostics about them are logged after the plan is built. If
        False, the missing IDs are logged and the process exits with status 1.

        `dump_structures`, if True, logs one line per known Structure showing
        whether it is saved, active, or re-linked.

        `details_file`, if given, is a writable text stream that receives the
        report produced by `write_details`.

        An Active Structure ID that is absent from `structures` is *not*
        treated as "missing": it raises KeyError, regardless of
        `ignore_missing`, because there is no way to find its ancestry.
        """
        branches, structures = structures_graph

        structure_ids_to_save = set()
        set_parent_to_original = set()

        # Figure out which Structures to save...
        for branch in branches:
            # Anything that's actively being pointed to (is the head of a branch)
            # must be preserved. This is what's being served by Studio and LMS.
            active_structure_id = branch.structure_id
            structure_ids_to_save.add(active_structure_id)

            # All originals will be saved.
            structure_ids_to_save.add(structures[active_structure_id].original_id)

            # Save up to `num_intermediate_structures` intermediate nodes
            structure_ids_to_save.update(
                structures_graph.traverse_ids(
                    active_structure_id, limit=num_intermediate_structures
                )
            )

        missing_structure_ids = structure_ids_to_save - structures.keys()

        if ignore_missing:
            # Remove missing structures since we can't save them
            structure_ids_to_save -= missing_structure_ids
        elif missing_structure_ids:
            LOG.error("Missing structures detected")
            # Say *which* ones before bailing out; the detailed diagnostics
            # further down are only reached when missing IDs are ignored.
            for missing_structure_id in sorted(missing_structure_ids, key=str):
                LOG.error("Missing structure ID: %s", missing_structure_id)
            sys.exit(1)

        # Figure out what links to rewrite -- the oldest structure to save that
        # isn't an original.
        for branch in branches:
            rewrite_candidates = takewhile(
                lambda s: s in structure_ids_to_save and not structures[s].is_original(),
                structures_graph.traverse_ids(branch.structure_id, include_start=True)
            )
            # A deque capped at one element keeps only the final item of
            # `rewrite_candidates`, i.e. the oldest candidate.
            last_seen = deque(rewrite_candidates, maxlen=1)
            if last_seen:
                structure = structures[last_seen.pop()]
                # Don't do a rewrite if it's just a no-op...
                if structure.original_id != structure.previous_id:
                    set_parent_to_original.add(structure.id)

        # Sort the items in the ChangePlan. This might not be helpful, but I'm
        # hoping that it will keep disk changes more localized and not thrash
        # things as much as randomly distributed deletes. Mongo ObjectIDs are
        # ordered (they have a timestamp component).
        change_plan = cls(
            delete=sorted(structures.keys() - structure_ids_to_save),
            update_parents=sorted(
                (s_id, structures[s_id].original_id)
                for s_id in set_parent_to_original
            )
        )

        if details_file:
            change_plan.write_details(
                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
            )

        # Everything below is diagnostic logging only.
        if not dump_structures and not missing_structure_ids:
            return change_plan

        active_structure_ids = {branch.structure_id for branch in branches}

        def describe(structure_id):
            """Format the save/active/relink status of one known Structure."""
            structure = structures[structure_id]
            previous_is_missing = (
                structure.previous_id is not None
                and structure.previous_id not in structures
            )
            return (
                f"id: {structure_id}, "
                f"original_id: {structure.original_id}, "
                f"previous_id: {structure.previous_id}, "
                f"save: {structure_id in structure_ids_to_save}, "
                f"active: {structure_id in active_structure_ids}, "
                f"prev_missing: {previous_is_missing}, "
                f"rewrite_previous_to_original: {structure_id in set_parent_to_original}"
            )

        if dump_structures:
            for sid in structures:
                LOG.info("DUMP %s", describe(sid))

        if missing_structure_ids:
            # Index the children of missing Structures in a single pass rather
            # than rescanning every Structure once per missing ID.
            children_of_missing = {}
            for structure in structures.values():
                if structure.previous_id in missing_structure_ids:
                    children_of_missing.setdefault(structure.previous_id, []).append(structure)

            for missing_structure_id in missing_structure_ids:
                LOG.error("Missing structure ID: %s", missing_structure_id)

                original_ids = set()
                for structure in children_of_missing.get(missing_structure_id, ()):
                    LOG.info(
                        "Structure %s points to missing structure with ID: %s",
                        structure.id,
                        structure.previous_id,
                    )
                    original_ids.add(structure.original_id)

                LOG.info("Looking for branches that lead to missing ID %s", missing_structure_id)
                # A branch is reported when it shares an Original with a child
                # of the missing Structure and its own ancestry has a gap.
                branches_to_log = [
                    branch
                    for branch in branches
                    if structures[branch.structure_id].original_id in original_ids
                    and any(
                        sid not in structures
                        for sid in structures_graph.traverse_ids(branch.structure_id)
                    )
                ]

                for branch in branches_to_log:
                    LOG.info("Branch: %s", branch)
                    for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                        if sid in structures:
                            LOG.info("%s", describe(sid))

        return change_plan

    @staticmethod
    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
        """
        Simple dump of the changes we're going to make to the database.

        This method requires information that we don't actually keep in the
        ChangePlan file, such as the Course IDs and edit times. Because of this,
        it can only be created at the time the ChangePlan is being generated,
        and cannot be derived from an existing ChangePlan. The goal was to
        provide this debug information while keeping the ChangePlan file format
        as stupidly simple as possible.
        """
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}

        def text_for(s_id):
            """Helper method to format Structures consistently."""
            action = "+" if s_id in structure_ids_to_save else "-"
            notes = []
            if s_id in active_structure_ids:
                notes.append("(active)")
            if s_id in set_parent_to_original:
                notes.append("(re-link to original)")
            if s_id in structures and structures[s_id].is_original():
                notes.append("(original)")

            if notes:
                return "{} {} {}".format(action, s_id, " ".join(notes))

            return "{} {}".format(action, s_id)

        print("== Summary ==", file=details_file)
        print("Active Version Branches: {}".format(len(branches)), file=details_file)
        print("Total Structures: {}".format(len(structures)), file=details_file)
        print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file)
        print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file)
        print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file)
        print("\n== Active Versions ==", file=details_file)

        for branch in branches:
            print("{}".format(branch), file=details_file)
            for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                print(text_for(structure_id), file=details_file)
            print("", file=details_file)

        LOG.info(
            "Wrote Change Details File: %s", os.path.realpath(details_file.name)
        )


class SplitMongoBackend:
    """
    Interface to the MongoDB backend. This is currently the only supported KV
    store for the Split(DraftVersioning)ModuleStore, but having this as a
    separate class makes it easier to stub in test data.

    The methods on this class should accept and return backend-agnostic data
    structures, so no BSON details should leak out.
    """
    def __init__(self, mongo_connection_str, db_name):
        """
        Create a MongoClient for `mongo_connection_str` and keep handles to the
        `modulestore.active_versions` and `modulestore.structures` collections
        of the database called `db_name`.
        """
        client_options = {
            'connectTimeoutMS': 2000,
            'socketTimeoutMS': 300000,  # *long* operations
            'serverSelectionTimeoutMS': 2000,
        }
        # NOTE: despite its name, `_db` holds the client, not a database.
        self._db = MongoClient(mongo_connection_str, **client_options)

        modulestore = self._db[db_name].modulestore
        self._active_versions = modulestore.active_versions
        self._structures = modulestore.structures

    def structures_graph(self, delay, batch_size):
        """
        Build and return the StructuresGraph for the whole modulestore.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.

        The awkward part is consistency. A StructuresGraph is meant to be a
        coherent view of the database, but MongoDB has no "repeatable read"
        isolation, so Structures can be created in between our queries. The
        guarantees we aim for in the returned graph are:

          1. Every Active Structure ID in `branches` is also in `structures`.
          2. If `branches` is stale and the database has a newer Active
             Structure, that Structure is *not* in `structures`.

        Scenario A: branches first, then structures
          1. Get Branches (and thus Active Structure IDs)
          2. New Structures created by Studio
          3. Get all Structures

        The Structures created in (2) are almost certainly meant to be Active.
        Pruning works by building a "save" list from the Active Structure IDs
        we know of and deleting everything else. Here, (3) picks up the new
        Structures but our Active IDs from (1) are stale, so we would end up
        deleting Structures that are really Active.

        Scenario B: structures first, then branches
          1. Get all Structures
          2. New Structures created by Studio
          3. Get Branches (and thus Active Structure IDs)

        Here we can see Active Structure IDs that are absent from our
        Structures dict, which leaves us unable to crawl their ancestry and
        mark the right IDs to be saved.

        This method does Scenario B plus a fallback: for each branch, starting
        at its Active Structure, any ID not yet in `structures` is fetched
        individually, following previous_id links until an already-known
        Structure (or the start of the chain) is reached. Misses should be
        rare, so the extra queries should not hurt overall performance much.

        A slightly stale ChangePlan is safe so long as it is internally
        consistent. We only ever delete Structures that are in `structures`,
        so a new Active Version we know nothing about is left alone.
        """
        # The order of these two calls is what makes this "Scenario B".
        structures = self._all_structures(delay, batch_size)
        branches = self._all_branches()

        # Guard against the race condition that branch.structure_id or its
        # ancestors are not in `structures`. Make sure that we add those.
        LOG.info(
            "Checking for missing Structures (a small number are expected "
            "unless edits are disabled during change plan creation)."
        )
        fetched_count = 0
        for branch in branches:
            cursor_id = branch.structure_id
            while cursor_id and cursor_id not in structures:
                fetched = self._get_structure(cursor_id)
                structures[cursor_id] = fetched
                fetched_count += 1
                LOG.warning(
                    "Structure %s linked from Active Structure %s (%s) fetched.",
                    cursor_id,
                    branch.structure_id,
                    branch.key,
                )
                cursor_id = fetched.previous_id

        LOG.info("Finished checking for missing Structures, found %s", fetched_count)

        return StructuresGraph(branches, structures)

    def _all_structures(self, delay, batch_size):
        """
        Load every Structure in the database into a dict keyed by Structure ID.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        LOG.info("Fetching all known Structures (this might take a while)...")
        LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size)

        # Parse documents one at a time as they stream in, so the raw
        # documents never pile up in memory.
        structures = {}
        for structure_doc in self._structures_from_db(delay, batch_size):
            structure = self.parse_structure_doc(structure_doc)
            structures[structure.id] = structure

        LOG.info("Fetched %s Structures", len(structures))

        return structures

    def _structures_from_db(self, delay, batch_size):
        """
        Generate every Structure document in the database, one at a time.

        Only the `original_version` and `previous_version` fields are
        requested in the projection.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        wanted_fields = ['original_version', 'previous_version']
        cursor = self._structures.find(projection=wanted_fields)
        cursor.batch_size(batch_size)

        yielded = 0
        for structure_doc in cursor:
            yield structure_doc
            yielded += 1
            # Pause after each full batch has been handed to the caller.
            if yielded % batch_size == 0:
                LOG.info("Structure Cursor at %s (%s)", yielded, structure_doc['_id'])
                time.sleep(delay)

    def _all_branches(self):
        """
        Return a sorted list with one ActiveVersionBranch for every entry in
        the `versions` mapping of every Active Version document.
        """
        LOG.info("Fetching all Active Version Branches...")
        branches = []

        for av_doc in self._active_versions.find():
            for branch_name, object_id in av_doc['versions'].items():
                structure_id = str(object_id)
                # Only the "library" branch is keyed by a LibraryLocator.
                key = (
                    LibraryLocator(av_doc['org'], av_doc['course'])
                    if branch_name == 'library'
                    else CourseLocator(av_doc['org'], av_doc['course'], av_doc['run'])
                )
                active_version_id = str(av_doc['_id'])
                branches.append(
                    ActiveVersionBranch(
                        active_version_id, branch_name, structure_id, key, av_doc['edited_on']
                    )
                )

        LOG.info("Fetched %s Active Version Branches", len(branches))

        branches.sort()
        return branches

    def _get_structure(self, structure_id):
        """
        Fetch the single Structure document whose `_id` is `structure_id` (a
        str) and return it parsed by `parse_structure_doc`.
        """
        query = {'_id': ObjectId(structure_id)}
        wanted_fields = ['original_version', 'previous_version']
        found_doc = self._structures.find_one(query, projection=wanted_fields)
        return self.parse_structure_doc(found_doc)

    def update(self, change_plan, delay=1000, batch_size=1000, start=None):
        """
        Apply `change_plan` to the backend: first the parent re-links, then
        the deletions.

        `delay` and `batch_size` are handed to both steps, which sleep for
        `delay` seconds after every batch, so the default of 1000 is a pause
        of 1000 seconds. `start` is handed to the delete step only.
        """
        parent_updates = change_plan.update_parents
        doomed_structure_ids = change_plan.delete

        # Step 1: Relink - Point the oldest structure we are keeping straight
        # at the original, which is never deleted. This has to come before any
        # deletion so the course is never left in a broken state (at worst,
        # parts of it become unreachable).
        self._update_parents(parent_updates, delay, batch_size)

        # Step 2: Delete unused Structures
        self._delete(doomed_structure_ids, delay, batch_size, start)

    def _update_parents(self, id_parent_pairs, delay, batch_size):
        """
        Rewrite the `previous_version` of Structures, one bulk write per batch.

        `id_parent_pairs` is a list of tuples. The first element of each tuple
        is the Structure ID (str) to modify; the second is the Structure ID
        that becomes its new parent.
        `delay` is how many seconds to sleep after each batch.
        `batch_size` is the maximum number of updates sent per bulk write.
        """
        for pairs in self.batch(id_parent_pairs, batch_size):
            operations = []
            for structure_id, previous_id in pairs:
                selector = {'_id': ObjectId(structure_id)}
                change = {'$set': {'previous_version': ObjectId(previous_id)}}
                operations.append(UpdateOne(selector, change))

            outcome = self._structures.bulk_write(operations)
            stats = outcome.bulk_api_result
            LOG.info(
                "Updated %s/%s parent relationships.",
                stats['nModified'],
                stats['nMatched'],
            )
            time.sleep(delay)

    def _delete(self, structure_ids, delay, batch_size, start=None):
        """
        Remove old Structures, one `delete_many` call per batch.

        `structure_ids` is a list of Structure IDs to delete.
        `delay` is the delay in seconds (floats are ok) between batch deletes.
        `batch_size` is how many we try to delete in each batch statement.
        `start`, when not None, is passed to `iter_from_start` to skip IDs
        that sort before it.
        """
        remaining_ids = self.iter_from_start(structure_ids, start)
        for chunk in self.batch(remaining_ids, batch_size):
            object_ids = [ObjectId(s_id) for s_id in chunk]
            outcome = self._structures.delete_many({'_id': {'$in': object_ids}})

            first_id, last_id = chunk[0], chunk[-1]
            LOG.info(
                "Deleted %s/%s Structures: %s - %s",
                outcome.deleted_count,
                len(chunk),
                first_id,
                last_id,
            )
            time.sleep(delay)

    @staticmethod
    def parse_structure_doc(structure_doc):
        """
        Structure docs are pretty big, but we only care about three top level
        fields, all of which are ObjectIds:

          _id: The Structure ID

          previous_version: The Structure ID for the parent. An Original
                            Structure will have None for this field.

          original_version: The Original Structure that this Structure and all
                            its ancestors are ultimately dervied from. An
                            Original Structure points to itself with this field.
        """
        _id = str(structure_doc['_id'])
        original_id = str(structure_doc['original_version'])
        previous_id = structure_doc['previous_version']
        if previous_id is not None:
            previous_id = str(previous_id)
        return Structure(_id, original_id, previous_id)

    @staticmethod
    def batch(iterable, batch_size):
        """Yield lists of up to `batch_size` in length from `iterable`."""
        iterator = iter(iterable)
        curr_batch = []
        for i in count(1):
            try:
                curr_batch.append(next(iterator))
                if i % batch_size == 0:
                    yield curr_batch
                    curr_batch = []
            except StopIteration:
                break
        if curr_batch:
            yield curr_batch

    @staticmethod
    def iter_from_start(structure_ids, start=None):
        """
        Yields from an iterable once it encounters the `start` value. If `start`
        is None, just yields from the beginning.
        """
        if start is None:
            for structure_id in structure_ids:
                yield structure_id
            return

        for structure_id in structure_ids:
            if structure_id < start:
                continue
            yield structure_id