chore: Moved structures.py from tubular repository (#34328)

* chore: Moved structures.py from tubular repository
This commit is contained in:
Muhammad Farhan Khan
2024-03-12 18:46:34 +05:00
committed by GitHub
parent da244a99d3
commit 7808913916
13 changed files with 1566 additions and 1 deletions

View File

@@ -0,0 +1,33 @@
name: units-test-scripts-common
on:
pull_request:
push:
branches:
- master
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.12' ]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r scripts/structures_pruning/requirements/testing.txt
- name: Run pytest
run: |
pytest scripts/structures_pruning

View File

@@ -142,7 +142,9 @@ REQ_FILES = \
requirements/edx/semgrep \
scripts/xblock/requirements \
scripts/user_retirement/requirements/base \
scripts/user_retirement/requirements/testing
scripts/user_retirement/requirements/testing \
scripts/structures_pruning/requirements/base \
scripts/structures_pruning/requirements/testing
define COMMON_CONSTRAINTS_TEMP_COMMENT
# This is a temporary solution to override the real common_constraints.txt\n# In edx-lint, until the pyjwt constraint in edx-lint has been removed.\n# See BOM-2721 for more details.\n# Below is the copied and edited version of common_constraints\n

View File

@@ -0,0 +1,73 @@
Structures Pruning Scripts
==========================
`This <https://github.com/openedx/edx-platform/tree/master/scripts/structures_pruning>`_ directory contains mongo db structures pruning script that is migrated from the
`tubular <https://github.com/openedx/tubular>`_ repository.
This script can be called from any automation/CD framework.
How to run the scripts
======================
Download the Scripts
--------------------
To download the scripts, you can perform a partial clone of the edx-platform repository to obtain only the required scripts. The following steps demonstrate how to achieve this. Alternatively, you may choose other utilities or libraries for the partial clone.
.. code-block:: bash
repo_url=git@github.com:openedx/edx-platform.git
branch=master
directory=scripts/structures_pruning
git clone --branch $branch --single-branch --depth=1 --filter=tree:0 $repo_url
cd edx-platform
git sparse-checkout init --cone
git sparse-checkout set $directory
Create Python Virtual Environment
---------------------------------
Create a Python virtual environment using Python 3.8:
.. code-block:: bash
python3.8 -m venv ../venv
source ../venv/bin/activate
Install Pip Packages
--------------------
Install the required pip packages using the provided requirements file:
.. code-block:: bash
pip install -r scripts/structures_pruning/requirements/base.txt
Execute Script
--------------
You can simply execute the Python scripts with the ``python`` command:
.. code-block:: bash
python scripts/structures_pruning/structures.py prune plan_file.json
Feel free to customize these steps according to your specific environment and requirements.
Run Test Cases
==============
Before running test cases, install the testing requirements:
.. code-block:: bash
pip install -r scripts/structures_pruning/requirements/testing.txt
Run the test cases using pytest:
.. code-block:: bash
pytest scripts/structures_pruning

View File

View File

View File

@@ -0,0 +1,4 @@
click
click-log
edx-opaque-keys
pymongo

View File

@@ -0,0 +1,24 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# make upgrade
#
click==8.1.7
# via
# -r scripts/structures_pruning/requirements/base.in
# click-log
click-log==0.4.0
# via -r scripts/structures_pruning/requirements/base.in
edx-opaque-keys==2.5.1
# via -r scripts/structures_pruning/requirements/base.in
pbr==6.0.0
# via stevedore
pymongo==3.13.0
# via
# -r scripts/structures_pruning/requirements/base.in
# edx-opaque-keys
stevedore==5.2.0
# via edx-opaque-keys
typing-extensions==4.10.0
# via edx-opaque-keys

View File

@@ -0,0 +1,4 @@
-r base.txt
ddt
pytest

View File

@@ -0,0 +1,44 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# make upgrade
#
click==8.1.7
# via
# -r scripts/structures_pruning/requirements/base.txt
# click-log
click-log==0.4.0
# via -r scripts/structures_pruning/requirements/base.txt
ddt==1.7.2
# via -r scripts/structures_pruning/requirements/testing.in
edx-opaque-keys==2.5.1
# via -r scripts/structures_pruning/requirements/base.txt
exceptiongroup==1.2.0
# via pytest
iniconfig==2.0.0
# via pytest
packaging==24.0
# via pytest
pbr==6.0.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# stevedore
pluggy==1.4.0
# via pytest
pymongo==3.13.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys
pytest==8.1.1
# via -r scripts/structures_pruning/requirements/testing.in
stevedore==5.2.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys
tomli==2.0.1
# via pytest
typing-extensions==4.10.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys

View File

@@ -0,0 +1,200 @@
#! /usr/bin/env python3
"""
Script to detect and prune old Structure documents from the "Split" Modulestore
MongoDB (edxapp.modulestore.structures by default). See docstring/help for the
"make_plan" and "prune" commands for more details.
"""
import logging
from os import path
import sys
import click
import click_log
# Add top-level project path to sys.path before importing scripts code
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
from scripts.structures_pruning.utils.splitmongo import SplitMongoBackend, ChangePlan
# Add top-level module path to sys.path before importing tubular code.
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# from tubular.splitmongo import ChangePlan, SplitMongoBackend # pylint: disable=wrong-import-position
LOG = logging.getLogger('structures')
click_log.basic_config(LOG)
@click.group()
@click.option(
    '--connection',
    default="mongodb://localhost:27017",
    help=(
        'Connection string to the target mongo database. This defaults to '
        'localhost without password (that will work against devstack). '
        'You may need to use urllib.parse.quote_plus() to percent-escape '
        'your username and password.'
    )
)
@click.option(
    '--database-name',
    default='edxapp',
    help='Name of the edX Mongo database containing the course structures to prune.'
)
@click.pass_context
def cli(ctx, connection, database_name):
    """
    Recover space on MongoDB for edx-platform by deleting unreachable,
    historical course content data. To use, first make a change plan with the
    "make_plan" command, and then execute that plan against the database with
    the "prune" command.

    This script provides logic to clean up old, unused course content data for
    the DraftVersioningModuleStore modulestore, more commonly referred to as the
    "Split Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
    SplitMongoModuleStore). All courses and assets that have newer style locator
    keys use DraftVersioningModuleStore. These keys start with "course-v1:",
    "ccx-v1:", or "block-v1:". Studio authored content data for this modulestore
    is saved as immutable data structures. The edx-platform code never cleans up
    old data however, meaning there is an unbounded history of a course's
    content revisions stored in MongoDB.

    The older modulestore is DraftModuleStore, sometimes called "Old Mongo".
    This code does not address that modulestore in any way. That modulestore
    handles courses that use the old "/" separator, such as
    "MITx/6.002x/2012_Spring", as well as assets starting with "i4x://".
    """
    # click may hand us a context with no obj yet; make sure there is a dict
    # to stash shared state in before subcommands run.
    if ctx.obj is None:
        ctx.obj = dict()
    # Shared database backend used by both the "make_plan" and "prune"
    # subcommands (retrieved via ctx.obj['BACKEND'] in each command).
    ctx.obj['BACKEND'] = SplitMongoBackend(connection, database_name)
@cli.command("make_plan")
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('w'))
@click.option(
    '--details',
    type=click.File('w'),
    default=None,
    help="Name of file to write the human-readable details of the Change Plan."
)
@click.option(
    '--retain',
    default=2,
    type=click.IntRange(0, None),
    help=("The maximum number of intermediate structures to preserve for any "
          "single branch of an active version. This value does not include the "
          "active or original structures (those are always preserved). Defaults "
          "to 2. Put 0 here if you want to prune as much as possible.")
)
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between queries to fetch structures from MongoDB "
          "during plan creation. Tune to adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=10000,
    type=click.IntRange(1, None),
    help="How many Structures do we fetch at a time?"
)
@click.option(
    '--ignore-missing/--no-ignore-missing',
    default=False,
    help=("Force plan creation, even if missing structures are found. "
          "Should repair invalid ids by repointing to original. "
          "Review of plan highly recommended")
)
@click.option(
    '--dump-structures/--no-dump-structures',
    default=False,
    # Fixed typo in user-facing help text: "strucutres" -> "structures".
    help="Dump all structures to stderr for debugging or recording state before cleanup."
)
@click.pass_context
def make_plan(ctx, plan_file, details, retain, delay, batch_size, ignore_missing, dump_structures):
    """
    Create a Change Plan JSON file describing the operations needed to prune the
    database. This command is read-only and does not alter the database.

    The Change Plan JSON is a dictionary with two keys:

    "delete" - A sorted array of Structure document IDs to delete. Since MongoDB
    object IDs are created in ascending order by timestamp, this means that the
    oldest documents come earlier in the list.

    "update_parents" - A list of [Structure ID, New Parent/Previous ID] pairs.
    This is used to re-link the oldest preserved Intermediate Structure back to
    the Original Structure, so that we don't leave the database in a state where
    a Structure's "previous_version" points to a deleted Structure.

    Specifying a --details file will generate a more verbose, human-readable
    text description of the Change Plan for verification purposes. The details
    file will only display Structures that are reachable from an Active Version,
    so any Structures that are "orphaned" as a result of partial runs of this
    script or Studio race conditions will not be reflected. That being said,
    orphaned Structures are detected and properly noted in the Change Plan JSON.
    """
    # --delay is milliseconds at the CLI, but the backend API expects seconds.
    structures_graph = ctx.obj['BACKEND'].structures_graph(delay / 1000.0, batch_size)
    # This will create the details file as a side-effect, if specified.
    change_plan = ChangePlan.create(structures_graph, retain, ignore_missing, dump_structures, details)
    change_plan.dump(plan_file)
@cli.command()
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('r'))
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between batch deletions during pruning. Tune to "
          "adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=1000,
    type=click.IntRange(1, None),
    help=("How many Structures do we delete at a time? Tune to adjust load on "
          "the database.")
)
@click.option(
    '--start',
    default=None,
    help=("Structure ID to start deleting from. Specifying a Structure ID that "
          "is not in the Change Plan is an error. Specifying a Structure ID that "
          "has already been deleted is NOT an error, so it's safe to re-run.")
)
@click.pass_context
def prune(ctx, plan_file, delay, batch_size, start):
    """
    Prune the MongoDB database according to a Change Plan file.

    This command tries to be as safe as possible. It executes parent updates
    before deletes, so an interruption at any point should be safe in that it
    won't leave the structure graphs in an inconsistent state. It should also
    be safe to resume pruning with the same Change Plan in the event of an
    interruption.

    It's also safe to run while Studio is still operating, though you should be
    careful to test and tweak the delay and batch_size options to throttle load
    on your database.
    """
    change_plan = ChangePlan.load(plan_file)
    # Fail fast if --start names an ID the plan never scheduled for deletion;
    # resuming from such an ID would silently do the wrong thing.
    if start is not None and start not in change_plan.delete:
        raise click.BadParameter(
            "{} is not in the Change Plan {}".format(
                start, click.format_filename(plan_file.name)
            ),
            param_hint='--start'
        )
    # --delay is milliseconds at the CLI, but the backend API expects seconds.
    ctx.obj['BACKEND'].update(change_plan, delay / 1000.0, batch_size, start)
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    # pylint doesn't grok click magic, but this is straight from their docs...
    cli(obj={})  # pylint: disable=no-value-for-parameter, unexpected-keyword-arg

View File

@@ -0,0 +1,502 @@
"""
Test Structure pruning related Split Mongo code.
IMPORTANT: If you are making changes to this code, please re-enable the
TestSplitMongoBackend tests and run them locally against the MongoDB instance
in your Docker Devstack. See the TestSplitMongoBackend docstring for more info.
"""
import itertools
import sys
import textwrap
import unittest
from datetime import datetime
from io import StringIO
from os import path
from unittest.mock import patch
import ddt
from bson.objectid import ObjectId
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
from pymongo import MongoClient
# Add top-level project path to sys.path before importing scripts code
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
from scripts.structures_pruning.utils.splitmongo import (
ActiveVersionBranch, ChangePlan, Structure, SplitMongoBackend, StructuresGraph
)
def create_test_graph(*version_histories):
    """
    Given any number of lists, where each list represents a history of Structure
    IDs from oldest to newest, return a StructuresGraph matching that
    specification. Course names, branch names, and other attributes that exist
    for debugging/reporting but do not change pruning behavior will be
    automatically generated with plausible values.
    """
    all_structures = {}
    all_active_version_branches = []
    # Infinite pools of generated IDs/keys/branch names; each branch we build
    # draws the next value from each pool so every branch looks distinct.
    active_id_pool = ("A{:023x}".format(i) for i in itertools.count(1))
    course_key_pool = (
        CourseLocator('edx', 'splitmongo', str(i)) for i in itertools.count(1)
    )
    branch_pool = itertools.cycle(['draft-branch', 'published-branch'])
    for version_history in version_histories:
        assert version_history  # The history can't be empty
        structure_ids = [str(version) for version in version_history]
        # Create the Original (first ID in the history; it is its own original
        # and has no previous version).
        original_id = structure_ids[0]
        history = [Structure(original_id, original_id, None)]
        # Create all other Structures (if any), each chained to its predecessor.
        for previous_id, current_id in zip(structure_ids, structure_ids[1:]):
            history.append(Structure(current_id, original_id, previous_id))
        # Add to our overall Structures dict (overwrites should be identical or
        # our test data is bad).
        for structure in history:
            if structure.id in all_structures:
                assert structure == all_structures[structure.id]
            else:
                all_structures[structure.id] = structure
        # The newest Structure in the history becomes the branch's Active version.
        active_version_id = structure_ids[-1]
        all_active_version_branches.append(
            ActiveVersionBranch(
                id=next(active_id_pool),
                branch=next(branch_pool),
                structure_id=active_version_id,
                key=next(course_key_pool),
                edited_on=datetime(2012, 5, 2)
            )
        )
    return StructuresGraph(all_active_version_branches, all_structures)
@ddt.ddt
class TestCourseChangePlan(unittest.TestCase):
    """
    ChangePlans for single and multiple courses.

    Each test builds an in-memory StructuresGraph with create_test_graph() and
    checks the delete/update_parents output of ChangePlan.create -- no database
    connection is needed.
    """

    def test_simple(self):
        """Simple happy path ChangePlans."""
        graph = create_test_graph(["1", "2", "3", "4"])
        # Preserve no intermediate structures -- prune the middle structures.
        plan_no_intermediate = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan_no_intermediate.delete, ["2", "3"])
        self.assertEqual(plan_no_intermediate.update_parents, [("4", "1")])
        # Preserve one intermediate structure
        plan_1_intermediate = ChangePlan.create(graph, 1, False, False)
        self.assertEqual(plan_1_intermediate.delete, ["2"])
        self.assertEqual(plan_1_intermediate.update_parents, [("3", "1")])
        # Preserve two intermediate structures -- Do nothing
        plan_2_intermediate = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_2_intermediate.delete, [])
        self.assertEqual(plan_2_intermediate.update_parents, [])

    @ddt.data(
        create_test_graph(["1"]),  # Original (is also Active)
        create_test_graph(["1", "2"]),  # "1" = Original, "2" = Active
    )
    def test_no_changes(self, graph):
        """These scenarios should result in no Changes."""
        plan_1 = ChangePlan.create(graph, 0, False, False)
        plan_2 = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_1, plan_2)
        self.assertEqual(plan_1.delete, [])
        self.assertEqual(plan_1.update_parents, [])

    def test_overlapping_shared_history(self):
        """Test multiple branches that overlap in what history to preserve."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5"],
            ["1", "2", "3", "6"],
            ["1", "2", "7", "8", "9", "10"],
        )
        plan = ChangePlan.create(graph, 1, False, False)
        # We specified only one intermediate structure in each branch should be
        # preserved. So why do we only delete "7" and "8" here?
        # "1" is the original structure, and will always be preserved.
        # "2" is the intermediate structure preserved by the first branch. It
        # won't be deleted, even if other branches might want to flag it for
        # deletion.
        # "3" would be deleted by the second branch, but it's Active in the
        # first, and so is preserved. Active Structures are never deleted.
        # "4" is preserved by the second branch.
        # "5" is the Active Structure for the second branch.
        # "6" is the Active Structure for the third branch.
        # "7" is marked for deletion by the fourth branch.
        # "8" is marked for deletion by the fourth branch.
        # "9" is preserved by the fourth branch.
        # "10" is the Active Structure for the fourth branch.
        self.assertEqual(plan.delete, ["7", "8"])
        self.assertEqual(plan.update_parents, [("9", "1")])

    def test_non_overlapping_shared_history(self):
        """Test shared history, preserved intermediate set doesn't overlap."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5", "6"],
        )
        plan = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan.delete, ["2", "4", "5"])
        self.assertEqual(plan.update_parents, [("3", "1"), ("6", "1")])
        graph_save_1 = create_test_graph(
            ["1", "2", "3", "4"],
            ["1", "2", "3", "4", "5", "6", "7"],
        )
        plan_save_1 = ChangePlan.create(graph_save_1, 1, False, False)
        self.assertEqual(plan_save_1.delete, ["2", "5"])
        self.assertEqual(plan_save_1.update_parents, [("3", "1"), ("6", "1")])

    def test_details_output(self):
        """Test our details file output."""
        graph = create_test_graph(
            ["1"],
            ["2", "3"],
            ["4", "5", "6"]
        )
        buff = StringIO()
        buff.name = "test_file.txt"
        plan = ChangePlan.create(graph, 0, False, False, buff)
        details_txt = buff.getvalue()
        # NOTE(review): the leading whitespace inside this dedent'd literal must
        # match the details format ChangePlan.create writes byte-for-byte --
        # confirm against real output if this assertion fails on indentation.
        # pylint: disable=line-too-long
        expected_output = textwrap.dedent(
            """
            == Summary ==
            Active Version Branches: 3
            Total Structures: 6
            Structures to Save: 5
            Structures to Delete: 1
            Structures to Rewrite Parent Link: 1
            == Active Versions ==
            Active Version A00000000000000000000001 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+1
            + 1 (active) (original)
            Active Version A00000000000000000000002 [2012-05-02 00:00:00] published-branch for course-v1:edx+splitmongo+2
            + 3 (active)
            + 2 (original)
            Active Version A00000000000000000000003 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+3
            + 6 (active) (re-link to original)
            - 5
            + 4 (original)
            """
        ).lstrip()
        # pylint: enable=line-too-long
        self.assertEqual(expected_output, details_txt)
        self.assertEqual(
            plan,
            ChangePlan(
                delete=["5"],
                update_parents=[("6", "4")]
            )
        )
class TestSplitMongoBackendHelpers(unittest.TestCase):
    """
    Test the static helper methods of SplitMongoBackend.

    Requires no actual database connection.
    """

    def test_parse_structure_doc(self):
        """Test basic parsing of Structures."""
        # An Original structure points to itself as original_version and has no
        # previous_version; unrecognized document fields are dropped.
        original_structure = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(1),
                'original_version': obj_id(1),
                'previous_version': None,
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            original_structure,
            Structure(id=str_id(1), original_id=str_id(1), previous_id=None)
        )
        self.assertTrue(original_structure.is_original())
        other_structure = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(2),
                'original_version': obj_id(1),
                'previous_version': obj_id(1),
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            other_structure,
            Structure(id=str_id(2), original_id=str_id(1), previous_id=str_id(1))
        )
        self.assertFalse(other_structure.is_original())

    def test_batch(self):
        """Test the batch helper that breaks up iterables for DB operations."""
        self.assertEqual(
            list(SplitMongoBackend.batch([], 1)),
            []
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3], 1)),
            [[1], [2], [3]]
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3], 2)),
            [[1, 2], [3]]
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3, 4], 2)),
            [[1, 2], [3, 4]]
        )

    def test_iter_from_start(self):
        """Test what we use to resume deletion from a given Structure ID."""
        all_ids = [1, 2, 3]
        # start=None means iterate everything from the beginning.
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, None)),
            all_ids
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 1)),
            all_ids
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 2)),
            [2, 3]
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 3)),
            [3]
        )
        # A start value past every ID yields nothing (not an error).
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 4)),
            []
        )
@unittest.skip("Requires local MongoDB instance (run manually).")
class TestSplitMongoBackend(unittest.TestCase):
    """
    Tests the MongoDB-specific portions of the code.

    These tests should be about simple read/write from the database. Complex
    trees of Structures can be created and tested in TestCourseChangePlan
    without invoking the database.

    These tests will be disabled by default because I didn't want to add MongoDB
    as a test-time dependency for these scripts, and the only decent looking MongoDB
    mocking library I could find was no longer being maintained. Given how
    isolated the Split Mongo related code is (nothing else touches it),
    the main danger of breakage comes from file format changes in edx-platform,
    which automated testing at this level wouldn't catch anyway.

    So basically, if you want to work on this code, please run these tests
    locally by spinning up the MongoDB server used for Docker Devstack and
    commenting out the unittest.skip decorator above.
    """
    # Connection string and throwaway database used for these manual tests.
    CONNECT_STR = "mongodb://localhost:27017"
    DATABASE_NAME = "splitmongo_test"

    def setUp(self):
        """Clear our test MongoDB instance of data."""
        super().setUp()
        self.client = MongoClient(self.CONNECT_STR)
        database = self.client[self.DATABASE_NAME]
        # Remove anything that might have been there from a previous test.
        database.drop_collection('modulestore.active_versions')
        database.drop_collection('modulestore.structures')
        # Convenience pointers to our collections.
        self.active_versions = database['modulestore.active_versions']
        self.structures = database['modulestore.structures']
        # The backend we should use in our tests for querying.
        self.backend = SplitMongoBackend(self.CONNECT_STR, self.DATABASE_NAME)
        self.seed_data()

    def seed_data(self):
        """Create a Course and Library."""
        structure_docs = [
            # Branch 1
            dict(_id=obj_id(1), original_version=obj_id(1), previous_version=None),
            dict(_id=obj_id(2), original_version=obj_id(1), previous_version=obj_id(1)),
            dict(_id=obj_id(3), original_version=obj_id(1), previous_version=obj_id(2)),
            dict(_id=obj_id(4), original_version=obj_id(1), previous_version=obj_id(3)),
            # Branch 2
            dict(_id=obj_id(10), original_version=obj_id(10), previous_version=None),
            dict(_id=obj_id(11), original_version=obj_id(10), previous_version=obj_id(10)),
            # Branch 3
            dict(_id=obj_id(20), original_version=obj_id(20), previous_version=None),
        ]
        active_versions_docs = [
            # A Course with both draft and published branches.
            {
                '_id': obj_id(100),
                'edited_on': datetime(2012, 5, 2),
                'org': 'edx',
                'course': 'split_course',
                'run': '2017',
                'versions': {
                    'draft-branch': obj_id(4),
                    'published-branch': obj_id(11)
                }
            },
            # A Content Library (single "library" branch).
            {
                '_id': obj_id(101),
                'edited_on': datetime(2012, 5, 3),
                'org': 'edx',
                'course': 'split_library',
                'run': 'library',
                'versions': {
                    'library': obj_id(20),
                }
            }
        ]
        self.structures.insert_many(structure_docs)
        self.active_versions.insert_many(active_versions_docs)

    def test_structures_graph(self):
        """Test pulling a full graph out."""
        graph = self.backend.structures_graph(0, 100)
        self.assertEqual(
            graph.branches,
            [
                ActiveVersionBranch(
                    id=str_id(100),
                    branch='draft-branch',
                    structure_id=str_id(4),
                    key=CourseLocator('edx', 'split_course', '2017'),
                    edited_on=datetime(2012, 5, 2),
                ),
                ActiveVersionBranch(
                    id=str_id(100),
                    branch='published-branch',
                    structure_id=str_id(11),
                    key=CourseLocator('edx', 'split_course', '2017'),
                    edited_on=datetime(2012, 5, 2),
                ),
                ActiveVersionBranch(
                    id=str_id(101),
                    branch='library',
                    structure_id=str_id(20),
                    key=LibraryLocator('edx', 'split_library'),
                    edited_on=datetime(2012, 5, 3),
                ),
            ]
        )
        self.assertEqual(
            list(graph.structures.keys()),
            [str_id(i) for i in [1, 2, 3, 4, 10, 11, 20]]
        )

    def test_update(self):
        """Execute a simple update."""
        self.backend.update(
            ChangePlan(
                delete=[str_id(i) for i in [2, 3]],
                update_parents=[(str_id(4), str_id(1))]
            ),
            delay=0
        )
        graph = self.backend.structures_graph(0, 100)
        self.assertEqual(
            list(graph.structures.keys()),
            [str_id(i) for i in [1, 4, 10, 11, 20]]
        )
        self.assertEqual(
            graph.structures,
            {
                str_id(1): Structure(id=str_id(1), original_id=str_id(1), previous_id=None),
                # This one got its previous_id rewritten from 3 -> 1
                str_id(4): Structure(id=str_id(4), original_id=str_id(1), previous_id=str_id(1)),
                str_id(10): Structure(id=str_id(10), original_id=str_id(10), previous_id=None),
                str_id(11): Structure(id=str_id(11), original_id=str_id(10), previous_id=str_id(10)),
                str_id(20): Structure(id=str_id(20), original_id=str_id(20), previous_id=None),
            }
        )

    def test_race_condition(self):
        """Create new Structures during ChangePlan creation (simulated race)."""
        # Get the real method before we patch it...
        real_all_structures_fn = SplitMongoBackend._all_structures  # pylint: disable=protected-access

        def add_structures(backend, delay, batch_size):
            """Do what _all_structures() would do, then add new Structures."""
            structures = real_all_structures_fn(backend, delay, batch_size)
            # Create new Structures
            self.structures.insert_one(
                dict(_id=obj_id(5), original_version=obj_id(1), previous_version=obj_id(4)),
            )
            self.structures.insert_one(
                dict(_id=obj_id(6), original_version=obj_id(1), previous_version=obj_id(5)),
            )
            self.structures.insert_one(
                dict(_id=obj_id(7), original_version=obj_id(1), previous_version=obj_id(6)),
            )
            # Update the Draft branch of course-v1:edx+split_course+2017 to
            # point to one of the new Structures
            self.active_versions.update_one(
                {'_id': obj_id(100)},
                {'$set': {'versions.draft-branch': obj_id(5)}}
            )
            # Create an entirely new ActiveVersion and point it to the newest
            # Structure.
            self.active_versions.insert_one(
                {
                    '_id': obj_id(102),
                    'edited_on': datetime(2012, 5, 3),
                    'org': 'edx',
                    'course': 'split_library_race',
                    'run': 'library',
                    'versions': {
                        'library': obj_id(7),
                    }
                }
            )
            return structures

        with patch.object(SplitMongoBackend, '_all_structures', autospec=True) as all_structures_mock:
            all_structures_mock.side_effect = add_structures
            graph = self.backend.structures_graph(0, 100)
        self.assertEqual(len(graph.structures), 10)
        self.assertEqual(len(graph.branches), 4)
        plan = ChangePlan.create(graph, 0, False, False)
        self.assertNotIn(str_id(5), plan.delete)  # Active updated to this for our course.
        self.assertNotIn(str_id(7), plan.delete)  # Active for our new Library
        self.assertIn(str_id(4), plan.delete)  # Was our Active before
        self.assertIn(str_id(6), plan.delete)  # Intermediate structure to new Library
def str_id(int_id):
    """Zero-pad *int_id* to the 24-character string form PyMongo accepts for Object IDs."""
    return format(int_id, "024")
def obj_id(int_id):
    """Build an ObjectId from a small integer, zero-padded to 24 hex-compatible chars."""
    padded = "{:024}".format(int_id)
    return ObjectId(padded)

View File

@@ -0,0 +1,679 @@
"""
This module provides logic to clean up old, unused course content data for the
DraftVersioningModuleStore modulestore, more commonly referred to as the "Split
Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
SplitMongoModuleStore). All courses and assets that have newer style locator
keys use DraftVersioningModuleStore. These keys start with "course-v1:",
"ccx-v1:", or "block-v1:".
The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This
code does not address that modulestore in any way. That modulestore handles
courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as
well as assets starting with "i4x://".
"Split" gets its name from the fact that it separates the Structure of a course
from the content in the leaf nodes. In theory, the Structure is an outline of
the course that contains all the parent/child relations for different content
blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as
small, commonly inherited metadata like due dates. More detailed information
about any particular block of content is stored in a separate collection as
Definitions.
Both Structures and Definitions are immutable in Split. When a course is edited,
a new Structure is created, and the Active Versions entry for a course is
updated to point to that new Structure. In that way, we never get a partially
applied edit -- it either succeeds or fails atomically. The Active Versions
entry for a Course has pointers to "published" and "draft" Structures. There is
also a special "library" pointer that is only used by Content Libraries. We do
not need to distinguish between these for the purposes of cleanup.
The problem is that Structure documents have become far larger than they were
intended to be, and we never created code to properly clean them up. As such, it
is not uncommon for the majority of Mongo storage space to be used by old
Structure documents that are completely unused (and are unreachable) by LMS or
Studio.
This module provides cleanup functionality with various tweakable options for
how much history to preserve. For simplicity, it reads all Structure IDs into
memory instead of working on subsets of the data. As a practical matter, this
means that it will work for databases with up to about 10 million Structures
before RAM usage starts to become a problem.
"""
from collections import deque, namedtuple
from itertools import count, takewhile
import json
import logging
import os
import sys
import time
from bson.objectid import ObjectId
from pymongo import MongoClient, UpdateOne
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
LOG = logging.getLogger('structures')
class StructuresGraph(namedtuple('DatabaseSummary', 'branches structures')):
    """
    Summary of every Structure relationship in a database.

    Each Structure represents a saved state for a Course or Content Library.
    For each branch ("published", "draft", or "library"), there is a chain of
    Structures running from an Original to the currently Active one::

        Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

    `branches` is a list of ActiveVersionBranch objects -- what's currently
    live on the LMS and Studio. Active Structures referenced here must never
    be removed, or the site would break for users.

    `structures` maps Structure ID strings to Structure objects. Structures
    hold the IDs of their parent and original Structures rather than direct
    object references; most of the graph never needs to be traversed (see
    `ChangePlan` for why).
    """
    def traverse_ids(self, start_id, limit=None, include_start=False):
        """
        Walk the previous_id chain from `start_id`, yielding up to `limit`
        ancestor IDs. With `limit=None` the walk continues through the
        Original. With `include_start=True`, `start_id` itself is yielded
        first (it does not count against `limit`).
        """
        if include_start:
            yield start_id
        node_id = start_id
        for step in count():
            # Stop if we've walked off the known graph (e.g. a missing parent).
            if node_id not in self.structures:
                break
            # Honor the caller's cap on how many ancestors to yield.
            if limit is not None and step >= limit:
                break
            node_id = self.structures[node_id].previous_id
            # Originals have no previous_id; the chain ends here.
            if node_id is None:
                break
            yield node_id
class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
    """
    One branch of an Active Version document.

    A single Active Version document can point to several branches; this
    object represents exactly one of them. `branch` is one of "draft-branch",
    "published-branch", or "library": every Course has both a draft-branch and
    a published-branch, while Content Libraries have only a "library" branch.

    `key` is the Opaque Key for the Course or Library, kept mostly for
    debugging purposes (it is not part of the plan file).

    `edited_on` is a timestamp of the last modification of the Active Version
    document -- for a Course, that means the most recent edit of *either* the
    published-branch or the draft-branch. Like `key`, it is only here for
    debug output, not for pruning decisions.
    """
    def __str__(self):
        return (
            f"Active Version {self.id} "
            f"[{self.edited_on:%Y-%m-%d %H:%M:%S}] "
            f"{self.branch} for {self.key}"
        )
class Structure(namedtuple('Structure', 'id original_id previous_id')):
    """
    The slice of a SplitMongo Structure document that pruning cares about:
    the Structure's own ID (the str form of its ObjectId) plus the IDs of its
    Original and Previous Structure documents. `previous_id` is None for an
    Original Structure.

    A namedtuple is used deliberately: it is much more space efficient than a
    dict, and a database can hold millions of Structures.
    """
    def is_original(self):
        """Is this Structure an Original (i.e. one that should never be deleted)?"""
        return self.previous_id is None
class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
    """
    Summary of the pruning actions we want a Backend to take.
    The idea of having this data structure and being able to serialize it is so
    that we can save our plan of action somewhere for debugging, failure
    recovery, and batching updates.
    `delete` is a list of Structure IDs we want to delete.
    `update_parents` is a list of (structure_id, new_previous_id) tuples
    representing the previous_id updates we need to make.
    A ChangePlan is just a declarative description. It is the responsibility
    of the Backend to figure out how to implement a ChangePlan safely and
    efficiently in order to do the actual updates.
    """
    def dump(self, file_obj):
        """
        Serialize ChangePlan to a file (JSON format).
        `file_obj` must be a writable file object; its `name` attribute is
        used for logging the path that was written.
        """
        json.dump(
            {
                "delete": self.delete,
                "update_parents": self.update_parents,
            },
            file_obj,
            indent=2,
        )
        LOG.info(
            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
            os.path.realpath(file_obj.name),
            len(self.delete),
            len(self.update_parents)
        )
    @classmethod
    def load(cls, file_obj):
        """Load a ChangePlan from a JSON file. Takes a file object."""
        # NOTE: JSON round-trips the (structure_id, new_previous_id) tuples
        # as 2-element lists; consumers iterate/unpack them, so that's fine.
        data = json.load(file_obj)
        return cls(
            delete=data["delete"], update_parents=data["update_parents"]
        )
    @classmethod
    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
        """
        Given a StructuresGraph and a target number for intermediate Structures
        to preserve, return a ChangePlan that represents the changes needed to
        prune the database. The overall strategy is to iterate through all
        Active Structures, walk back through the ancestors, and add all the
        Structure IDs we should save to a set. After we have our save set, we
        know that we can delete all other structures without worrying about
        whether those Structures are reachable or knowing what their
        relationships are. This keeps things simpler, and means that we should
        be more resilient to failures when pruning.
        Structure documents exist in chains of parent/child relationships,
        starting with an Original Structure, having some number of Intermediate
        Structures, and ending in an Active Structure::
            Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active
        Pruning Rules:
        1. All Active Structures must be preserved, as those are being used by
           the LMS and Studio to serve course content.
        2. All Original Structures should be preserved, since those are used by
           the LMS and Studio to determine common shared ancestry between
           Structures.
        3. Up to `num_intermediate_structures` Intermediate Structures will be
           kept. These Structures are not actually used in edx-platform code,
           but they are sometimes used by developers to allow emergency reverts
           in course team support situations (e.g. someone accidentally wiped
           out their course with a bad import).
        4. The oldest preserved Intermediate Structure will be modified so that
           its `previous_id` is updated to point to the Original Structure. That
           way, we're not preserving references to the IDs of Structures that
           have been pruned.
        If `ignore_missing` is falsy and any ID in the save set has no
        Structure document, this logs an error and exits the process.
        If `dump_structures` is truthy, every Structure is logged along with
        its save/active/relink status. If `details_file` is given, a human
        readable summary is written to it via `write_details`.
        """
        structure_ids_to_save = set()
        set_parent_to_original = set()
        branches, structures = structures_graph
        # Figure out which Structures to save...
        for branch in branches:
            # Anything that's actively being pointed to (is the head of a branch)
            # must be preserved. This is what's being served by Studio and LMS.
            active_structure_id = branch.structure_id
            structure_ids_to_save.add(active_structure_id)
            # All originals will be saved.
            structure_ids_to_save.add(structures[active_structure_id].original_id)
            # Save up to `num_intermediate_structures` intermediate nodes
            int_structure_ids_to_save = structures_graph.traverse_ids(
                active_structure_id, limit=num_intermediate_structures
            )
            for int_structure_id in int_structure_ids_to_save:
                structure_ids_to_save.add(int_structure_id)
        # traverse_ids() can yield an ID whose document is absent (membership
        # is only checked on the *next* loop iteration), and an original_id
        # may be absent too -- those IDs show up here as "missing".
        missing_structure_ids = structure_ids_to_save - structures.keys()
        if ignore_missing:
            # Remove missing structures since we can't save them
            structure_ids_to_save -= missing_structure_ids
        elif len(missing_structure_ids) > 0:
            LOG.error("Missing structures detected")
            sys.exit(1)
        # Figure out what links to rewrite -- the oldest structure to save that
        # isn't an original.
        for branch in branches:
            rewrite_candidates = takewhile(
                lambda s: s in structure_ids_to_save and not structures[s].is_original(),
                structures_graph.traverse_ids(branch.structure_id, include_start=True)
            )
            # `last_seen` will have the last structure_id from the
            # `rewrite_candidates` iterable (deque with maxlen=1 keeps only
            # the final element).
            last_seen = deque(rewrite_candidates, 1)
            if last_seen:
                structure = structures[last_seen.pop()]
                # Don't do a rewrite if it's just a no-op...
                if structure.original_id != structure.previous_id:
                    set_parent_to_original.add(structure.id)
        # Sort the items in the ChangePlan. This might not be helpful, but I'm
        # hoping that it will keep disk changes more localized and not thrash
        # things as much as randomly distributed deletes. Mongo ObjectIDs are
        # ordered (they have a timestamp component).
        change_plan = cls(
            delete=sorted(structures.keys() - structure_ids_to_save),
            update_parents=sorted(
                (s_id, structures[s_id].original_id)
                for s_id in set_parent_to_original
            )
        )
        if details_file:
            change_plan.write_details(
                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
            )
        # Optional debug output: one log line per known Structure.
        # NOTE(review): the local `prev_misssing` is misspelled (sic) but used
        # consistently; the log label is spelled "prev_missing".
        if dump_structures:
            active_structure_ids = {branch.structure_id for branch in branches}
            for sid in structures:
                save = sid in structure_ids_to_save
                active = sid in active_structure_ids
                relink = sid in set_parent_to_original
                prev_misssing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                LOG.info(f"DUMP id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_misssing}, rewrite_previous_to_original: {relink}")
        # Pure diagnostics from here down: for every missing ID, log which
        # Structures reference it and which branches lead to it.
        for missing_structure_id in missing_structure_ids:
            active_structure_ids = {branch.structure_id for branch in branches}
            LOG.error(f"Missing structure ID: {missing_structure_id}")
            original_ids = set()
            for structure in structures.values():
                if structure.previous_id == missing_structure_id:
                    # NOTE(review): save/active/relink/prev_misssing below are
                    # computed but never used in this loop's log line --
                    # presumably leftover debug scaffolding.
                    save = structure.id in structure_ids_to_save
                    active = structure.id in active_structure_ids
                    relink = structure.id in set_parent_to_original
                    prev_misssing = structure.previous_id is not None and structure.previous_id not in structures
                    LOG.info(f"Structure {structure.id} points to missing structure with ID: {structure.previous_id}")
                    original_ids.add(structure.original_id)
            active_structure_ids = {branch.structure_id for branch in branches}
            branches_to_log = []
            LOG.info(f"Looking for branches that lead to missing ID {missing_structure_id}")
            # A branch "leads to" the missing ID if it shares an original with
            # a referencing Structure and its ancestry walk hits an unknown ID.
            for branch in branches:
                structure = structures[branch.structure_id]
                if structure.original_id in original_ids:
                    for sid in structures_graph.traverse_ids(branch.structure_id):
                        if sid not in structures:
                            branches_to_log.append(branch)
            for branch in branches_to_log:
                structure = structures[branch.structure_id]
                LOG.info(f"Branch: {branch}")
                # NOTE(review): these four locals are also unused before being
                # recomputed inside the loop below.
                save = branch.structure_id in structure_ids_to_save
                active = branch.structure_id in active_structure_ids
                relink = branch.structure_id in set_parent_to_original
                prev_misssing = structure.previous_id is not None and structure.previous_id not in structures
                for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                    if sid in structures:
                        save = sid in structure_ids_to_save
                        active = sid in active_structure_ids
                        relink = sid in set_parent_to_original
                        prev_misssing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                        LOG.info(f"id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_misssing}, rewrite_previous_to_original: {relink}")
        return change_plan
    @staticmethod
    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
        """
        Simple dump of the changes we're going to make to the database.
        This method requires information that we don't actually keep in the
        ChangePlan file, such as the Course IDs and edit times. Because of this,
        it can only be created at the time the ChangePlan is being generated,
        and cannot be derived from an existing ChangePlan. The goal was to
        provide this debug information while keeping the ChangePlan file format
        as stupidly simple as possible.
        `details_file` is a writable file object; its `name` attribute is used
        for logging.
        """
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}
        def text_for(s_id):
            """Helper method to format Structures consistently."""
            # "+" means the Structure is kept, "-" means it will be deleted.
            action = "+" if s_id in structure_ids_to_save else "-"
            notes = []
            if s_id in active_structure_ids:
                notes.append("(active)")
            if s_id in set_parent_to_original:
                notes.append("(re-link to original)")
            if s_id in structures and structures[s_id].is_original():
                notes.append("(original)")
            if notes:
                return "{} {} {}".format(action, s_id, " ".join(notes))
            return "{} {}".format(action, s_id)
        print("== Summary ==", file=details_file)
        print("Active Version Branches: {}".format(len(branches)), file=details_file)
        print("Total Structures: {}".format(len(structures)), file=details_file)
        print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file)
        print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file)
        print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file)
        print("\n== Active Versions ==", file=details_file)
        # One section per branch: the branch line, then its ancestry chain.
        for branch in branches:
            print("{}".format(branch), file=details_file)
            for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                print(text_for(structure_id), file=details_file)
            print("", file=details_file)
        LOG.info(
            "Wrote Change Details File: %s", os.path.realpath(details_file.name)
        )
class SplitMongoBackend:
"""
Interface to the MongoDB backend. This is currently the only supported KV
store for the Split(DraftVersioning)ModuleStore, but having this as a
separate class makes it easier to stub in test data.
The methods on this class should accept and return backend-agnostic data
structures, so no BSON details should leak out.
"""
    def __init__(self, mongo_connection_str, db_name):
        """
        Connect to the MongoDB backend.

        `mongo_connection_str` is a pymongo connection string and `db_name`
        is the name of the database holding the modulestore collections.
        """
        # Short connect/server-selection timeouts so a bad host fails fast;
        # the socket timeout is long because our bulk reads/deletes can
        # legitimately take minutes.
        self._db = MongoClient(
            mongo_connection_str,
            connectTimeoutMS=2000,
            socketTimeoutMS=300000,  # *long* operations
            serverSelectionTimeoutMS=2000
        )
        # The two Split modulestore collections this backend operates on.
        self._active_versions = self._db[db_name].modulestore.active_versions
        self._structures = self._db[db_name].modulestore.structures
def structures_graph(self, delay, batch_size):
"""
Return StructuresGraph for the entire modulestore.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
This has one slight complication. A StructuresGraph is expected to be a
consistent view of the database, but MongoDB doesn't offer a "repeatable
read" transaction isolation mode. That means that Structures may be
added at any time between our database calls. Because of this, we have
to be careful in stitching together something that is safe. The
guarantees we try to make about the StructuresGraph being returned are:
1. Every Structure ID in `active_structure_ids` is also in `structures`
2. If `branches` is stale and there is a new Structure that is Active
in the database, it is *not* in `structures`.
Scenario A: We fetch branches, then structures
1. Get Branches (and thus Active Structure IDs)
2. New Structures created by Studio
3. Get all Structures
It is almost certainly the case that the new Structures created in (2)
should be active. Our algorithm works by starting from the Active
Structure IDs that we know about, making a "save" list, and then
deleting all other Structures. The problem in this scenario is that we
fetch the new Structures in (3), but we don't know that they're Active
because our `active_structure_ids` comes from (1) and is stale. So we
would in fact delete what should be Active Structures.
Scenario B: We fetch structures, then branches
1. Get all Structures
2. New Structures created by Studio
3. Get Branches (and thus Active Structure IDs)
In this scenario, we may see Active Structure IDs that are not in
our Structures dict. This is bad because we won't know how to crawl
their ancestry and mark the appropriate Structure IDs to be saved.
So the approach we take is Scenario B with a fallback. After we fetch
everything, we go through the Active Structure IDs and make sure that
those Structures and their ancestors exist in `structures`. If they
don't, we make extra fetches to get them. Misses should be rare, so it
shouldn't have a drastic performance impact overall.
Note that it's safe if the ChangePlan as a whole is a little stale, so
long as it's internally consistent. We only ever delete Structures that
are in the `structures` doc, so a new Active Version that we're
completely unaware of will be left alone.
"""
structures = self._all_structures(delay, batch_size)
branches = self._all_branches()
# Guard against the race condition that branch.structure_id or its
# ancestors are not in `structures`. Make sure that we add those.
LOG.info(
"Checking for missing Structures (a small number are expected "
"unless edits are disabled during change plan creation)."
)
missing_count = 0
for branch in branches:
structure_id = branch.structure_id
while structure_id and (structure_id not in structures):
structures[structure_id] = self._get_structure(structure_id)
missing_count += 1
LOG.warning(
"Structure %s linked from Active Structure %s (%s) fetched.",
structure_id,
branch.structure_id,
branch.key,
)
structure_id = structures[structure_id].previous_id
LOG.info("Finished checking for missing Structures, found %s", missing_count)
return StructuresGraph(branches, structures)
def _all_structures(self, delay, batch_size):
"""
Return a dict mapping Structure IDs to Structures for all Structures in
the database.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
"""
LOG.info("Fetching all known Structures (this might take a while)...")
LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size)
# Important to keep this as a generator to limit memory usage.
parsed_docs = (
self.parse_structure_doc(doc)
for doc
in self._structures_from_db(delay, batch_size)
)
structures = {structure.id: structure for structure in parsed_docs}
LOG.info("Fetched %s Structures", len(structures))
return structures
def _structures_from_db(self, delay, batch_size):
"""
Iterate through all Structure documents in the database.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
"""
cursor = self._structures.find(
projection=['original_version', 'previous_version']
)
cursor.batch_size(batch_size)
for i, structure_doc in enumerate(cursor, start=1):
yield structure_doc
if i % batch_size == 0:
LOG.info("Structure Cursor at %s (%s)", i, structure_doc['_id'])
time.sleep(delay)
def _all_branches(self):
"""Retrieve list of all ActiveVersionBranch objects in the database."""
branches = []
LOG.info("Fetching all Active Version Branches...")
for av_doc in self._active_versions.find():
for branch, obj_id in av_doc['versions'].items():
structure_id = str(obj_id)
if branch == 'library':
key = LibraryLocator(av_doc['org'], av_doc['course'])
else:
key = CourseLocator(av_doc['org'], av_doc['course'], av_doc['run'])
branches.append(
ActiveVersionBranch(
str(av_doc['_id']),
branch,
structure_id,
key,
av_doc['edited_on'],
)
)
LOG.info("Fetched %s Active Version Branches", len(branches))
return sorted(branches)
def _get_structure(self, structure_id):
"""Get an individual Structure from the database."""
structure_doc = self._structures.find_one(
{'_id': ObjectId(structure_id)},
projection=['original_version', 'previous_version']
)
return self.parse_structure_doc(structure_doc)
    def update(self, change_plan, delay=1000, batch_size=1000, start=None):
        """
        Update the backend according to the relinking and deletions specified in
        the change_plan.

        `delay` (seconds between batches) and `batch_size` (documents per
        batch) are forwarded to both steps; `start` optionally resumes the
        delete step from a given Structure ID.

        NOTE(review): the default `delay=1000` ends up in time.sleep(), i.e.
        nearly 17 minutes between batches -- confirm whether this was meant
        to be milliseconds or a much smaller number of seconds.
        """
        # Step 1: Relink - Change the previous pointer for the oldest structure
        # we want to keep, so that it points back to the original. We never
        # delete the original. Relinking happens before deletion so that we
        # never leave our course in a broken state (at worst, parts of it
        # become unreachable).
        self._update_parents(change_plan.update_parents, delay, batch_size)
        # Step 2: Delete unused Structures
        self._delete(change_plan.delete, delay, batch_size, start)
def _update_parents(self, id_parent_pairs, delay, batch_size):
"""
Update Structure parent relationships.
`id_parent_pairs` is a list of tuples, where the first element of each
tuple is a Structure ID (str) to target, and the second element is the
Structure ID that will be the new parent of the first element.
"""
for id_parent_pairs_batch in self.batch(id_parent_pairs, batch_size):
updates = [
UpdateOne(
{'_id': ObjectId(structure_id)},
{'$set': {'previous_version': ObjectId(previous_id)}}
)
for structure_id, previous_id in id_parent_pairs_batch
]
result = self._structures.bulk_write(updates)
LOG.info(
"Updated %s/%s parent relationships.",
result.bulk_api_result['nModified'],
result.bulk_api_result['nMatched'],
)
time.sleep(delay)
def _delete(self, structure_ids, delay, batch_size, start=None):
"""
Delete old structures in batches.
`structure_ids` is a list of Structure IDs to delete.
`delay` is the delay in seconds (floats are ok) between batch deletes.
`batch_size` is how many we try to delete in each batch statement.
"""
s_ids_with_offset = self.iter_from_start(structure_ids, start)
for structure_ids_batch in self.batch(s_ids_with_offset, batch_size):
result = self._structures.delete_many(
{
'_id': {
'$in': [ObjectId(s_id) for s_id in structure_ids_batch]
}
}
)
LOG.info(
"Deleted %s/%s Structures: %s - %s",
result.deleted_count,
len(structure_ids_batch),
structure_ids_batch[0],
structure_ids_batch[-1],
)
time.sleep(delay)
@staticmethod
def parse_structure_doc(structure_doc):
"""
Structure docs are pretty big, but we only care about three top level
fields, all of which are ObjectIds:
_id: The Structure ID
previous_version: The Structure ID for the parent. An Original
Structure will have None for this field.
original_version: The Original Structure that this Structure and all
its ancestors are ultimately dervied from. An
Original Structure points to itself with this field.
"""
_id = str(structure_doc['_id'])
original_id = str(structure_doc['original_version'])
previous_id = structure_doc['previous_version']
if previous_id is not None:
previous_id = str(previous_id)
return Structure(_id, original_id, previous_id)
@staticmethod
def batch(iterable, batch_size):
"""Yield lists of up to `batch_size` in length from `iterable`."""
iterator = iter(iterable)
curr_batch = []
for i in count(1):
try:
curr_batch.append(next(iterator))
if i % batch_size == 0:
yield curr_batch
curr_batch = []
except StopIteration:
break
if curr_batch:
yield curr_batch
@staticmethod
def iter_from_start(structure_ids, start=None):
"""
Yields from an iterable once it encounters the `start` value. If `start`
is None, just yields from the beginning.
"""
if start is None:
for structure_id in structure_ids:
yield structure_id
return
for structure_id in structure_ids:
if structure_id < start:
continue
yield structure_id