chore: Moved structures.py from tubular repository (#34328)
* chore: Moved structures.py from tubular repository
This commit is contained in:
committed by
GitHub
parent
da244a99d3
commit
7808913916
33
.github/workflows/units-test-scripts-structures-pruning.yml
vendored
Normal file
33
.github/workflows/units-test-scripts-structures-pruning.yml
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
name: units-test-scripts-structures-pruning
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [ '3.8', '3.12' ]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r scripts/structures_pruning/requirements/testing.txt
|
||||
|
||||
- name: Run pytest
|
||||
run: |
|
||||
pytest scripts/structures_pruning
|
||||
4
Makefile
4
Makefile
@@ -142,7 +142,9 @@ REQ_FILES = \
|
||||
requirements/edx/semgrep \
|
||||
scripts/xblock/requirements \
|
||||
scripts/user_retirement/requirements/base \
|
||||
scripts/user_retirement/requirements/testing
|
||||
scripts/user_retirement/requirements/testing \
|
||||
scripts/structures_pruning/requirements/base \
|
||||
scripts/structures_pruning/requirements/testing
|
||||
|
||||
define COMMON_CONSTRAINTS_TEMP_COMMENT
|
||||
# This is a temporary solution to override the real common_constraints.txt\n# In edx-lint, until the pyjwt constraint in edx-lint has been removed.\n# See BOM-2721 for more details.\n# Below is the copied and edited version of common_constraints\n
|
||||
|
||||
73
scripts/structures_pruning/README.rst
Normal file
73
scripts/structures_pruning/README.rst
Normal file
@@ -0,0 +1,73 @@
|
||||
Structures Pruning Scripts
|
||||
==========================
|
||||
|
||||
`This <https://github.com/openedx/edx-platform/tree/master/scripts/structures_pruning>`_ directory contains the MongoDB structures pruning script that was migrated from the
|
||||
`tubular <https://github.com/openedx/tubular>`_ repository.
|
||||
|
||||
|
||||
This script could be called from any automation/CD framework.
|
||||
|
||||
How to run the scripts
|
||||
======================
|
||||
|
||||
Download the Scripts
|
||||
--------------------
|
||||
|
||||
To download the scripts, you can perform a partial clone of the edx-platform repository to obtain only the required scripts. The following steps demonstrate how to achieve this. Alternatively, you may choose other utilities or libraries for the partial clone.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
repo_url=git@github.com:openedx/edx-platform.git
|
||||
branch=master
|
||||
directory=scripts/structures_pruning
|
||||
|
||||
git clone --branch $branch --single-branch --depth=1 --filter=tree:0 $repo_url
|
||||
cd edx-platform
|
||||
git sparse-checkout init --cone
|
||||
git sparse-checkout set $directory
|
||||
|
||||
Create Python Virtual Environment
|
||||
---------------------------------
|
||||
|
||||
Create a Python virtual environment using Python 3.8:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3.8 -m venv ../venv
|
||||
source ../venv/bin/activate
|
||||
|
||||
Install Pip Packages
|
||||
--------------------
|
||||
|
||||
Install the required pip packages using the provided requirements file:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -r scripts/structures_pruning/requirements/base.txt
|
||||
|
||||
|
||||
Execute Script
|
||||
--------------
|
||||
|
||||
You can simply execute the Python script with the ``python`` command
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python scripts/structures_pruning/structures.py prune plan_file.json
|
||||
|
||||
Feel free to customize these steps according to your specific environment and requirements.
|
||||
|
||||
Run Test Cases
|
||||
==============
|
||||
|
||||
Before running test cases, install the testing requirements:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -r scripts/structures_pruning/requirements/testing.txt
|
||||
|
||||
Run the test cases using pytest:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pytest scripts/structures_pruning
|
||||
0
scripts/structures_pruning/__init__.py
Normal file
0
scripts/structures_pruning/__init__.py
Normal file
0
scripts/structures_pruning/pytest.ini
Normal file
0
scripts/structures_pruning/pytest.ini
Normal file
4
scripts/structures_pruning/requirements/base.in
Normal file
4
scripts/structures_pruning/requirements/base.in
Normal file
@@ -0,0 +1,4 @@
|
||||
click
|
||||
click-log
|
||||
edx-opaque-keys
|
||||
pymongo
|
||||
24
scripts/structures_pruning/requirements/base.txt
Normal file
24
scripts/structures_pruning/requirements/base.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# make upgrade
|
||||
#
|
||||
click==8.1.7
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.in
|
||||
# click-log
|
||||
click-log==0.4.0
|
||||
# via -r scripts/structures_pruning/requirements/base.in
|
||||
edx-opaque-keys==2.5.1
|
||||
# via -r scripts/structures_pruning/requirements/base.in
|
||||
pbr==6.0.0
|
||||
# via stevedore
|
||||
pymongo==3.13.0
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.in
|
||||
# edx-opaque-keys
|
||||
stevedore==5.2.0
|
||||
# via edx-opaque-keys
|
||||
typing-extensions==4.10.0
|
||||
# via edx-opaque-keys
|
||||
4
scripts/structures_pruning/requirements/testing.in
Normal file
4
scripts/structures_pruning/requirements/testing.in
Normal file
@@ -0,0 +1,4 @@
|
||||
-r base.txt
|
||||
|
||||
ddt
|
||||
pytest
|
||||
44
scripts/structures_pruning/requirements/testing.txt
Normal file
44
scripts/structures_pruning/requirements/testing.txt
Normal file
@@ -0,0 +1,44 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.8
|
||||
# by the following command:
|
||||
#
|
||||
# make upgrade
|
||||
#
|
||||
click==8.1.7
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.txt
|
||||
# click-log
|
||||
click-log==0.4.0
|
||||
# via -r scripts/structures_pruning/requirements/base.txt
|
||||
ddt==1.7.2
|
||||
# via -r scripts/structures_pruning/requirements/testing.in
|
||||
edx-opaque-keys==2.5.1
|
||||
# via -r scripts/structures_pruning/requirements/base.txt
|
||||
exceptiongroup==1.2.0
|
||||
# via pytest
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
packaging==24.0
|
||||
# via pytest
|
||||
pbr==6.0.0
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.txt
|
||||
# stevedore
|
||||
pluggy==1.4.0
|
||||
# via pytest
|
||||
pymongo==3.13.0
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.txt
|
||||
# edx-opaque-keys
|
||||
pytest==8.1.1
|
||||
# via -r scripts/structures_pruning/requirements/testing.in
|
||||
stevedore==5.2.0
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.txt
|
||||
# edx-opaque-keys
|
||||
tomli==2.0.1
|
||||
# via pytest
|
||||
typing-extensions==4.10.0
|
||||
# via
|
||||
# -r scripts/structures_pruning/requirements/base.txt
|
||||
# edx-opaque-keys
|
||||
200
scripts/structures_pruning/structures.py
Normal file
200
scripts/structures_pruning/structures.py
Normal file
@@ -0,0 +1,200 @@
|
||||
#! /usr/bin/env python3
|
||||
"""
|
||||
Script to detect and prune old Structure documents from the "Split" Modulestore
|
||||
MongoDB (edxapp.modulestore.structures by default). See docstring/help for the
|
||||
"make_plan" and "prune" commands for more details.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from os import path
|
||||
import sys
|
||||
|
||||
import click
|
||||
import click_log
|
||||
|
||||
# Add top-level project path to sys.path before importing scripts code
|
||||
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
|
||||
|
||||
from scripts.structures_pruning.utils.splitmongo import SplitMongoBackend, ChangePlan
|
||||
|
||||
# Add top-level module path to sys.path before importing tubular code.
|
||||
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
# from tubular.splitmongo import ChangePlan, SplitMongoBackend # pylint: disable=wrong-import-position
|
||||
|
||||
LOG = logging.getLogger('structures')
|
||||
click_log.basic_config(LOG)
|
||||
|
||||
|
||||
@click.group()
@click.option(
    '--connection',
    default="mongodb://localhost:27017",
    help=(
        'Connection string to the target mongo database. This defaults to '
        'localhost without password (that will work against devstack). '
        'You may need to use urllib.parse.quote_plus() to percent-escape '
        'your username and password.'
    )
)
@click.option(
    '--database-name',
    default='edxapp',
    help='Name of the edX Mongo database containing the course structures to prune.'
)
@click.pass_context
def cli(ctx, connection, database_name):
    """
    Recover space on MongoDB for edx-platform by deleting unreachable,
    historical course content data. To use, first make a change plan with the
    "make_plan" command, and then execute that plan against the database with
    the "prune" command.

    This script provides logic to clean up old, unused course content data for
    the DraftVersioningModuleStore modulestore, more commonly referred to as the
    "Split Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
    SplitMongoModuleStore). All courses and assets that have newer style locator
    keys use DraftVersioningModuleStore. These keys start with "course-v1:",
    "ccx-v1:", or "block-v1:". Studio authored content data for this modulestore
    is saved as immutable data structures. The edx-platform code never cleans up
    old data however, meaning there is an unbounded history of a course's
    content revisions stored in MongoDB.

    The older modulestore is DraftModuleStore, sometimes called "Old Mongo".
    This code does not address that modulestore in any way. That modulestore
    handles courses that use the old "/" separator, such as
    "MITx/6.002x/2012_Spring", as well as assets starting with "i4x://".
    """
    # Click runs this group callback before any subcommand; ctx.obj is the
    # mutable state dict shared with "make_plan" and "prune".
    if ctx.obj is None:
        ctx.obj = dict()

    # One backend instance, shared by whichever subcommand runs next.
    ctx.obj['BACKEND'] = SplitMongoBackend(connection, database_name)
|
||||
|
||||
|
||||
@cli.command("make_plan")
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('w'))
@click.option(
    '--details',
    type=click.File('w'),
    default=None,
    help="Name of file to write the human-readable details of the Change Plan."
)
@click.option(
    '--retain',
    default=2,
    type=click.IntRange(0, None),
    help=("The maximum number of intermediate structures to preserve for any "
          "single branch of an active version. This value does not include the "
          "active or original structures (those are always preserved). Defaults "
          "to 2. Put 0 here if you want to prune as much as possible.")
)
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between queries to fetch structures from MongoDB "
          "during plan creation. Tune to adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=10000,
    type=click.IntRange(1, None),
    help="How many Structures do we fetch at a time?"
)
@click.option(
    '--ignore-missing/--no-ignore-missing',
    default=False,
    help=("Force plan creation, even if missing structures are found. "
          "Should repair invalid ids by repointing to original. "
          "Review of plan highly recommended")
)
@click.option(
    '--dump-structures/--no-dump-structures',
    default=False,
    # Fixed typo in user-facing help text: "strucutres" -> "structures".
    help="Dump all structures to stderr for debugging or recording state before cleanup."
)
@click.pass_context
def make_plan(ctx, plan_file, details, retain, delay, batch_size, ignore_missing, dump_structures):
    """
    Create a Change Plan JSON file describing the operations needed to prune the
    database. This command is read-only and does not alter the database.

    The Change Plan JSON is a dictionary with two keys:

    "delete" - A sorted array of Structure document IDs to delete. Since MongoDB
    object IDs are created in ascending order by timestamp, this means that the
    oldest documents come earlier in the list.

    "update_parents" - A list of [Structure ID, New Parent/Previous ID] pairs.
    This is used to re-link the oldest preserved Intermediate Structure back to
    the Original Structure, so that we don't leave the database in a state where
    a Structure's "previous_version" points to a deleted Structure.

    Specifying a --details file will generate a more verbose, human-readable
    text description of the Change Plan for verification purposes. The details
    file will only display Structures that are reachable from an Active Version,
    so any Structures that are "orphaned" as a result of partial runs of this
    script or Studio race conditions will not be reflected. That being said,
    orphaned Structures are detected and properly noted in the Change Plan JSON.
    """
    # --delay is milliseconds on the CLI; the backend expects seconds.
    structures_graph = ctx.obj['BACKEND'].structures_graph(delay / 1000.0, batch_size)

    # This will create the details file as a side-effect, if specified.
    change_plan = ChangePlan.create(structures_graph, retain, ignore_missing, dump_structures, details)
    change_plan.dump(plan_file)
|
||||
|
||||
|
||||
@cli.command()
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('r'))
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between batch deletions during pruning. Tune to "
          "adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=1000,
    type=click.IntRange(1, None),
    help=("How many Structures do we delete at a time? Tune to adjust load on "
          "the database.")
)
@click.option(
    '--start',
    default=None,
    help=("Structure ID to start deleting from. Specifying a Structure ID that "
          "is not in the Change Plan is an error. Specifying a Structure ID that "
          "has already been deleted is NOT an error, so it's safe to re-run.")
)
@click.pass_context
def prune(ctx, plan_file, delay, batch_size, start):
    """
    Prune the MongoDB database according to a Change Plan file.

    This command tries to be as safe as possible. It executes parent updates
    before deletes, so an interruption at any point should be safe in that it
    won't leave the structure graphs in an inconsistent state. It should also
    be safe to resume pruning with the same Change Plan in the event of an
    interruption.

    It's also safe to run while Studio is still operating, though you should be
    careful to test and tweak the delay and batch_size options to throttle load
    on your database.
    """
    change_plan = ChangePlan.load(plan_file)
    # --start must name an ID the plan actually deletes; anything else most
    # likely means the wrong plan file was supplied, so fail fast.
    if start is not None and start not in change_plan.delete:
        raise click.BadParameter(
            "{} is not in the Change Plan {}".format(
                start, click.format_filename(plan_file.name)
            ),
            param_hint='--start'
        )
    # --delay is milliseconds on the CLI; the backend expects seconds.
    ctx.obj['BACKEND'].update(change_plan, delay / 1000.0, batch_size, start)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # pylint doesn't grok click magic, but this is straight from their docs...
    # Passing obj={} seeds the shared context dict used by the subcommands.
    cli(obj={})  # pylint: disable=no-value-for-parameter, unexpected-keyword-arg
|
||||
0
scripts/structures_pruning/tests/__init__.py
Normal file
0
scripts/structures_pruning/tests/__init__.py
Normal file
502
scripts/structures_pruning/tests/test_splitmongo.py
Normal file
502
scripts/structures_pruning/tests/test_splitmongo.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
Test Structure pruning related Split Mongo code.
|
||||
|
||||
IMPORTANT: If you are making changes to this code, please re-enable the
|
||||
TestSplitMongoBackend tests and run them locally against the MongoDB instance
|
||||
in your Docker Devstack. See the TestSplitMongoBackend docstring for more info.
|
||||
"""
|
||||
import itertools
|
||||
import sys
|
||||
import textwrap
|
||||
import unittest
|
||||
from datetime import datetime
|
||||
from io import StringIO
|
||||
from os import path
|
||||
from unittest.mock import patch
|
||||
|
||||
import ddt
|
||||
from bson.objectid import ObjectId
|
||||
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
|
||||
from pymongo import MongoClient
|
||||
|
||||
# Add top-level project path to sys.path before importing scripts code
|
||||
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
|
||||
|
||||
from scripts.structures_pruning.utils.splitmongo import (
|
||||
ActiveVersionBranch, ChangePlan, Structure, SplitMongoBackend, StructuresGraph
|
||||
)
|
||||
|
||||
|
||||
def create_test_graph(*version_histories):
    """
    Build a StructuresGraph from any number of version histories.

    Each positional argument is a list of Structure IDs ordered from oldest
    to newest. Course names, branch names, and other attributes that exist
    for debugging/reporting but do not change pruning behavior are generated
    automatically with plausible values.
    """
    structures_by_id = {}
    branches = []

    # Infinite generators of plausible-looking metadata for each branch.
    branch_id_pool = ("A{:023x}".format(n) for n in itertools.count(1))
    course_key_pool = (
        CourseLocator('edx', 'splitmongo', str(n)) for n in itertools.count(1)
    )
    branch_name_pool = itertools.cycle(['draft-branch', 'published-branch'])

    for history in version_histories:
        assert history  # The history can't be empty
        ids = [str(version) for version in history]

        # The first entry is the Original: its own original, with no parent.
        root_id = ids[0]
        chain = [Structure(root_id, root_id, None)]

        # Each later Structure points back to its immediate predecessor.
        chain.extend(
            Structure(curr_id, root_id, prev_id)
            for prev_id, curr_id in zip(ids, ids[1:])
        )

        # Merge into the overall Structures dict; any overwrite must be
        # identical or our test data is inconsistent.
        for structure in chain:
            seen = structures_by_id.get(structure.id)
            if seen is None:
                structures_by_id[structure.id] = structure
            else:
                assert structure == seen

        # The newest Structure in the history is the Active one.
        branches.append(
            ActiveVersionBranch(
                id=next(branch_id_pool),
                branch=next(branch_name_pool),
                structure_id=ids[-1],
                key=next(course_key_pool),
                edited_on=datetime(2012, 5, 2)
            )
        )

    return StructuresGraph(branches, structures_by_id)
|
||||
|
||||
|
||||
@ddt.ddt
class TestCourseChangePlan(unittest.TestCase):
    """
    ChangePlans for single and multiple courses.

    ChangePlan.create's positional arguments are:
    (graph, retain, ignore_missing, dump_structures[, details_file]).
    """

    def test_simple(self):
        """Simple happy path ChangePlans."""
        graph = create_test_graph(["1", "2", "3", "4"])

        # Preserve no intermediate structures -- prune the middle structures.
        plan_no_intermediate = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan_no_intermediate.delete, ["2", "3"])
        self.assertEqual(plan_no_intermediate.update_parents, [("4", "1")])

        # Preserve one intermediate structure
        plan_1_intermediate = ChangePlan.create(graph, 1, False, False)
        self.assertEqual(plan_1_intermediate.delete, ["2"])
        self.assertEqual(plan_1_intermediate.update_parents, [("3", "1")])

        # Preserve two intermediate structures -- Do nothing
        plan_2_intermediate = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_2_intermediate.delete, [])
        self.assertEqual(plan_2_intermediate.update_parents, [])

    @ddt.data(
        create_test_graph(["1"]), # Original (is also Active)
        create_test_graph(["1", "2"]), # "1" = Original, "2" = Active
    )
    def test_no_changes(self, graph):
        """These scenarios should result in no Changes."""
        # Originals and Actives are always preserved, so no retain value
        # should produce deletions for these graphs.
        plan_1 = ChangePlan.create(graph, 0, False, False)
        plan_2 = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_1, plan_2)
        self.assertEqual(plan_1.delete, [])
        self.assertEqual(plan_1.update_parents, [])

    def test_overlapping_shared_history(self):
        """Test multiple branches that overlap in what history to preserve."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5"],
            ["1", "2", "3", "6"],
            ["1", "2", "7", "8", "9", "10"],
        )
        plan = ChangePlan.create(graph, 1, False, False)

        # We specified only one intermediate structure in each branch should be
        # preserved. So why do we only delete "7" and "8" here?
        # "1" is the original structure, and will always be preserved.
        # "2" is the intermediate structure preserved by the first branch. It
        # won't be deleted, even if other branches might want to flag it for
        # deletion.
        # "3" would be deleted by the second branch, but it's Active in the
        # first, and so is preserved. Active Structures are never deleted.
        # "4" is preserved by the second branch.
        # "5" is the Active Structure for the second branch.
        # "6" is the Active Structure for the third branch.
        # "7" is marked for deletion by the fourth branch.
        # "8" is marked for deletion by the fourth branch.
        # "9" is preserved by the fourth branch.
        # "10" is the Active Structure for the fourth branch.
        self.assertEqual(plan.delete, ["7", "8"])
        self.assertEqual(plan.update_parents, [("9", "1")])

    def test_non_overlapping_shared_history(self):
        """Test shared history, preserved intermediate set doesn't overlap."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5", "6"],
        )
        plan = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan.delete, ["2", "4", "5"])
        self.assertEqual(plan.update_parents, [("3", "1"), ("6", "1")])

        graph_save_1 = create_test_graph(
            ["1", "2", "3", "4"],
            ["1", "2", "3", "4", "5", "6", "7"],
        )
        plan_save_1 = ChangePlan.create(graph_save_1, 1, False, False)
        self.assertEqual(plan_save_1.delete, ["2", "5"])
        self.assertEqual(plan_save_1.update_parents, [("3", "1"), ("6", "1")])

    def test_details_output(self):
        """Test our details file output."""
        graph = create_test_graph(
            ["1"],
            ["2", "3"],
            ["4", "5", "6"]
        )
        # StringIO stands in for the --details file; ChangePlan.create reads
        # its .name attribute, so we have to fake one.
        buff = StringIO()
        buff.name = "test_file.txt"
        plan = ChangePlan.create(graph, 0, False, False, buff)
        details_txt = buff.getvalue()

        # pylint: disable=line-too-long
        expected_output = textwrap.dedent(
            """
            == Summary ==
            Active Version Branches: 3
            Total Structures: 6
            Structures to Save: 5
            Structures to Delete: 1
            Structures to Rewrite Parent Link: 1

            == Active Versions ==
            Active Version A00000000000000000000001 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+1
            + 1 (active) (original)

            Active Version A00000000000000000000002 [2012-05-02 00:00:00] published-branch for course-v1:edx+splitmongo+2
            + 3 (active)
            + 2 (original)

            Active Version A00000000000000000000003 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+3
            + 6 (active) (re-link to original)
            - 5
            + 4 (original)

            """
        ).lstrip()
        # pylint: enable=line-too-long
        self.assertEqual(expected_output, details_txt)
        self.assertEqual(
            plan,
            ChangePlan(
                delete=["5"],
                update_parents=[("6", "4")]
            )
        )
|
||||
|
||||
|
||||
class TestSplitMongoBackendHelpers(unittest.TestCase):
    """
    Test the static helper methods of SplitMongoBackend.

    Requires no actual database connection.
    """

    def test_parse_structure_doc(self):
        """Test basic parsing of Structures."""
        # An Original Structure is its own original and has no predecessor.
        parsed_original = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(1),
                'original_version': obj_id(1),
                'previous_version': None,
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            parsed_original,
            Structure(id=str_id(1), original_id=str_id(1), previous_id=None)
        )
        self.assertTrue(parsed_original.is_original())

        # A descendant Structure keeps links to both the original and its
        # immediate predecessor.
        parsed_child = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(2),
                'original_version': obj_id(1),
                'previous_version': obj_id(1),
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            parsed_child,
            Structure(id=str_id(2), original_id=str_id(1), previous_id=str_id(1))
        )
        self.assertFalse(parsed_child.is_original())

    def test_batch(self):
        """Test the batch helper that breaks up iterables for DB operations."""
        # (items, batch_size) -> expected list of batches
        cases = [
            ([], 1, []),
            ([1, 2, 3], 1, [[1], [2], [3]]),
            ([1, 2, 3], 2, [[1, 2], [3]]),
            ([1, 2, 3, 4], 2, [[1, 2], [3, 4]]),
        ]
        for items, size, expected in cases:
            self.assertEqual(
                list(SplitMongoBackend.batch(items, size)),
                expected
            )

    def test_iter_from_start(self):
        """Test what we use to resume deletion from a given Structure ID."""
        all_ids = [1, 2, 3]
        # (start value, expected remaining IDs). None means "from the top";
        # a start past the end yields nothing.
        cases = [
            (None, [1, 2, 3]),
            (1, [1, 2, 3]),
            (2, [2, 3]),
            (3, [3]),
            (4, []),
        ]
        for start, expected in cases:
            self.assertEqual(
                list(SplitMongoBackend.iter_from_start(all_ids, start)),
                expected
            )
|
||||
|
||||
|
||||
@unittest.skip("Requires local MongoDB instance (run manually).")
|
||||
class TestSplitMongoBackend(unittest.TestCase):
|
||||
"""
|
||||
Tests the MongoDB-specific portions of the code.
|
||||
|
||||
These tests should be about simple read/write from the database. Complex
|
||||
trees of Structures can be created and tested in TestSingleCourseChangePlan
|
||||
without invoking the database.
|
||||
|
||||
These tests will be disabled by default because I didn't want to add MongoDB
|
||||
as a test-time dependency for tubular, and the only decent looking MongoDB
|
||||
mocking library I could find was no longer being maintained. Given how
|
||||
isolated Split Mongo related code is in tubular (nothing else touches it),
|
||||
the main danger of breakage comes from file format changes in edx-platform,
|
||||
which automated testing at this level wouldn't catch anyway.
|
||||
|
||||
So basically, if you want to work on this code, please run these tests
|
||||
locally by spinning up the MongoDB server used for Docker Devstack and
|
||||
commenting out the unittest.skip decorator above.
|
||||
"""
|
||||
CONNECT_STR = "mongodb://localhost:27017"
|
||||
DATABASE_NAME = "splitmongo_test"
|
||||
|
||||
def setUp(self):
|
||||
"""Clear our test MongoDB instance of data."""
|
||||
super().setUp()
|
||||
|
||||
self.client = MongoClient(self.CONNECT_STR)
|
||||
database = self.client[self.DATABASE_NAME]
|
||||
|
||||
# Remove anything that might have been there from a previous test.
|
||||
database.drop_collection('modulestore.active_versions')
|
||||
database.drop_collection('modulestore.structures')
|
||||
|
||||
# Convenince pointers to our collections.
|
||||
self.active_versions = database['modulestore.active_versions']
|
||||
self.structures = database['modulestore.structures']
|
||||
|
||||
# The backend we should use in our tests for querying.
|
||||
self.backend = SplitMongoBackend(self.CONNECT_STR, self.DATABASE_NAME)
|
||||
self.seed_data()
|
||||
|
||||
def seed_data(self):
|
||||
"""Create a Course and Library."""
|
||||
structure_docs = [
|
||||
# Branch 1
|
||||
dict(_id=obj_id(1), original_version=obj_id(1), previous_version=None),
|
||||
dict(_id=obj_id(2), original_version=obj_id(1), previous_version=obj_id(1)),
|
||||
dict(_id=obj_id(3), original_version=obj_id(1), previous_version=obj_id(2)),
|
||||
dict(_id=obj_id(4), original_version=obj_id(1), previous_version=obj_id(3)),
|
||||
|
||||
# Branch 2
|
||||
dict(_id=obj_id(10), original_version=obj_id(10), previous_version=None),
|
||||
dict(_id=obj_id(11), original_version=obj_id(10), previous_version=obj_id(10)),
|
||||
|
||||
# Branch 3
|
||||
dict(_id=obj_id(20), original_version=obj_id(20), previous_version=None),
|
||||
]
|
||||
active_versions_docs = [
|
||||
{
|
||||
'_id': obj_id(100),
|
||||
'edited_on': datetime(2012, 5, 2),
|
||||
'org': 'edx',
|
||||
'course': 'split_course',
|
||||
'run': '2017',
|
||||
'versions': {
|
||||
'draft-branch': obj_id(4),
|
||||
'published-branch': obj_id(11)
|
||||
}
|
||||
},
|
||||
{
|
||||
'_id': obj_id(101),
|
||||
'edited_on': datetime(2012, 5, 3),
|
||||
'org': 'edx',
|
||||
'course': 'split_library',
|
||||
'run': 'library',
|
||||
'versions': {
|
||||
'library': obj_id(20),
|
||||
}
|
||||
}
|
||||
]
|
||||
self.structures.insert_many(structure_docs)
|
||||
self.active_versions.insert_many(active_versions_docs)
|
||||
|
||||
def test_structures_graph(self):
|
||||
"""Test pulling a full graph out."""
|
||||
graph = self.backend.structures_graph(0, 100)
|
||||
self.assertEqual(
|
||||
graph.branches,
|
||||
[
|
||||
ActiveVersionBranch(
|
||||
id=str_id(100),
|
||||
branch='draft-branch',
|
||||
structure_id=str_id(4),
|
||||
key=CourseLocator('edx', 'split_course', '2017'),
|
||||
edited_on=datetime(2012, 5, 2),
|
||||
),
|
||||
ActiveVersionBranch(
|
||||
id=str_id(100),
|
||||
branch='published-branch',
|
||||
structure_id=str_id(11),
|
||||
key=CourseLocator('edx', 'split_course', '2017'),
|
||||
edited_on=datetime(2012, 5, 2),
|
||||
),
|
||||
ActiveVersionBranch(
|
||||
id=str_id(101),
|
||||
branch='library',
|
||||
structure_id=str_id(20),
|
||||
key=LibraryLocator('edx', 'split_library'),
|
||||
edited_on=datetime(2012, 5, 3),
|
||||
),
|
||||
]
|
||||
)
|
||||
self.assertEqual(
|
||||
list(graph.structures.keys()),
|
||||
[str_id(i) for i in [1, 2, 3, 4, 10, 11, 20]]
|
||||
)
|
||||
|
||||
def test_update(self):
    """Execute a simple update."""
    plan = ChangePlan(
        delete=[str_id(2), str_id(3)],
        update_parents=[(str_id(4), str_id(1))]
    )
    self.backend.update(plan, delay=0)

    graph = self.backend.structures_graph(0, 100)

    # Structures 2 and 3 are gone; everything else survives.
    surviving_ids = [str_id(i) for i in (1, 4, 10, 11, 20)]
    self.assertEqual(list(graph.structures.keys()), surviving_ids)

    expected_structures = {
        str_id(1): Structure(id=str_id(1), original_id=str_id(1), previous_id=None),
        # This one got its previous_id rewritten from 3 -> 1
        str_id(4): Structure(id=str_id(4), original_id=str_id(1), previous_id=str_id(1)),
        str_id(10): Structure(id=str_id(10), original_id=str_id(10), previous_id=None),
        str_id(11): Structure(id=str_id(11), original_id=str_id(10), previous_id=str_id(10)),
        str_id(20): Structure(id=str_id(20), original_id=str_id(20), previous_id=None),
    }
    self.assertEqual(graph.structures, expected_structures)
|
||||
|
||||
def test_race_condition(self):
    """Create new Structures during ChangePlan creation."""
    # Get the real method before we patch it...
    real_all_structures_fn = SplitMongoBackend._all_structures  # pylint: disable=protected-access

    def add_structures(backend, delay, batch_size):
        """Do what _all_structures() would do, then add new Structures."""
        # Run the real fetch first, so the new documents inserted below are
        # NOT part of the returned `structures` dict -- simulating edits that
        # happen between the structures query and the active_versions query.
        structures = real_all_structures_fn(backend, delay, batch_size)

        # Create new Structures
        self.structures.insert_one(
            dict(_id=obj_id(5), original_version=obj_id(1), previous_version=obj_id(4)),
        )
        self.structures.insert_one(
            dict(_id=obj_id(6), original_version=obj_id(1), previous_version=obj_id(5)),
        )
        self.structures.insert_one(
            dict(_id=obj_id(7), original_version=obj_id(1), previous_version=obj_id(6)),
        )

        # Update the Draft branch of course-v1:edx+split_course+2017 to
        # point to one of the new Structures
        self.active_versions.update_one(
            {'_id': obj_id(100)},
            {'$set': {'versions.draft-branch': obj_id(5)}}
        )

        # Create an entirely new ActiveVersion and point it to the newest
        # Structure.
        self.active_versions.insert_one(
            {
                '_id': obj_id(102),
                'edited_on': datetime(2012, 5, 3),
                'org': 'edx',
                'course': 'split_library_race',
                'run': 'library',
                'versions': {
                    'library': obj_id(7),
                }
            }
        )

        return structures

    with patch.object(SplitMongoBackend, '_all_structures', autospec=True) as all_structures_mock:
        all_structures_mock.side_effect = add_structures
        graph = self.backend.structures_graph(0, 100)
        # 7 original structures + 3 inserted mid-fetch; the backend's
        # missing-structure fallback must have pulled in the new ones.
        self.assertEqual(len(graph.structures), 10)
        self.assertEqual(len(graph.branches), 4)

        plan = ChangePlan.create(graph, 0, False, False)
        self.assertNotIn(str_id(5), plan.delete)  # Active updated to this for our course.
        self.assertNotIn(str_id(7), plan.delete)  # Active for our new Library
        self.assertIn(str_id(4), plan.delete)  # Was our Active before
        self.assertIn(str_id(6), plan.delete)  # Intermediate structure to new Library
|
||||
|
||||
|
||||
def str_id(int_id):
    """Return the string version of Object IDs that PyMongo will accept."""
    # ObjectIds are 24 hex characters; zero-pad the test integer to match.
    return f"{int_id:024}"
|
||||
|
||||
|
||||
def obj_id(int_id):
    """Helper to create Object IDs that PyMongo will accept."""
    padded = str_id(int_id)
    return ObjectId(padded)
|
||||
679
scripts/structures_pruning/utils/splitmongo.py
Normal file
679
scripts/structures_pruning/utils/splitmongo.py
Normal file
@@ -0,0 +1,679 @@
|
||||
"""
|
||||
This module provides logic to clean up old, unused course content data for the
|
||||
DraftVersioningModuleStore modulestore, more commonly referred to as the "Split
|
||||
Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
|
||||
SplitMongoModuleStore). All courses and assets that have newer style locator
|
||||
keys use DraftVersioningModuleStore. These keys start with "course-v1:",
|
||||
"ccx-v1:", or "block-v1:".
|
||||
|
||||
The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This
|
||||
code does not address that modulestore in any way. That modulestore handles
|
||||
courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as
|
||||
well as assets starting with "i4x://".
|
||||
|
||||
"Split" gets its name from the fact that it separates the Structure of a course
|
||||
from the content in the leaf nodes. In theory, the Structure is an outline of
|
||||
the course that contains all the parent/child relations for different content
|
||||
blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as
|
||||
small, commonly inherited metadata like due dates. More detailed information
|
||||
about any particular block of content is stored in a separate collection as
|
||||
Definitions.
|
||||
|
||||
Both Structures and Definitions are immutable in Split. When a course is edited,
|
||||
a new Structure is created, and the Active Versions entry for a course is
|
||||
updated to point to that new Structure. In that way, we never get a partially
|
||||
applied edit -- it either succeeds or fails atomically. The Active Versions
|
||||
entry for a Course has pointers to "published" and "draft" Structures. There is
|
||||
also a special "library" pointer that is only used by Content Libraries. We do
|
||||
not need to distinguish between these for the purposes of cleanup.
|
||||
|
||||
The problem is that Structure documents have become far larger than they were
|
||||
intended to be, and we never created code to properly clean them up. As such, it
|
||||
is not uncommon for the majority of Mongo storage space to be used by old
|
||||
Structure documents that are completely unused (and are unreachable) by LMS or
|
||||
Studio.
|
||||
|
||||
This module provides cleanup functionality with various tweakable options for
|
||||
how much history to preserve. For simplicity, it reads all Structure IDs into
|
||||
memory instead of working on subsets of the data. As a practical matter, this
|
||||
means that it will work for databases with up to about 10 million Structures
|
||||
before RAM usage starts to become a problem.
|
||||
"""
|
||||
from collections import deque, namedtuple
|
||||
from itertools import count, takewhile
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
from bson.objectid import ObjectId
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
|
||||
|
||||
LOG = logging.getLogger('structures')
|
||||
|
||||
|
||||
class StructuresGraph(namedtuple('DatabaseSummary', 'branches structures')):
    """
    Summary of every Structure relationship in a database.

    Each Structure represents a saved state for a Course or Content Library.
    For every branch ("published", "draft", or "library") there is a chain of
    Structures running from an Original to the currently Active one::

        Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

    `branches` is a list of ActiveVersionBranch objects describing what LMS
    and Studio are currently serving; the Active Structures they reference
    must never be removed or the site breaks for users.

    `structures` maps Structure ID (str) to Structure. Structures hold the
    IDs of their parent and original Structures rather than object
    references, since most of the graph never needs to be traversed (see
    `ChangePlan` for why).
    """
    def traverse_ids(self, start_id, limit=None, include_start=False):
        """
        Walk the previous_id chain starting at `start_id`, yielding up to
        `limit` ancestor IDs. With `limit=None`, walk all the way through
        the Original. If `include_start` is set, `start_id` itself is
        yielded first.
        """
        if include_start:
            yield start_id

        steps_taken = 0
        node_id = start_id
        while node_id in self.structures:
            if limit is not None and steps_taken >= limit:
                break

            node_id = self.structures[node_id].previous_id
            if node_id is None:
                break

            yield node_id
            steps_taken += 1
|
||||
|
||||
|
||||
class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
    """
    One branch of an Active Version document.

    A single Active Version document can point at several branches. The
    `branch` value is "draft-branch", "published-branch", or "library": all
    Courses have a draft-branch and a published-branch, while Content
    Libraries have only a "library" branch.

    `key` is the Opaque Key for the Course or Library. It exists for
    debugging output only and is not part of the plan file.

    `edited_on` is the last-modified timestamp of the Active Version
    document (for a Course, the most recent edit to *either* branch). Like
    `key`, it is informational and not used for pruning decisions.
    """
    def __str__(self):
        timestamp = self.edited_on.strftime('%Y-%m-%d %H:%M:%S')
        return "Active Version {} [{}] {} for {}".format(
            self.id, timestamp, self.branch, self.key
        )
|
||||
|
||||
|
||||
class Structure(namedtuple('Structure', 'id original_id previous_id')):
    """
    The fields of a SplitMongo Structure document we care about: the ID
    (str form of the ObjectID) plus the IDs of the Original and Previous
    Structure documents. `previous_id` is None for an Original Structure.

    This is a namedtuple on purpose: it is far more space efficient than a
    dict, and there can be millions of Structures in memory at once.
    """
    def is_original(self):
        """Is this Structure an original (i.e. should never be deleted)?"""
        has_no_parent = self.previous_id is None
        return has_no_parent
|
||||
|
||||
|
||||
class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
    """
    Summary of the pruning actions we want a Backend to take.

    The idea of having this data structure and being able to serialize it is so
    that we can save our plan of action somewhere for debugging, failure
    recovery, and batching updates.

    `delete` is a list of Structure IDs we want to delete.

    `update_parents` is a list of (structure_id, new_previous_id) tuples
    representing the previous_id updates we need to make.

    A ChangePlan is purely declarative. It is the responsibility of the
    Backend to figure out how to implement a ChangePlan safely and efficiently
    in order to do the actual updates.
    """
    def dump(self, file_obj):
        """Serialize ChangePlan to a file (JSON format)."""
        json.dump(
            {
                "delete": self.delete,
                "update_parents": self.update_parents,
            },
            file_obj,
            indent=2,
        )
        LOG.info(
            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
            os.path.realpath(file_obj.name),
            len(self.delete),
            len(self.update_parents)
        )

    @classmethod
    def load(cls, file_obj):
        """Load a ChangePlan from a JSON file. Takes a file object."""
        data = json.load(file_obj)
        return cls(
            delete=data["delete"], update_parents=data["update_parents"]
        )

    @classmethod
    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
        """
        Given a StructuresGraph and a target number for intermediate Structures
        to preserve, return a ChangePlan that represents the changes needed to
        prune the database. The overall strategy is to iterate through all
        Active Structures, walk back through the ancestors, and add all the
        Structure IDs we should save to a set. After we have our save set, we
        know that we can delete all other structures without worrying about
        whether those Structures are reachable or knowing what their
        relationships are. This keeps things simpler, and means that we should
        be more resilient to failures when pruning.

        Structure documents exist in chains of parent/child relationships,
        starting with an Original Structure, having some number of Intermediate
        Structures, and ending in an Active Structure::

            Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

        Pruning Rules:

        1. All Active Structures must be preserved, as those are being used by
           the LMS and Studio to serve course content.

        2. All Original Structures should be preserved, since those are used by
           the LMS and Studio to determine common shared ancestry between
           Structures.

        3. Up to `num_intermediate_structures` Intermediate Structures will be
           kept. These Structures are not actually used in edx-platform code,
           but they are sometimes used by developers to allow emergency reverts
           in course team support situations (e.g. someone accidentally wiped
           out their course with a bad import).

        4. The oldest preserved Intermediate Structure will be modified so that
           its `previous_id` is updated to point to the Original Structure. That
           way, we're not preserving references to the IDs of Structures that
           have been pruned.

        Arguments:
            structures_graph: a StructuresGraph (branches, structures).
            num_intermediate_structures (int): intermediate history to keep.
            ignore_missing (bool): if False, exit the process when IDs we want
                to save are absent from `structures`; if True, drop them from
                the save set and log diagnostics about them.
            dump_structures (bool): if True, log one line per known Structure.
            details_file: optional open text file to receive a human-readable
                summary (see `write_details`).
        """
        structure_ids_to_save = set()
        set_parent_to_original = set()

        branches, structures = structures_graph
        # Used by several debug-logging sections below; compute once instead
        # of rebuilding the set inside the loops.
        active_structure_ids = {branch.structure_id for branch in branches}

        # Figure out which Structures to save...
        for branch in branches:
            # Anything that's actively being pointed to (is the head of a
            # branch) must be preserved. This is what's being served by Studio
            # and LMS.
            active_structure_id = branch.structure_id
            structure_ids_to_save.add(active_structure_id)

            # All originals will be saved.
            structure_ids_to_save.add(structures[active_structure_id].original_id)

            # Save up to `num_intermediate_structures` intermediate nodes.
            structure_ids_to_save.update(
                structures_graph.traverse_ids(
                    active_structure_id, limit=num_intermediate_structures
                )
            )

        # IDs we want to keep but that are not in `structures` (possible if
        # edits happened while the graph was being assembled).
        missing_structure_ids = structure_ids_to_save - structures.keys()

        if ignore_missing:
            # Remove missing structures since we can't save them
            structure_ids_to_save -= missing_structure_ids
        elif len(missing_structure_ids) > 0:
            LOG.error("Missing structures detected")
            sys.exit(1)

        # Figure out what links to rewrite -- the oldest structure to save that
        # isn't an original.
        for branch in branches:
            rewrite_candidates = takewhile(
                lambda s: s in structure_ids_to_save and not structures[s].is_original(),
                structures_graph.traverse_ids(branch.structure_id, include_start=True)
            )
            # `last_seen` will have the last structure_id from the
            # `rewrite_candidates` iterable (deque with maxlen=1).
            last_seen = deque(rewrite_candidates, 1)
            if last_seen:
                structure = structures[last_seen.pop()]
                # Don't do a rewrite if it's just a no-op...
                if structure.original_id != structure.previous_id:
                    set_parent_to_original.add(structure.id)

        # Sort the items in the ChangePlan. This might not be helpful, but I'm
        # hoping that it will keep disk changes more localized and not thrash
        # things as much as randomly distributed deletes. Mongo ObjectIDs are
        # ordered (they have a timestamp component).
        change_plan = cls(
            delete=sorted(structures.keys() - structure_ids_to_save),
            update_parents=sorted(
                (s_id, structures[s_id].original_id)
                for s_id in set_parent_to_original
            )
        )

        if details_file:
            change_plan.write_details(
                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
            )

        if dump_structures:
            for sid in structures:
                save = sid in structure_ids_to_save
                active = sid in active_structure_ids
                relink = sid in set_parent_to_original
                prev_missing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                LOG.info(f"DUMP id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_missing}, rewrite_previous_to_original: {relink}")

        # Diagnostics for IDs we wanted to keep but couldn't find (only
        # reachable when ignore_missing=True or there are none).
        for missing_structure_id in missing_structure_ids:
            LOG.error(f"Missing structure ID: {missing_structure_id}")

            # Structures whose parent is the missing one tell us which course
            # histories are affected (via their shared original).
            original_ids = set()
            for structure in structures.values():
                if structure.previous_id == missing_structure_id:
                    LOG.info(f"Structure {structure.id} points to missing structure with ID: {structure.previous_id}")
                    original_ids.add(structure.original_id)

            branches_to_log = []

            LOG.info(f"Looking for branches that lead to missing ID {missing_structure_id}")
            for branch in branches:
                structure = structures[branch.structure_id]
                if structure.original_id in original_ids:
                    for sid in structures_graph.traverse_ids(branch.structure_id):
                        if sid not in structures:
                            branches_to_log.append(branch)
                            # One hit is enough; avoid logging the same
                            # branch once per missing ancestor.
                            break

            for branch in branches_to_log:
                LOG.info(f"Branch: {branch}")

                for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                    if sid in structures:
                        save = sid in structure_ids_to_save
                        active = sid in active_structure_ids
                        relink = sid in set_parent_to_original
                        prev_missing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                        LOG.info(f"id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_missing}, rewrite_previous_to_original: {relink}")

        return change_plan

    @staticmethod
    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
        """
        Simple dump of the changes we're going to make to the database.

        This method requires information that we don't actually keep in the
        ChangePlan file, such as the Course IDs and edit times. Because of this,
        it can only be created at the time the ChangePlan is being generated,
        and cannot be derived from an existing ChangePlan. The goal was to
        provide this debug information while keeping the ChangePlan file format
        as stupidly simple as possible.
        """
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}

        def text_for(s_id):
            """Helper method to format Structures consistently."""
            action = "+" if s_id in structure_ids_to_save else "-"
            notes = []
            if s_id in active_structure_ids:
                notes.append("(active)")
            if s_id in set_parent_to_original:
                notes.append("(re-link to original)")
            if s_id in structures and structures[s_id].is_original():
                notes.append("(original)")

            if notes:
                return "{} {} {}".format(action, s_id, " ".join(notes))

            return "{} {}".format(action, s_id)

        print("== Summary ==", file=details_file)
        print("Active Version Branches: {}".format(len(branches)), file=details_file)
        print("Total Structures: {}".format(len(structures)), file=details_file)
        print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file)
        print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file)
        print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file)
        print("\n== Active Versions ==", file=details_file)

        for branch in branches:
            print("{}".format(branch), file=details_file)
            for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                print(text_for(structure_id), file=details_file)
            print("", file=details_file)

        LOG.info(
            "Wrote Change Details File: %s", os.path.realpath(details_file.name)
        )
|
||||
|
||||
|
||||
class SplitMongoBackend:
    """
    Interface to the MongoDB backend. This is currently the only supported KV
    store for the Split(DraftVersioning)ModuleStore, but having this as a
    separate class makes it easier to stub in test data.

    The methods on this class should accept and return backend-agnostic data
    structures, so no BSON details should leak out.
    """
    def __init__(self, mongo_connection_str, db_name):
        # Tight connect/select timeouts so misconfiguration fails fast, but a
        # long socket timeout because batch deletes/scans can be slow.
        self._db = MongoClient(
            mongo_connection_str,
            connectTimeoutMS=2000,
            socketTimeoutMS=300000,  # *long* operations
            serverSelectionTimeoutMS=2000
        )
        self._active_versions = self._db[db_name].modulestore.active_versions
        self._structures = self._db[db_name].modulestore.structures

    def structures_graph(self, delay, batch_size):
        """
        Return StructuresGraph for the entire modulestore.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.

        This has one slight complication. A StructuresGraph is expected to be a
        consistent view of the database, but MongoDB doesn't offer a "repeatable
        read" transaction isolation mode. That means that Structures may be
        added at any time between our database calls. Because of this, we have
        to be careful in stitching together something that is safe. The
        guarantees we try to make about the StructuresGraph being returned are:

        1. Every Structure ID in `active_structure_ids` is also in `structures`
        2. If `branches` is stale and there is a new Structure that is Active
           in the database, it is *not* in `structures`.

        Scenario A: We fetch branches, then structures
          1. Get Branches (and thus Active Structure IDs)
          2. New Structures created by Studio
          3. Get all Structures

        It is almost certainly the case that the new Structures created in (2)
        should be active. Our algorithm works by starting from the Active
        Structure IDs that we know about, making a "save" list, and then
        deleting all other Structures. The problem in this scenario is that we
        fetch the new Structures in (3), but we don't know that they're Active
        because our `active_structure_ids` comes from (1) and is stale. So we
        would in fact delete what should be Active Structures.

        Scenario B: We fetch structures, then branches
          1. Get all Structures
          2. New Structures created by Studio
          3. Get Branches (and thus Active Structure IDs)

        In this scenario, we may see Active Structure IDs that are not in
        our Structures dict. This is bad because we won't know how to crawl
        their ancestry and mark the appropriate Structure IDs to be saved.

        So the approach we take is Scenario B with a fallback. After we fetch
        everything, we go through the Active Structure IDs and make sure that
        those Structures and their ancestors exist in `structures`. If they
        don't, we make extra fetches to get them. Misses should be rare, so it
        shouldn't have a drastic performance impact overall.

        Note that it's safe if the ChangePlan as a whole is a little stale, so
        long as it's internally consistent. We only ever delete Structures that
        are in the `structures` doc, so a new Active Version that we're
        completely unaware of will be left alone.
        """
        # Scenario B ordering (see docstring): structures first, branches
        # second -- then patch up any Actives that structures missed.
        structures = self._all_structures(delay, batch_size)
        branches = self._all_branches()

        # Guard against the race condition that branch.structure_id or its
        # ancestors are not in `structures`. Make sure that we add those.
        LOG.info(
            "Checking for missing Structures (a small number are expected "
            "unless edits are disabled during change plan creation)."
        )
        missing_count = 0
        for branch in branches:
            structure_id = branch.structure_id
            # Walk the previous_id chain, fetching each ancestor until we hit
            # one we already have (or an Original, whose previous_id is None).
            while structure_id and (structure_id not in structures):
                structures[structure_id] = self._get_structure(structure_id)
                missing_count += 1
                LOG.warning(
                    "Structure %s linked from Active Structure %s (%s) fetched.",
                    structure_id,
                    branch.structure_id,
                    branch.key,
                )
                structure_id = structures[structure_id].previous_id

        LOG.info("Finished checking for missing Structures, found %s", missing_count)

        return StructuresGraph(branches, structures)

    def _all_structures(self, delay, batch_size):
        """
        Return a dict mapping Structure IDs to Structures for all Structures in
        the database.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        LOG.info("Fetching all known Structures (this might take a while)...")
        LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size)

        # Important to keep this as a generator to limit memory usage.
        parsed_docs = (
            self.parse_structure_doc(doc)
            for doc
            in self._structures_from_db(delay, batch_size)
        )
        structures = {structure.id: structure for structure in parsed_docs}
        LOG.info("Fetched %s Structures", len(structures))

        return structures

    def _structures_from_db(self, delay, batch_size):
        """
        Iterate through all Structure documents in the database.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        # Projection keeps the transfer small: we only need the two version
        # pointers (plus _id, which Mongo always returns).
        cursor = self._structures.find(
            projection=['original_version', 'previous_version']
        )
        cursor.batch_size(batch_size)
        for i, structure_doc in enumerate(cursor, start=1):
            yield structure_doc
            if i % batch_size == 0:
                # Sleep once per batch boundary to throttle load on the DB.
                LOG.info("Structure Cursor at %s (%s)", i, structure_doc['_id'])
                time.sleep(delay)

    def _all_branches(self):
        """Retrieve list of all ActiveVersionBranch objects in the database."""
        branches = []
        LOG.info("Fetching all Active Version Branches...")

        for av_doc in self._active_versions.find():
            # Each Active Version doc fans out into one entry per branch
            # ("draft-branch"/"published-branch" for courses, "library" for
            # content libraries).
            for branch, obj_id in av_doc['versions'].items():
                structure_id = str(obj_id)
                if branch == 'library':
                    key = LibraryLocator(av_doc['org'], av_doc['course'])
                else:
                    key = CourseLocator(av_doc['org'], av_doc['course'], av_doc['run'])

                branches.append(
                    ActiveVersionBranch(
                        str(av_doc['_id']),
                        branch,
                        structure_id,
                        key,
                        av_doc['edited_on'],
                    )
                )

        LOG.info("Fetched %s Active Version Branches", len(branches))

        return sorted(branches)

    def _get_structure(self, structure_id):
        """Get an individual Structure from the database."""
        structure_doc = self._structures.find_one(
            {'_id': ObjectId(structure_id)},
            projection=['original_version', 'previous_version']
        )
        return self.parse_structure_doc(structure_doc)

    def update(self, change_plan, delay=1000, batch_size=1000, start=None):
        """
        Update the backend according to the relinking and deletions specified in
        the change_plan.

        NOTE(review): the default `delay=1000` is documented elsewhere in this
        class as *seconds* between batches -- confirm callers always pass an
        explicit value, as 1000s per batch looks unintentionally large.
        """
        # Step 1: Relink - Change the previous pointer for the oldest structure
        # we want to keep, so that it points back to the original. We never
        # delete the original. Relinking happens before deletion so that we
        # never leave our course in a broken state (at worst, parts of it
        # become unreachable).
        self._update_parents(change_plan.update_parents, delay, batch_size)

        # Step 2: Delete unused Structures
        self._delete(change_plan.delete, delay, batch_size, start)

    def _update_parents(self, id_parent_pairs, delay, batch_size):
        """
        Update Structure parent relationships.

        `id_parent_pairs` is a list of tuples, where the first element of each
        tuple is a Structure ID (str) to target, and the second element is the
        Structure ID that will be the new parent of the first element.
        """
        for id_parent_pairs_batch in self.batch(id_parent_pairs, batch_size):
            updates = [
                UpdateOne(
                    {'_id': ObjectId(structure_id)},
                    {'$set': {'previous_version': ObjectId(previous_id)}}
                )
                for structure_id, previous_id in id_parent_pairs_batch
            ]
            result = self._structures.bulk_write(updates)
            LOG.info(
                "Updated %s/%s parent relationships.",
                result.bulk_api_result['nModified'],
                result.bulk_api_result['nMatched'],
            )
            time.sleep(delay)

    def _delete(self, structure_ids, delay, batch_size, start=None):
        """
        Delete old structures in batches.

        `structure_ids` is a list of Structure IDs to delete.
        `delay` is the delay in seconds (floats are ok) between batch deletes.
        `batch_size` is how many we try to delete in each batch statement.
        """
        # `start` allows resuming a previously interrupted deletion run.
        s_ids_with_offset = self.iter_from_start(structure_ids, start)
        for structure_ids_batch in self.batch(s_ids_with_offset, batch_size):
            result = self._structures.delete_many(
                {
                    '_id': {
                        '$in': [ObjectId(s_id) for s_id in structure_ids_batch]
                    }
                }
            )
            LOG.info(
                "Deleted %s/%s Structures: %s - %s",
                result.deleted_count,
                len(structure_ids_batch),
                structure_ids_batch[0],
                structure_ids_batch[-1],
            )
            time.sleep(delay)

    @staticmethod
    def parse_structure_doc(structure_doc):
        """
        Structure docs are pretty big, but we only care about three top level
        fields, all of which are ObjectIds:

        _id: The Structure ID

        previous_version: The Structure ID for the parent. An Original
                          Structure will have None for this field.

        original_version: The Original Structure that this Structure and all
                          its ancestors are ultimately derived from. An
                          Original Structure points to itself with this field.
        """
        _id = str(structure_doc['_id'])
        original_id = str(structure_doc['original_version'])
        previous_id = structure_doc['previous_version']
        if previous_id is not None:
            previous_id = str(previous_id)
        return Structure(_id, original_id, previous_id)

    @staticmethod
    def batch(iterable, batch_size):
        """Yield lists of up to `batch_size` in length from `iterable`."""
        iterator = iter(iterable)
        curr_batch = []
        for i in count(1):
            try:
                curr_batch.append(next(iterator))
                if i % batch_size == 0:
                    yield curr_batch
                    curr_batch = []
            except StopIteration:
                break
        # Flush the final partial batch, if any.
        if curr_batch:
            yield curr_batch

    @staticmethod
    def iter_from_start(structure_ids, start=None):
        """
        Yields from an iterable once it encounters the `start` value. If `start`
        is None, just yields from the beginning.
        """
        if start is None:
            for structure_id in structure_ids:
                yield structure_id
            return

        # NOTE(review): this compares with `<`, so it assumes `structure_ids`
        # is sorted ascending (ChangePlan.delete is sorted) -- confirm for any
        # other caller.
        for structure_id in structure_ids:
            if structure_id < start:
                continue
            yield structure_id
|
||||
Reference in New Issue
Block a user