Files
edx-platform/scripts/structures_pruning/utils/splitmongo.py
Muhammad Farhan Khan 7808913916 chore: Moved structures.py from tubular repository (#34328)
* chore: Moved structures.py from tubular repository
2024-03-12 09:46:34 -04:00

680 lines
30 KiB
Python

"""
This module provides logic to clean up old, unused course content data for the
DraftVersioningModuleStore modulestore, more commonly referred to as the "Split
Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
SplitMongoModuleStore). All courses and assets that have newer style locator
keys use DraftVersioningModuleStore. These keys start with "course-v1:",
"ccx-v1:", or "block-v1:".
The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This
code does not address that modulestore in any way. That modulestore handles
courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as
well as assets starting with "i4x://".
"Split" gets its name from the fact that it separates the Structure of a course
from the content in the leaf nodes. In theory, the Structure is an outline of
the course that contains all the parent/child relations for different content
blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as
small, commonly inherited metadata like due dates. More detailed information
about any particular block of content is stored in a separate collection as
Definitions.
Both Structures and Definitions are immutable in Split. When a course is edited,
a new Structure is created, and the Active Versions entry for a course is
updated to point to that new Structure. In that way, we never get a partially
applied edit -- it either succeeds or fails atomically. The Active Versions
entry for a Course has pointers to "published" and "draft" Structures. There is
also a special "library" pointer that is only used by Content Libraries. We do
not need to distinguish between these for the purposes of cleanup.
The problem is that Structure documents have become far larger than they were
intended to be, and we never created code to properly clean them up. As such, it
is not uncommon for the majority of Mongo storage space to be used by old
Structure documents that are completely unused (and are unreachable) by LMS or
Studio.
This module provides cleanup functionality with various tweakable options for
how much history to preserve. For simplicity, it reads all Structure IDs into
memory instead of working on subsets of the data. As a practical matter, this
means that it will work for databases with up to about 10 million Structures
before RAM usage starts to become a problem.
"""
from collections import deque, namedtuple
from itertools import count, takewhile
import json
import logging
import os
import sys
import time
from bson.objectid import ObjectId
from pymongo import MongoClient, UpdateOne
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
# Module-level logger shared by every class in this file. The logger name is
# the fixed string 'structures' rather than being derived from __name__.
LOG = logging.getLogger('structures')
class StructuresGraph(namedtuple('DatabaseSummary', 'branches structures')):
    """
    Snapshot of all Structure relationships found in a database.

    Every Structure is one saved state of a Course or Content Library. Within
    a branch ("published", "draft", or "library") the Structures form a chain
    that begins at an Original and finishes at the Active Structure::

        Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

    `branches` is a list of ActiveVersionBranch objects describing what is
    currently live on the LMS and Studio. The Active Structures they reference
    must never be removed, because doing so would break the site for users.

    `structures` is a dict mapping Structure IDs (strings) to Structure
    objects. Structures refer to their parent and original by ID instead of
    holding direct object references, partly because the vast majority of the
    graph never needs to be traversed. See `ChangePlan` for the reasoning.
    """

    def traverse_ids(self, start_id, limit=None, include_start=False):
        """
        Walk the `previous_id` chain backwards from `start_id`, yielding IDs.

        At most `limit` parent links are followed; with `limit=None` the walk
        continues until the Original has been yielded. When `include_start`
        is true, `start_id` itself is yielded first (and does not count
        towards `limit`).

        A `previous_id` that is not a key of `structures` is still yielded,
        but the walk ends there because it cannot be followed any further.
        """
        if include_start:
            yield start_id

        known = self.structures
        hops = 0
        node_id = start_id
        while node_id in known and (limit is None or hops < limit):
            node_id = known[node_id].previous_id
            if node_id is None:
                # We just stepped past the Original; nothing older exists.
                break
            yield node_id
            hops += 1
class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
    """
    One branch of an Active Version document.

    A single Active Version document can point to several branches (e.g.
    "published", "draft"); each instance of this class represents exactly one
    of them.

    `branch` is "draft-branch", "published-branch", or "library". All Courses
    have a draft-branch and a published-branch. Content Libraries have only a
    "library" branch.

    `key` is the Opaque Key for the Course or Library. It is carried mostly
    for debugging purposes and is not a part of the plan file.

    `edited_on` is a timestamp showing the last time the Active Version
    document was modified -- for a Course, this means when *either* the
    published-branch or draft-branch was most recently modified. It is not
    used for pruning; it only provides debug information.
    """

    def __str__(self):
        """Return a one-line, human-readable description of this branch."""
        edited = self.edited_on.strftime('%Y-%m-%d %H:%M:%S')
        return f"Active Version {self.id} [{edited}] {self.branch} for {self.key}"
class Structure(namedtuple('Structure', 'id original_id previous_id')):
    """
    The parts of a SplitMongo Structure document that we care about, namely
    the ID (str'd version of the ObjectID), and the IDs of the Original and
    Previous structure documents. The `previous_id` is None for an Original
    Structure, which has no parent.

    We use a namedtuple for this specifically because it's more space
    efficient than a dict, and we can have millions of Structures.
    """

    # Without an empty __slots__, a namedtuple *subclass* gives every instance
    # room for a per-instance __dict__, which undoes the memory savings this
    # class exists for.
    __slots__ = ()

    def is_original(self):
        """Is this Structure an original (i.e. should never be deleted)?"""
        return self.previous_id is None
class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
    """
    Summary of the pruning actions we want a Backend to take.

    The idea of having this data structure and being able to serialize it is
    so that we can save our plan of action somewhere for debugging, failure
    recovery, and batching updates.

    `delete` is a list of Structure IDs we want to delete.

    `update_parents` is a list of (structure_id, new_previous_id) tuples
    representing the previous_id updates we need to make.

    A ChangePlan is purely declarative. It is the responsibility of the
    Backend to figure out how to implement a ChangePlan safely and efficiently
    in order to do the actual updates.
    """

    def dump(self, file_obj):
        """
        Serialize this ChangePlan to `file_obj` as JSON.

        `file_obj` must be a writable text file object that has a `name`
        attribute, because the path is resolved and logged afterwards.
        """
        json.dump(
            {
                "delete": self.delete,
                "update_parents": self.update_parents,
            },
            file_obj,
            indent=2,
        )
        LOG.info(
            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
            os.path.realpath(file_obj.name),
            len(self.delete),
            len(self.update_parents)
        )

    @classmethod
    def load(cls, file_obj):
        """
        Load a ChangePlan from a JSON file. Takes a file object.

        Note that JSON has no tuple type, so the `update_parents` pairs come
        back as two-element lists.
        """
        data = json.load(file_obj)
        return cls(
            delete=data["delete"], update_parents=data["update_parents"]
        )

    @classmethod
    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
        """
        Given a StructuresGraph and a target number for intermediate Structures
        to preserve, return a ChangePlan that represents the changes needed to
        prune the database. The overall strategy is to iterate through all
        Active Structures, walk back through the ancestors, and add all the
        Structure IDs we should save to a set. After we have our save set, we
        know that we can delete all other structures without worrying about
        whether those Structures are reachable or knowing what their
        relationships are. This keeps things simpler, and means that we should
        be more resilient to failures when pruning.

        Structure documents exist in chains of parent/child relationships,
        starting with an Original Structure, having some number of Intermediate
        Structures, and ending in an Active Structure::

            Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

        Pruning Rules:

        1. All Active Structures must be preserved, as those are being used by
           the LMS and Studio to serve course content.
        2. All Original Structures should be preserved, since those are used by
           the LMS and Studio to determine common shared ancestry between
           Structures.
        3. Up to `num_intermediate_structures` Intermediate Structures will be
           kept. These Structures are not actually used in edx-platform code,
           but they are sometimes used by developers to allow emergency reverts
           in course team support situations (e.g. someone accidentally wiped
           out their course with a bad import).
        4. The oldest preserved Intermediate Structure will be modified so that
           its `previous_id` is updated to point to the Original Structure. That
           way, we're not preserving references to the IDs of Structures that
           have been pruned.

        Other arguments:

        `ignore_missing`: when true, Structure IDs that are referenced but are
        not present in the graph are dropped from the save set and reported in
        the log. When false, any such missing ID logs an error and terminates
        the process via `sys.exit(1)`.

        `dump_structures`: when true, a one-line summary of every Structure in
        the graph is written to the log.

        `details_file`: optional writable file object; when given, a
        human-readable report is written to it (see `write_details`).
        """
        structure_ids_to_save = set()
        set_parent_to_original = set()
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}

        # Figure out which Structures to save...
        for branch in branches:
            # Anything that's actively being pointed to (is the head of a branch)
            # must be preserved. This is what's being served by Studio and LMS.
            active_structure_id = branch.structure_id
            structure_ids_to_save.add(active_structure_id)

            # All originals will be saved. The Active Structure itself may be
            # absent from the graph; in that case its ID is picked up by the
            # missing-structure handling below instead of raising a KeyError.
            active_structure = structures.get(active_structure_id)
            if active_structure is not None:
                structure_ids_to_save.add(active_structure.original_id)

            # Save up to `num_intermediate_structures` intermediate nodes
            structure_ids_to_save.update(
                structures_graph.traverse_ids(
                    active_structure_id, limit=num_intermediate_structures
                )
            )

        missing_structure_ids = structure_ids_to_save - structures.keys()

        if ignore_missing:
            # Remove missing structures since we can't save them
            structure_ids_to_save -= missing_structure_ids
        elif missing_structure_ids:
            LOG.error("Missing structures detected")
            sys.exit(1)

        # Figure out what links to rewrite -- the oldest structure to save that
        # isn't an original.
        for branch in branches:
            rewrite_candidates = takewhile(
                lambda s: s in structure_ids_to_save and not structures[s].is_original(),
                structures_graph.traverse_ids(branch.structure_id, include_start=True)
            )
            # `last_seen` will have the last structure_id from the
            # `rewrite_candidates` iterable.
            last_seen = deque(rewrite_candidates, 1)
            if last_seen:
                structure = structures[last_seen.pop()]
                # Don't do a rewrite if it's just a no-op...
                if structure.original_id != structure.previous_id:
                    set_parent_to_original.add(structure.id)

        # Sort the items in the ChangePlan. This might not be helpful, but I'm
        # hoping that it will keep disk changes more localized and not thrash
        # things as much as randomly distributed deletes. Mongo ObjectIDs are
        # ordered (they have a timestamp component).
        change_plan = cls(
            delete=sorted(structures.keys() - structure_ids_to_save),
            update_parents=sorted(
                (s_id, structures[s_id].original_id)
                for s_id in set_parent_to_original
            )
        )

        if details_file:
            change_plan.write_details(
                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
            )

        if dump_structures:
            for sid in structures:
                LOG.info(
                    "DUMP %s",
                    cls._structure_summary(
                        sid, structures, structure_ids_to_save, active_structure_ids, set_parent_to_original
                    ),
                )

        for missing_structure_id in missing_structure_ids:
            cls._log_missing_structure(
                missing_structure_id,
                structures_graph,
                structure_ids_to_save,
                active_structure_ids,
                set_parent_to_original,
            )

        return change_plan

    @staticmethod
    def _structure_summary(sid, structures, structure_ids_to_save, active_structure_ids, set_parent_to_original):
        """Return the one-line diagnostic description of Structure `sid` used in the logs."""
        structure = structures[sid]
        prev_missing = structure.previous_id is not None and structure.previous_id not in structures
        return (
            f"id: {sid}, original_id: {structure.original_id}, "
            f"previous_id: {structure.previous_id}, "
            f"save: {sid in structure_ids_to_save}, "
            f"active: {sid in active_structure_ids}, "
            f"prev_missing: {prev_missing}, "
            f"rewrite_previous_to_original: {sid in set_parent_to_original}"
        )

    @classmethod
    def _log_missing_structure(
        cls, missing_structure_id, structures_graph, structure_ids_to_save, active_structure_ids, set_parent_to_original
    ):
        """Log which Structures and branches lead to the missing Structure `missing_structure_id`."""
        branches, structures = structures_graph
        LOG.error("Missing structure ID: %s", missing_structure_id)

        # Collect the Originals of every Structure whose parent is the missing one.
        original_ids = set()
        for structure in structures.values():
            if structure.previous_id == missing_structure_id:
                LOG.info(
                    "Structure %s points to missing structure with ID: %s",
                    structure.id,
                    structure.previous_id,
                )
                original_ids.add(structure.original_id)

        LOG.info("Looking for branches that lead to missing ID %s", missing_structure_id)
        branches_to_log = []
        for branch in branches:
            structure = structures.get(branch.structure_id)
            if structure is None:
                # The head of the branch is itself absent from the graph, so
                # there is no ancestry to inspect; report the branch only when
                # its head is the ID currently being diagnosed.
                if branch.structure_id == missing_structure_id:
                    branches_to_log.append(branch)
                continue
            if structure.original_id in original_ids:
                for sid in structures_graph.traverse_ids(branch.structure_id):
                    if sid not in structures:
                        branches_to_log.append(branch)

        for branch in branches_to_log:
            LOG.info("Branch: %s", branch)
            for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                if sid in structures:
                    LOG.info(
                        "%s",
                        cls._structure_summary(
                            sid, structures, structure_ids_to_save, active_structure_ids, set_parent_to_original
                        ),
                    )

    @staticmethod
    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
        """
        Simple dump of the changes we're going to make to the database.

        This method requires information that we don't actually keep in the
        ChangePlan file, such as the Course IDs and edit times. Because of this,
        it can only be created at the time the ChangePlan is being generated,
        and cannot be derived from an existing ChangePlan. The goal was to
        provide this debug information while keeping the ChangePlan file format
        as stupidly simple as possible.

        `details_file` must be a writable text file object with a `name`
        attribute (the resolved path is logged once the report is written).
        """
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}

        def text_for(s_id):
            """Helper method to format Structures consistently."""
            # "+" marks a Structure that is kept, "-" one that is not.
            action = "+" if s_id in structure_ids_to_save else "-"
            notes = []
            if s_id in active_structure_ids:
                notes.append("(active)")
            if s_id in set_parent_to_original:
                notes.append("(re-link to original)")
            if s_id in structures and structures[s_id].is_original():
                notes.append("(original)")

            if notes:
                return "{} {} {}".format(action, s_id, " ".join(notes))

            return "{} {}".format(action, s_id)

        print("== Summary ==", file=details_file)
        print("Active Version Branches: {}".format(len(branches)), file=details_file)
        print("Total Structures: {}".format(len(structures)), file=details_file)
        print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file)
        print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file)
        print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file)

        print("\n== Active Versions ==", file=details_file)
        for branch in branches:
            print("{}".format(branch), file=details_file)
            for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                print(text_for(structure_id), file=details_file)
            print("", file=details_file)

        LOG.info(
            "Wrote Change Details File: %s", os.path.realpath(details_file.name)
        )
class SplitMongoBackend:
    """
    Interface to the MongoDB backend. This is currently the only supported KV
    store for the Split(DraftVersioning)ModuleStore, but having this as a
    separate class makes it easier to stub in test data.

    The methods on this class should accept and return backend-agnostic data
    structures, so no BSON details should leak out.
    """

    def __init__(self, mongo_connection_str, db_name):
        """
        Connect to MongoDB and bind the two collections this class works on.

        `mongo_connection_str` is passed straight to `MongoClient`.
        `db_name` is the database holding the `modulestore.active_versions`
        and `modulestore.structures` collections.
        """
        # Despite the attribute name, this is the MongoClient itself; the
        # database is selected by indexing it with `db_name` below.
        self._db = MongoClient(
            mongo_connection_str,
            connectTimeoutMS=2000,
            socketTimeoutMS=300000,  # *long* operations
            serverSelectionTimeoutMS=2000
        )
        self._active_versions = self._db[db_name].modulestore.active_versions
        self._structures = self._db[db_name].modulestore.structures

    def structures_graph(self, delay, batch_size):
        """
        Return StructuresGraph for the entire modulestore.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.

        This has one slight complication. A StructuresGraph is expected to be a
        consistent view of the database, but MongoDB doesn't offer a "repeatable
        read" transaction isolation mode. That means that Structures may be
        added at any time between our database calls. Because of this, we have
        to be careful in stitching together something that is safe. The
        guarantees we try to make about the StructuresGraph being returned are:

        1. Every Structure ID in `active_structure_ids` is also in `structures`
        2. If `branches` is stale and there is a new Structure that is Active
           in the database, it is *not* in `structures`.

        Scenario A: We fetch branches, then structures
          1. Get Branches (and thus Active Structure IDs)
          2. New Structures created by Studio
          3. Get all Structures

        It is almost certainly the case that the new Structures created in (2)
        should be active. Our algorithm works by starting from the Active
        Structure IDs that we know about, making a "save" list, and then
        deleting all other Structures. The problem in this scenario is that we
        fetch the new Structures in (3), but we don't know that they're Active
        because our `active_structure_ids` comes from (1) and is stale. So we
        would in fact delete what should be Active Structures.

        Scenario B: We fetch structures, then branches
          1. Get all Structures
          2. New Structures created by Studio
          3. Get Branches (and thus Active Structure IDs)

        In this scenario, we may see Active Structure IDs that are not in
        our Structures dict. This is bad because we won't know how to crawl
        their ancestry and mark the appropriate Structure IDs to be saved.

        So the approach we take is Scenario B with a fallback. After we fetch
        everything, we go through the Active Structure IDs and make sure that
        those Structures and their ancestors exist in `structures`. If they
        don't, we make extra fetches to get them. Misses should be rare, so it
        shouldn't have a drastic performance impact overall.

        Note that it's safe if the ChangePlan as a whole is a little stale, so
        long as it's internally consistent. We only ever delete Structures that
        are in the `structures` doc, so a new Active Version that we're
        completely unaware of will be left alone.

        If one of those extra fetches finds no document at all, an error is
        logged and that ID is left out of `structures`, so guarantee (1) does
        not hold for it; consumers of the graph then see it as a missing
        Structure.
        """
        structures = self._all_structures(delay, batch_size)
        branches = self._all_branches()

        # Guard against the race condition that branch.structure_id or its
        # ancestors are not in `structures`. Make sure that we add those.
        LOG.info(
            "Checking for missing Structures (a small number are expected "
            "unless edits are disabled during change plan creation)."
        )
        missing_count = 0
        for branch in branches:
            structure_id = branch.structure_id
            while structure_id and (structure_id not in structures):
                structure = self._get_structure(structure_id)
                if structure is None:
                    # A dangling reference: the document does not exist, so
                    # there is nothing to add and no parent link to follow.
                    LOG.error(
                        "Structure %s linked from Active Structure %s (%s) "
                        "does not exist in the database.",
                        structure_id,
                        branch.structure_id,
                        branch.key,
                    )
                    break
                structures[structure_id] = structure
                missing_count += 1
                LOG.warning(
                    "Structure %s linked from Active Structure %s (%s) fetched.",
                    structure_id,
                    branch.structure_id,
                    branch.key,
                )
                structure_id = structure.previous_id

        LOG.info("Finished checking for missing Structures, found %s", missing_count)

        return StructuresGraph(branches, structures)

    def _all_structures(self, delay, batch_size):
        """
        Return a dict mapping Structure IDs to Structures for all Structures in
        the database.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        LOG.info("Fetching all known Structures (this might take a while)...")
        LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size)

        # Important to keep this as a generator to limit memory usage.
        parsed_docs = (
            self.parse_structure_doc(doc)
            for doc
            in self._structures_from_db(delay, batch_size)
        )
        structures = {structure.id: structure for structure in parsed_docs}
        LOG.info("Fetched %s Structures", len(structures))

        return structures

    def _structures_from_db(self, delay, batch_size):
        """
        Iterate through all Structure documents in the database.

        `batch_size` is the number of structure documents we pull at a time.
        `delay` is the delay in seconds between batch queries.
        """
        # Only the two link fields are projected, which keeps the large
        # Structure bodies from being transferred.
        cursor = self._structures.find(
            projection=['original_version', 'previous_version']
        )
        cursor.batch_size(batch_size)
        for i, structure_doc in enumerate(cursor, start=1):
            yield structure_doc
            if i % batch_size == 0:
                # Pause after every full batch to throttle load on the server.
                LOG.info("Structure Cursor at %s (%s)", i, structure_doc['_id'])
                time.sleep(delay)

    def _all_branches(self):
        """Retrieve list of all ActiveVersionBranch objects in the database."""
        branches = []
        LOG.info("Fetching all Active Version Branches...")

        for av_doc in self._active_versions.find():
            for branch, obj_id in av_doc['versions'].items():
                structure_id = str(obj_id)
                if branch == 'library':
                    key = LibraryLocator(av_doc['org'], av_doc['course'])
                else:
                    key = CourseLocator(av_doc['org'], av_doc['course'], av_doc['run'])

                branches.append(
                    ActiveVersionBranch(
                        str(av_doc['_id']),
                        branch,
                        structure_id,
                        key,
                        av_doc['edited_on'],
                    )
                )

        LOG.info("Fetched %s Active Version Branches", len(branches))

        # ActiveVersionBranch is a tuple, so this orders by document ID and
        # then by branch name.
        return sorted(branches)

    def _get_structure(self, structure_id):
        """
        Get an individual Structure from the database.

        Returns None when no document with that ID exists.
        """
        structure_doc = self._structures.find_one(
            {'_id': ObjectId(structure_id)},
            projection=['original_version', 'previous_version']
        )
        if structure_doc is None:
            return None
        return self.parse_structure_doc(structure_doc)

    def update(self, change_plan, delay=1000, batch_size=1000, start=None):
        """
        Update the backend according to the relinking and deletions specified in
        the change_plan.

        `delay` is passed unchanged to the batch helpers, which hand it to
        `time.sleep`, i.e. it is a number of seconds.
        `batch_size` is the number of Structures handled per database call.
        `start`, if given, skips deletion of Structure IDs that sort lower than
        it (see `iter_from_start`); it does not affect the relinking step.

        NOTE(review): the default `delay=1000` means 1000 *seconds* between
        batches, which looks like it was meant to be milliseconds -- confirm
        that every caller passes an explicit value.
        """
        # Step 1: Relink - Change the previous pointer for the oldest structure
        # we want to keep, so that it points back to the original. We never
        # delete the original. Relinking happens before deletion so that we
        # never leave our course in a broken state (at worst, parts of it
        # become unreachable).
        self._update_parents(change_plan.update_parents, delay, batch_size)

        # Step 2: Delete unused Structures
        self._delete(change_plan.delete, delay, batch_size, start)

    def _update_parents(self, id_parent_pairs, delay, batch_size):
        """
        Update Structure parent relationships.

        `id_parent_pairs` is a list of tuples, where the first element of each
        tuple is a Structure ID (str) to target, and the second element is the
        Structure ID that will be the new parent of the first element.
        `delay` is the delay in seconds between batches.
        `batch_size` is how many updates are sent in each bulk write.
        """
        for id_parent_pairs_batch in self.batch(id_parent_pairs, batch_size):
            updates = [
                UpdateOne(
                    {'_id': ObjectId(structure_id)},
                    {'$set': {'previous_version': ObjectId(previous_id)}}
                )
                for structure_id, previous_id in id_parent_pairs_batch
            ]
            result = self._structures.bulk_write(updates)
            LOG.info(
                "Updated %s/%s parent relationships.",
                result.bulk_api_result['nModified'],
                result.bulk_api_result['nMatched'],
            )
            time.sleep(delay)

    def _delete(self, structure_ids, delay, batch_size, start=None):
        """
        Delete old structures in batches.

        `structure_ids` is a list of Structure IDs to delete.
        `delay` is the delay in seconds (floats are ok) between batch deletes.
        `batch_size` is how many we try to delete in each batch statement.
        `start`, if given, skips every ID that sorts lower than it.
        """
        s_ids_with_offset = self.iter_from_start(structure_ids, start)
        for structure_ids_batch in self.batch(s_ids_with_offset, batch_size):
            result = self._structures.delete_many(
                {
                    '_id': {
                        '$in': [ObjectId(s_id) for s_id in structure_ids_batch]
                    }
                }
            )
            # The first and last ID of the batch are logged so that an
            # interrupted run can be resumed with `start`.
            LOG.info(
                "Deleted %s/%s Structures: %s - %s",
                result.deleted_count,
                len(structure_ids_batch),
                structure_ids_batch[0],
                structure_ids_batch[-1],
            )
            time.sleep(delay)

    @staticmethod
    def parse_structure_doc(structure_doc):
        """
        Structure docs are pretty big, but we only care about three top level
        fields, all of which are ObjectIds:

        _id: The Structure ID

        previous_version: The Structure ID for the parent. An Original
                          Structure will have None for this field.

        original_version: The Original Structure that this Structure and all
                          its ancestors are ultimately derived from. An
                          Original Structure points to itself with this field.

        Returns a Structure whose IDs are all strings (or None for the
        `previous_id` of an Original).
        """
        _id = str(structure_doc['_id'])
        original_id = str(structure_doc['original_version'])
        previous_id = structure_doc['previous_version']
        if previous_id is not None:
            previous_id = str(previous_id)
        return Structure(_id, original_id, previous_id)

    @staticmethod
    def batch(iterable, batch_size):
        """
        Yield lists of up to `batch_size` in length from `iterable`.

        Every yielded list is non-empty; only the final one may be shorter
        than `batch_size`. Raises ValueError (when iteration starts) if
        `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1, got {}".format(batch_size))

        curr_batch = []
        for item in iterable:
            curr_batch.append(item)
            if len(curr_batch) == batch_size:
                yield curr_batch
                curr_batch = []

        # Whatever is left over forms the final, shorter batch.
        if curr_batch:
            yield curr_batch

    @staticmethod
    def iter_from_start(structure_ids, start=None):
        """
        Yield the IDs in `structure_ids` that are not lower than `start`.

        If `start` is None, everything is yielded. Otherwise every ID that
        compares lower than `start` is skipped, wherever it appears. For a
        sorted input this is the same as resuming at the first ID that is
        greater than or equal to `start`.
        """
        for structure_id in structure_ids:
            if start is not None and structure_id < start:
                continue
            yield structure_id