chore: Moved structures.py from tubular repository (#34328)

* chore: Moved structures.py from tubular repository
This commit is contained in:
Muhammad Farhan Khan
2024-03-12 18:46:34 +05:00
committed by GitHub
parent da244a99d3
commit 7808913916
13 changed files with 1566 additions and 1 deletions

View File

@@ -0,0 +1,33 @@
name: units-test-scripts-common
on:
pull_request:
push:
branches:
- master
jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.8', '3.12' ]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r scripts/structures_pruning/requirements/testing.txt
- name: Run pytest
run: |
pytest scripts/structures_pruning

View File

@@ -142,7 +142,9 @@ REQ_FILES = \
requirements/edx/semgrep \
scripts/xblock/requirements \
scripts/user_retirement/requirements/base \
scripts/user_retirement/requirements/testing
scripts/user_retirement/requirements/testing \
scripts/structures_pruning/requirements/base \
scripts/structures_pruning/requirements/testing
define COMMON_CONSTRAINTS_TEMP_COMMENT
# This is a temporary solution to override the real common_constraints.txt\n# In edx-lint, until the pyjwt constraint in edx-lint has been removed.\n# See BOM-2721 for more details.\n# Below is the copied and edited version of common_constraints\n

View File

@@ -0,0 +1,73 @@
Structures Pruning Scripts
==========================
`This <https://github.com/openedx/edx-platform/tree/master/scripts/structures_pruning>`_ directory contains mongo db structures pruning script that is migrated from the
`tubular <https://github.com/openedx/tubular>`_ repository.
This script can be called from any automation/CD framework.
How to run the scripts
======================
Download the Scripts
--------------------
To download the scripts, you can perform a partial clone of the edx-platform repository to obtain only the required scripts. The following steps demonstrate how to achieve this. Alternatively, you may choose other utilities or libraries for the partial clone.
.. code-block:: bash
repo_url=git@github.com:openedx/edx-platform.git
branch=master
directory=scripts/structures_pruning
git clone --branch $branch --single-branch --depth=1 --filter=tree:0 $repo_url
cd edx-platform
git sparse-checkout init --cone
git sparse-checkout set $directory
Create Python Virtual Environment
---------------------------------
Create a Python virtual environment using Python 3.8:
.. code-block:: bash
python3.8 -m venv ../venv
source ../venv/bin/activate
Install Pip Packages
--------------------
Install the required pip packages using the provided requirements file:
.. code-block:: bash
pip install -r scripts/structures_pruning/requirements/base.txt
Execute Script
--------------
You can simply execute the Python scripts with the ``python`` command:
.. code-block:: bash
python scripts/structures_pruning/structures.py prune plan_file.json
Feel free to customize these steps according to your specific environment and requirements.
Run Test Cases
==============
Before running test cases, install the testing requirements:
.. code-block:: bash
pip install -r scripts/structures_pruning/requirements/testing.txt
Run the test cases using pytest:
.. code-block:: bash
pytest scripts/structures_pruning

View File

View File

View File

@@ -0,0 +1,4 @@
click
click-log
edx-opaque-keys
pymongo

View File

@@ -0,0 +1,24 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# make upgrade
#
click==8.1.7
# via
# -r scripts/structures_pruning/requirements/base.in
# click-log
click-log==0.4.0
# via -r scripts/structures_pruning/requirements/base.in
edx-opaque-keys==2.5.1
# via -r scripts/structures_pruning/requirements/base.in
pbr==6.0.0
# via stevedore
pymongo==3.13.0
# via
# -r scripts/structures_pruning/requirements/base.in
# edx-opaque-keys
stevedore==5.2.0
# via edx-opaque-keys
typing-extensions==4.10.0
# via edx-opaque-keys

View File

@@ -0,0 +1,4 @@
-r base.txt
ddt
pytest

View File

@@ -0,0 +1,44 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# make upgrade
#
click==8.1.7
# via
# -r scripts/structures_pruning/requirements/base.txt
# click-log
click-log==0.4.0
# via -r scripts/structures_pruning/requirements/base.txt
ddt==1.7.2
# via -r scripts/structures_pruning/requirements/testing.in
edx-opaque-keys==2.5.1
# via -r scripts/structures_pruning/requirements/base.txt
exceptiongroup==1.2.0
# via pytest
iniconfig==2.0.0
# via pytest
packaging==24.0
# via pytest
pbr==6.0.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# stevedore
pluggy==1.4.0
# via pytest
pymongo==3.13.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys
pytest==8.1.1
# via -r scripts/structures_pruning/requirements/testing.in
stevedore==5.2.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys
tomli==2.0.1
# via pytest
typing-extensions==4.10.0
# via
# -r scripts/structures_pruning/requirements/base.txt
# edx-opaque-keys

View File

@@ -0,0 +1,200 @@
#! /usr/bin/env python3
"""
Script to detect and prune old Structure documents from the "Split" Modulestore
MongoDB (edxapp.modulestore.structures by default). See docstring/help for the
"make_plan" and "prune" commands for more details.
"""
import logging
from os import path
import sys
import click
import click_log
# Add top-level project path to sys.path before importing scripts code
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
from scripts.structures_pruning.utils.splitmongo import SplitMongoBackend, ChangePlan
# Add top-level module path to sys.path before importing tubular code.
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# from tubular.splitmongo import ChangePlan, SplitMongoBackend # pylint: disable=wrong-import-position
LOG = logging.getLogger('structures')
click_log.basic_config(LOG)
@click.group()
@click.option(
    '--connection',
    default="mongodb://localhost:27017",
    help=(
        'Connection string to the target mongo database. This defaults to '
        'localhost without password (that will work against devstack). '
        'You may need to use urllib.parse.quote_plus() to percent-escape '
        'your username and password.'
    )
)
@click.option(
    '--database-name',
    default='edxapp',
    help='Name of the edX Mongo database containing the course structures to prune.'
)
@click.pass_context
def cli(ctx, connection, database_name):
    """
    Recover space on MongoDB for edx-platform by deleting unreachable,
    historical course content data. To use, first make a change plan with the
    "make_plan" command, and then execute that plan against the database with
    the "prune" command.

    This script provides logic to clean up old, unused course content data for
    the DraftVersioningModuleStore modulestore, more commonly referred to as the
    "Split Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
    SplitMongoModuleStore). All courses and assets that have newer style locator
    keys use DraftVersioningModuleStore. These keys start with "course-v1:",
    "ccx-v1:", or "block-v1:". Studio authored content data for this modulestore
    is saved as immutable data structures. The edx-platform code never cleans up
    old data however, meaning there is an unbounded history of a course's
    content revisions stored in MongoDB.

    The older modulestore is DraftModuleStore, sometimes called "Old Mongo".
    This code does not address that modulestore in any way. That modulestore
    handles courses that use the old "/" separator, such as
    "MITx/6.002x/2012_Spring", as well as assets starting with "i4x://".
    """
    # click may hand us a context with no obj yet; make sure there is a dict
    # to stash shared state in before subcommands run.
    if ctx.obj is None:
        ctx.obj = dict()
    # Shared database backend used by both the "make_plan" and "prune"
    # subcommands (retrieved via ctx.obj['BACKEND'] in each command).
    ctx.obj['BACKEND'] = SplitMongoBackend(connection, database_name)
@cli.command("make_plan")
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('w'))
@click.option(
    '--details',
    type=click.File('w'),
    default=None,
    help="Name of file to write the human-readable details of the Change Plan."
)
@click.option(
    '--retain',
    default=2,
    type=click.IntRange(0, None),
    help=("The maximum number of intermediate structures to preserve for any "
          "single branch of an active version. This value does not include the "
          "active or original structures (those are always preserved). Defaults "
          "to 2. Put 0 here if you want to prune as much as possible.")
)
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between queries to fetch structures from MongoDB "
          "during plan creation. Tune to adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=10000,
    type=click.IntRange(1, None),
    help="How many Structures do we fetch at a time?"
)
@click.option(
    '--ignore-missing/--no-ignore-missing',
    default=False,
    help=("Force plan creation, even if missing structures are found. "
          "Should repair invalid ids by repointing to original. "
          "Review of plan highly recommended")
)
@click.option(
    '--dump-structures/--no-dump-structures',
    default=False,
    # Fixed typo in user-facing help text: "strucutres" -> "structures".
    help="Dump all structures to stderr for debugging or recording state before cleanup."
)
@click.pass_context
def make_plan(ctx, plan_file, details, retain, delay, batch_size, ignore_missing, dump_structures):
    """
    Create a Change Plan JSON file describing the operations needed to prune the
    database. This command is read-only and does not alter the database.

    The Change Plan JSON is a dictionary with two keys:

    "delete" - A sorted array of Structure document IDs to delete. Since MongoDB
    object IDs are created in ascending order by timestamp, this means that the
    oldest documents come earlier in the list.

    "update_parents" - A list of [Structure ID, New Parent/Previous ID] pairs.
    This is used to re-link the oldest preserved Intermediate Structure back to
    the Original Structure, so that we don't leave the database in a state where
    a Structure's "previous_version" points to a deleted Structure.

    Specifying a --details file will generate a more verbose, human-readable
    text description of the Change Plan for verification purposes. The details
    file will only display Structures that are reachable from an Active Version,
    so any Structures that are "orphaned" as a result of partial runs of this
    script or Studio race conditions will not be reflected. That being said,
    orphaned Structures are detected and properly noted in the Change Plan JSON.
    """
    # --delay is milliseconds at the CLI, but the backend API expects seconds.
    structures_graph = ctx.obj['BACKEND'].structures_graph(delay / 1000.0, batch_size)
    # This will create the details file as a side-effect, if specified.
    change_plan = ChangePlan.create(structures_graph, retain, ignore_missing, dump_structures, details)
    change_plan.dump(plan_file)
@cli.command()
@click_log.simple_verbosity_option(default='INFO')
@click.argument('plan_file', type=click.File('r'))
@click.option(
    '--delay',
    default=15000,
    type=click.IntRange(0, None),
    help=("Delay in milliseconds between batch deletions during pruning. Tune to "
          "adjust load on the database.")
)
@click.option(
    '--batch-size',
    default=1000,
    type=click.IntRange(1, None),
    help=("How many Structures do we delete at a time? Tune to adjust load on "
          "the database.")
)
@click.option(
    '--start',
    default=None,
    help=("Structure ID to start deleting from. Specifying a Structure ID that "
          "is not in the Change Plan is an error. Specifying a Structure ID that "
          "has already been deleted is NOT an error, so it's safe to re-run.")
)
@click.pass_context
def prune(ctx, plan_file, delay, batch_size, start):
    """
    Prune the MongoDB database according to a Change Plan file.

    This command tries to be as safe as possible. It executes parent updates
    before deletes, so an interruption at any point should be safe in that it
    won't leave the structure graphs in an inconsistent state. It should also
    be safe to resume pruning with the same Change Plan in the event of an
    interruption.

    It's also safe to run while Studio is still operating, though you should be
    careful to test and tweak the delay and batch_size options to throttle load
    on your database.
    """
    change_plan = ChangePlan.load(plan_file)
    # Fail fast if --start names an ID the plan never scheduled for deletion;
    # resuming from such an ID would silently do the wrong thing.
    if start is not None and start not in change_plan.delete:
        raise click.BadParameter(
            "{} is not in the Change Plan {}".format(
                start, click.format_filename(plan_file.name)
            ),
            param_hint='--start'
        )
    # --delay is milliseconds at the CLI, but the backend API expects seconds.
    ctx.obj['BACKEND'].update(change_plan, delay / 1000.0, batch_size, start)
# Script entry point: dispatch to the click command group.
if __name__ == '__main__':
    # pylint doesn't grok click magic, but this is straight from their docs...
    cli(obj={})  # pylint: disable=no-value-for-parameter, unexpected-keyword-arg

View File

@@ -0,0 +1,502 @@
"""
Test Structure pruning related Split Mongo code.
IMPORTANT: If you are making changes to this code, please re-enable the
TestSplitMongoBackend tests and run them locally against the MongoDB instance
in your Docker Devstack. See the TestSplitMongoBackend docstring for more info.
"""
import itertools
import sys
import textwrap
import unittest
from datetime import datetime
from io import StringIO
from os import path
from unittest.mock import patch
import ddt
from bson.objectid import ObjectId
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
from pymongo import MongoClient
# Add top-level project path to sys.path before importing scripts code
sys.path.append(path.abspath(path.join(path.dirname(__file__), '../..')))
from scripts.structures_pruning.utils.splitmongo import (
ActiveVersionBranch, ChangePlan, Structure, SplitMongoBackend, StructuresGraph
)
def create_test_graph(*version_histories):
    """
    Given any number of lists, where each list represents a history of Structure
    IDs from oldest to newest, return a StructuresGraph matching that
    specification. Course names, branch names, and other attributes that exist
    for debugging/reporting but do not change pruning behavior will be
    automatically generated with plausible values.
    """
    all_structures = {}
    all_active_version_branches = []
    # Infinite pools of generated IDs/keys/branch names; each branch we build
    # draws the next value from each pool so every branch looks distinct.
    active_id_pool = ("A{:023x}".format(i) for i in itertools.count(1))
    course_key_pool = (
        CourseLocator('edx', 'splitmongo', str(i)) for i in itertools.count(1)
    )
    branch_pool = itertools.cycle(['draft-branch', 'published-branch'])
    for version_history in version_histories:
        assert version_history  # The history can't be empty
        structure_ids = [str(version) for version in version_history]
        # Create the Original (first ID in the history; it is its own original
        # and has no previous version).
        original_id = structure_ids[0]
        history = [Structure(original_id, original_id, None)]
        # Create all other Structures (if any), each chained to its predecessor.
        for previous_id, current_id in zip(structure_ids, structure_ids[1:]):
            history.append(Structure(current_id, original_id, previous_id))
        # Add to our overall Structures dict (overwrites should be identical or
        # our test data is bad).
        for structure in history:
            if structure.id in all_structures:
                assert structure == all_structures[structure.id]
            else:
                all_structures[structure.id] = structure
        # The newest Structure in the history becomes the branch's Active version.
        active_version_id = structure_ids[-1]
        all_active_version_branches.append(
            ActiveVersionBranch(
                id=next(active_id_pool),
                branch=next(branch_pool),
                structure_id=active_version_id,
                key=next(course_key_pool),
                edited_on=datetime(2012, 5, 2)
            )
        )
    return StructuresGraph(all_active_version_branches, all_structures)
@ddt.ddt
class TestCourseChangePlan(unittest.TestCase):
    """
    ChangePlans for single and multiple courses.

    Each test builds an in-memory StructuresGraph with create_test_graph() and
    checks the delete/update_parents output of ChangePlan.create -- no database
    connection is needed.
    """

    def test_simple(self):
        """Simple happy path ChangePlans."""
        graph = create_test_graph(["1", "2", "3", "4"])
        # Preserve no intermediate structures -- prune the middle structures.
        plan_no_intermediate = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan_no_intermediate.delete, ["2", "3"])
        self.assertEqual(plan_no_intermediate.update_parents, [("4", "1")])
        # Preserve one intermediate structure
        plan_1_intermediate = ChangePlan.create(graph, 1, False, False)
        self.assertEqual(plan_1_intermediate.delete, ["2"])
        self.assertEqual(plan_1_intermediate.update_parents, [("3", "1")])
        # Preserve two intermediate structures -- Do nothing
        plan_2_intermediate = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_2_intermediate.delete, [])
        self.assertEqual(plan_2_intermediate.update_parents, [])

    @ddt.data(
        create_test_graph(["1"]),  # Original (is also Active)
        create_test_graph(["1", "2"]),  # "1" = Original, "2" = Active
    )
    def test_no_changes(self, graph):
        """These scenarios should result in no Changes."""
        plan_1 = ChangePlan.create(graph, 0, False, False)
        plan_2 = ChangePlan.create(graph, 2, False, False)
        self.assertEqual(plan_1, plan_2)
        self.assertEqual(plan_1.delete, [])
        self.assertEqual(plan_1.update_parents, [])

    def test_overlapping_shared_history(self):
        """Test multiple branches that overlap in what history to preserve."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5"],
            ["1", "2", "3", "6"],
            ["1", "2", "7", "8", "9", "10"],
        )
        plan = ChangePlan.create(graph, 1, False, False)
        # We specified only one intermediate structure in each branch should be
        # preserved. So why do we only delete "7" and "8" here?
        # "1" is the original structure, and will always be preserved.
        # "2" is the intermediate structure preserved by the first branch. It
        # won't be deleted, even if other branches might want to flag it for
        # deletion.
        # "3" would be deleted by the second branch, but it's Active in the
        # first, and so is preserved. Active Structures are never deleted.
        # "4" is preserved by the second branch.
        # "5" is the Active Structure for the second branch.
        # "6" is the Active Structure for the third branch.
        # "7" is marked for deletion by the fourth branch.
        # "8" is marked for deletion by the fourth branch.
        # "9" is preserved by the fourth branch.
        # "10" is the Active Structure for the fourth branch.
        self.assertEqual(plan.delete, ["7", "8"])
        self.assertEqual(plan.update_parents, [("9", "1")])

    def test_non_overlapping_shared_history(self):
        """Test shared history, preserved intermediate set doesn't overlap."""
        graph = create_test_graph(
            ["1", "2", "3"],
            ["1", "2", "3", "4", "5", "6"],
        )
        plan = ChangePlan.create(graph, 0, False, False)
        self.assertEqual(plan.delete, ["2", "4", "5"])
        self.assertEqual(plan.update_parents, [("3", "1"), ("6", "1")])
        graph_save_1 = create_test_graph(
            ["1", "2", "3", "4"],
            ["1", "2", "3", "4", "5", "6", "7"],
        )
        plan_save_1 = ChangePlan.create(graph_save_1, 1, False, False)
        self.assertEqual(plan_save_1.delete, ["2", "5"])
        self.assertEqual(plan_save_1.update_parents, [("3", "1"), ("6", "1")])

    def test_details_output(self):
        """Test our details file output."""
        graph = create_test_graph(
            ["1"],
            ["2", "3"],
            ["4", "5", "6"]
        )
        buff = StringIO()
        buff.name = "test_file.txt"
        plan = ChangePlan.create(graph, 0, False, False, buff)
        details_txt = buff.getvalue()
        # NOTE(review): the leading whitespace inside this dedent'd literal must
        # match the details format ChangePlan.create writes byte-for-byte --
        # confirm against real output if this assertion fails on indentation.
        # pylint: disable=line-too-long
        expected_output = textwrap.dedent(
            """
            == Summary ==
            Active Version Branches: 3
            Total Structures: 6
            Structures to Save: 5
            Structures to Delete: 1
            Structures to Rewrite Parent Link: 1
            == Active Versions ==
            Active Version A00000000000000000000001 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+1
            + 1 (active) (original)
            Active Version A00000000000000000000002 [2012-05-02 00:00:00] published-branch for course-v1:edx+splitmongo+2
            + 3 (active)
            + 2 (original)
            Active Version A00000000000000000000003 [2012-05-02 00:00:00] draft-branch for course-v1:edx+splitmongo+3
            + 6 (active) (re-link to original)
            - 5
            + 4 (original)
            """
        ).lstrip()
        # pylint: enable=line-too-long
        self.assertEqual(expected_output, details_txt)
        self.assertEqual(
            plan,
            ChangePlan(
                delete=["5"],
                update_parents=[("6", "4")]
            )
        )
class TestSplitMongoBackendHelpers(unittest.TestCase):
    """
    Test the static helper methods of SplitMongoBackend.

    Requires no actual database connection.
    """

    def test_parse_structure_doc(self):
        """Test basic parsing of Structures."""
        # An Original structure points to itself as original_version and has no
        # previous_version; unrecognized document fields are dropped.
        original_structure = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(1),
                'original_version': obj_id(1),
                'previous_version': None,
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            original_structure,
            Structure(id=str_id(1), original_id=str_id(1), previous_id=None)
        )
        self.assertTrue(original_structure.is_original())
        other_structure = SplitMongoBackend.parse_structure_doc(
            {
                '_id': obj_id(2),
                'original_version': obj_id(1),
                'previous_version': obj_id(1),
                'extra_data': "This is ignored"
            }
        )
        self.assertEqual(
            other_structure,
            Structure(id=str_id(2), original_id=str_id(1), previous_id=str_id(1))
        )
        self.assertFalse(other_structure.is_original())

    def test_batch(self):
        """Test the batch helper that breaks up iterables for DB operations."""
        self.assertEqual(
            list(SplitMongoBackend.batch([], 1)),
            []
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3], 1)),
            [[1], [2], [3]]
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3], 2)),
            [[1, 2], [3]]
        )
        self.assertEqual(
            list(SplitMongoBackend.batch([1, 2, 3, 4], 2)),
            [[1, 2], [3, 4]]
        )

    def test_iter_from_start(self):
        """Test what we use to resume deletion from a given Structure ID."""
        all_ids = [1, 2, 3]
        # start=None means iterate everything from the beginning.
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, None)),
            all_ids
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 1)),
            all_ids
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 2)),
            [2, 3]
        )
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 3)),
            [3]
        )
        # A start value past every ID yields nothing (not an error).
        self.assertEqual(
            list(SplitMongoBackend.iter_from_start(all_ids, 4)),
            []
        )
@unittest.skip("Requires local MongoDB instance (run manually).")
class TestSplitMongoBackend(unittest.TestCase):
    """
    Tests the MongoDB-specific portions of the code.

    These tests should be about simple read/write from the database. Complex
    trees of Structures can be created and tested in TestCourseChangePlan
    without invoking the database.

    These tests will be disabled by default because I didn't want to add MongoDB
    as a test-time dependency for these scripts, and the only decent looking MongoDB
    mocking library I could find was no longer being maintained. Given how
    isolated the Split Mongo related code is (nothing else touches it),
    the main danger of breakage comes from file format changes in edx-platform,
    which automated testing at this level wouldn't catch anyway.

    So basically, if you want to work on this code, please run these tests
    locally by spinning up the MongoDB server used for Docker Devstack and
    commenting out the unittest.skip decorator above.
    """
    # Connection string and throwaway database used for these manual tests.
    CONNECT_STR = "mongodb://localhost:27017"
    DATABASE_NAME = "splitmongo_test"

    def setUp(self):
        """Clear our test MongoDB instance of data."""
        super().setUp()
        self.client = MongoClient(self.CONNECT_STR)
        database = self.client[self.DATABASE_NAME]
        # Remove anything that might have been there from a previous test.
        database.drop_collection('modulestore.active_versions')
        database.drop_collection('modulestore.structures')
        # Convenience pointers to our collections.
        self.active_versions = database['modulestore.active_versions']
        self.structures = database['modulestore.structures']
        # The backend we should use in our tests for querying.
        self.backend = SplitMongoBackend(self.CONNECT_STR, self.DATABASE_NAME)
        self.seed_data()

    def seed_data(self):
        """Create a Course and Library."""
        structure_docs = [
            # Branch 1
            dict(_id=obj_id(1), original_version=obj_id(1), previous_version=None),
            dict(_id=obj_id(2), original_version=obj_id(1), previous_version=obj_id(1)),
            dict(_id=obj_id(3), original_version=obj_id(1), previous_version=obj_id(2)),
            dict(_id=obj_id(4), original_version=obj_id(1), previous_version=obj_id(3)),
            # Branch 2
            dict(_id=obj_id(10), original_version=obj_id(10), previous_version=None),
            dict(_id=obj_id(11), original_version=obj_id(10), previous_version=obj_id(10)),
            # Branch 3
            dict(_id=obj_id(20), original_version=obj_id(20), previous_version=None),
        ]
        active_versions_docs = [
            # A Course with both draft and published branches.
            {
                '_id': obj_id(100),
                'edited_on': datetime(2012, 5, 2),
                'org': 'edx',
                'course': 'split_course',
                'run': '2017',
                'versions': {
                    'draft-branch': obj_id(4),
                    'published-branch': obj_id(11)
                }
            },
            # A Content Library (single "library" branch).
            {
                '_id': obj_id(101),
                'edited_on': datetime(2012, 5, 3),
                'org': 'edx',
                'course': 'split_library',
                'run': 'library',
                'versions': {
                    'library': obj_id(20),
                }
            }
        ]
        self.structures.insert_many(structure_docs)
        self.active_versions.insert_many(active_versions_docs)

    def test_structures_graph(self):
        """Test pulling a full graph out."""
        graph = self.backend.structures_graph(0, 100)
        self.assertEqual(
            graph.branches,
            [
                ActiveVersionBranch(
                    id=str_id(100),
                    branch='draft-branch',
                    structure_id=str_id(4),
                    key=CourseLocator('edx', 'split_course', '2017'),
                    edited_on=datetime(2012, 5, 2),
                ),
                ActiveVersionBranch(
                    id=str_id(100),
                    branch='published-branch',
                    structure_id=str_id(11),
                    key=CourseLocator('edx', 'split_course', '2017'),
                    edited_on=datetime(2012, 5, 2),
                ),
                ActiveVersionBranch(
                    id=str_id(101),
                    branch='library',
                    structure_id=str_id(20),
                    key=LibraryLocator('edx', 'split_library'),
                    edited_on=datetime(2012, 5, 3),
                ),
            ]
        )
        self.assertEqual(
            list(graph.structures.keys()),
            [str_id(i) for i in [1, 2, 3, 4, 10, 11, 20]]
        )

    def test_update(self):
        """Execute a simple update."""
        self.backend.update(
            ChangePlan(
                delete=[str_id(i) for i in [2, 3]],
                update_parents=[(str_id(4), str_id(1))]
            ),
            delay=0
        )
        graph = self.backend.structures_graph(0, 100)
        self.assertEqual(
            list(graph.structures.keys()),
            [str_id(i) for i in [1, 4, 10, 11, 20]]
        )
        self.assertEqual(
            graph.structures,
            {
                str_id(1): Structure(id=str_id(1), original_id=str_id(1), previous_id=None),
                # This one got its previous_id rewritten from 3 -> 1
                str_id(4): Structure(id=str_id(4), original_id=str_id(1), previous_id=str_id(1)),
                str_id(10): Structure(id=str_id(10), original_id=str_id(10), previous_id=None),
                str_id(11): Structure(id=str_id(11), original_id=str_id(10), previous_id=str_id(10)),
                str_id(20): Structure(id=str_id(20), original_id=str_id(20), previous_id=None),
            }
        )

    def test_race_condition(self):
        """Create new Structures during ChangePlan creation (simulated race)."""
        # Get the real method before we patch it...
        real_all_structures_fn = SplitMongoBackend._all_structures  # pylint: disable=protected-access

        def add_structures(backend, delay, batch_size):
            """Do what _all_structures() would do, then add new Structures."""
            structures = real_all_structures_fn(backend, delay, batch_size)
            # Create new Structures
            self.structures.insert_one(
                dict(_id=obj_id(5), original_version=obj_id(1), previous_version=obj_id(4)),
            )
            self.structures.insert_one(
                dict(_id=obj_id(6), original_version=obj_id(1), previous_version=obj_id(5)),
            )
            self.structures.insert_one(
                dict(_id=obj_id(7), original_version=obj_id(1), previous_version=obj_id(6)),
            )
            # Update the Draft branch of course-v1:edx+split_course+2017 to
            # point to one of the new Structures
            self.active_versions.update_one(
                {'_id': obj_id(100)},
                {'$set': {'versions.draft-branch': obj_id(5)}}
            )
            # Create an entirely new ActiveVersion and point it to the newest
            # Structure.
            self.active_versions.insert_one(
                {
                    '_id': obj_id(102),
                    'edited_on': datetime(2012, 5, 3),
                    'org': 'edx',
                    'course': 'split_library_race',
                    'run': 'library',
                    'versions': {
                        'library': obj_id(7),
                    }
                }
            )
            return structures

        with patch.object(SplitMongoBackend, '_all_structures', autospec=True) as all_structures_mock:
            all_structures_mock.side_effect = add_structures
            graph = self.backend.structures_graph(0, 100)
        self.assertEqual(len(graph.structures), 10)
        self.assertEqual(len(graph.branches), 4)
        plan = ChangePlan.create(graph, 0, False, False)
        self.assertNotIn(str_id(5), plan.delete)  # Active updated to this for our course.
        self.assertNotIn(str_id(7), plan.delete)  # Active for our new Library
        self.assertIn(str_id(4), plan.delete)  # Was our Active before
        self.assertIn(str_id(6), plan.delete)  # Intermediate structure to new Library
def str_id(int_id):
    """Zero-pad *int_id* to the 24-character string form PyMongo accepts for Object IDs."""
    return format(int_id, "024")
def obj_id(int_id):
    """Build an ObjectId from a small integer, zero-padded to 24 hex-compatible chars."""
    padded = "{:024}".format(int_id)
    return ObjectId(padded)

View File

@@ -0,0 +1,679 @@
"""
This module provides logic to clean up old, unused course content data for the
DraftVersioningModuleStore modulestore, more commonly referred to as the "Split
Mongo" or "Split" modulestore (DraftVersioningModuleStore subclasses
SplitMongoModuleStore). All courses and assets that have newer style locator
keys use DraftVersioningModuleStore. These keys start with "course-v1:",
"ccx-v1:", or "block-v1:".
The older modulestore is DraftModuleStore, sometimes called "Old Mongo". This
code does not address that modulestore in any way. That modulestore handles
courses that use the old "/" separator, such as "MITx/6.002x/2012_Spring", as
well as assets starting with "i4x://".
"Split" gets its name from the fact that it separates the Structure of a course
from the content in the leaf nodes. In theory, the Structure is an outline of
the course that contains all the parent/child relations for different content
blocks (chapters, sections, sub-sections, verticals, videos, etc.), as well as
small, commonly inherited metadata like due dates. More detailed information
about any particular block of content is stored in a separate collection as
Definitions.
Both Structures and Definitions are immutable in Split. When a course is edited,
a new Structure is created, and the Active Versions entry for a course is
updated to point to that new Structure. In that way, we never get a partially
applied edit -- it either succeeds or fails atomically. The Active Versions
entry for a Course has pointers to "published" and "draft" Structures. There is
also a special "library" pointer that is only used by Content Libraries. We do
not need to distinguish between these for the purposes of cleanup.
The problem is that Structure documents have become far larger than they were
intended to be, and we never created code to properly clean them up. As such, it
is not uncommon for the majority of Mongo storage space to be used by old
Structure documents that are completely unused (and are unreachable) by LMS or
Studio.
This module provides cleanup functionality with various tweakable options for
how much history to preserve. For simplicity, it reads all Structure IDs into
memory instead of working on subsets of the data. As a practical matter, this
means that it will work for databases with up to about 10 million Structures
before RAM usage starts to become a problem.
"""
from collections import deque, namedtuple
from itertools import count, takewhile
import json
import logging
import os
import sys
import time
from bson.objectid import ObjectId
from pymongo import MongoClient, UpdateOne
from opaque_keys.edx.locator import CourseLocator, LibraryLocator
LOG = logging.getLogger('structures')
class StructuresGraph(namedtuple('DatabaseSummary', 'branches structures')):
    """
    Summary of every Structure relationship in a database.

    Each Structure represents a saved state for a Course or Content Library.
    For each branch ("published", "draft", or "library"), there is a chain of
    Structures running from an Original to the currently Active one::

        Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active

    `branches` is a list of ActiveVersionBranch objects -- what's currently
    live on the LMS and Studio. Active Structures referenced here must never
    be removed, or the site would break for users.

    `structures` maps Structure ID strings to Structure objects. Structures
    hold the IDs of their parent and original Structures rather than direct
    object references; most of the graph never needs to be traversed (see
    `ChangePlan` for why).
    """
    def traverse_ids(self, start_id, limit=None, include_start=False):
        """
        Walk the previous_id chain from `start_id`, yielding up to `limit`
        ancestor IDs. With `limit=None` the walk continues through the
        Original. With `include_start=True`, `start_id` itself is yielded
        first (it does not count against `limit`).
        """
        if include_start:
            yield start_id
        node_id = start_id
        for step in count():
            # Stop if we've walked off the known graph (e.g. a missing parent).
            if node_id not in self.structures:
                break
            # Honor the caller's cap on how many ancestors to yield.
            if limit is not None and step >= limit:
                break
            node_id = self.structures[node_id].previous_id
            # Originals have no previous_id; the chain ends here.
            if node_id is None:
                break
            yield node_id
class ActiveVersionBranch(namedtuple('ActiveVersionBranch', 'id branch structure_id key edited_on')):
    """
    One branch of an Active Version document.

    A single Active Version document can point to several branches; this
    object represents exactly one of them. `branch` is one of "draft-branch",
    "published-branch", or "library": every Course has both a draft-branch and
    a published-branch, while Content Libraries have only a "library" branch.

    `key` is the Opaque Key for the Course or Library, kept mostly for
    debugging purposes (it is not part of the plan file).

    `edited_on` is a timestamp of the last modification of the Active Version
    document -- for a Course, that means the most recent edit of *either* the
    published-branch or the draft-branch. Like `key`, it is only here for
    debug output, not for pruning decisions.
    """
    def __str__(self):
        return (
            f"Active Version {self.id} "
            f"[{self.edited_on:%Y-%m-%d %H:%M:%S}] "
            f"{self.branch} for {self.key}"
        )
class Structure(namedtuple('Structure', 'id original_id previous_id')):
    """
    The slice of a SplitMongo Structure document that pruning cares about:
    the Structure's own ID (the str form of its ObjectId) plus the IDs of its
    Original and Previous Structure documents. `previous_id` is None for an
    Original Structure.

    A namedtuple is used deliberately: it is much more space efficient than a
    dict, and a database can hold millions of Structures.
    """
    def is_original(self):
        """Is this Structure an Original (i.e. one that should never be deleted)?"""
        return self.previous_id is None
class ChangePlan(namedtuple('ChangePlan', 'delete update_parents')):
    """
    Summary of the pruning actions we want a Backend to take.
    The idea of having this data structure and being able to serialize it is so
    that we can save our plan of action somewhere for debugging, failure
    recovery, and batching updates.
    `delete` is a list of Structure IDs we want to delete.
    `update_parents` is a list of (structure_id, new_previous_id) tuples
    representing the previous_id updates we need to make.
    A ChangePlan is just a declarative description. It is the responsibility
    of the Backend to figure out how to implement a ChangePlan safely and
    efficiently in order to do the actual updates.
    """
    def dump(self, file_obj):
        """
        Serialize ChangePlan to a file (JSON format).
        `file_obj` must be a writable file object; its `name` attribute is
        used for logging the path that was written.
        """
        json.dump(
            {
                "delete": self.delete,
                "update_parents": self.update_parents,
            },
            file_obj,
            indent=2,
        )
        LOG.info(
            "Wrote Change Plan: %s (%s deletions, %s parent updates)",
            os.path.realpath(file_obj.name),
            len(self.delete),
            len(self.update_parents)
        )
    @classmethod
    def load(cls, file_obj):
        """Load a ChangePlan from a JSON file. Takes a file object."""
        # NOTE: JSON round-trips the (structure_id, new_previous_id) tuples
        # as 2-element lists; consumers iterate/unpack them, so that's fine.
        data = json.load(file_obj)
        return cls(
            delete=data["delete"], update_parents=data["update_parents"]
        )
    @classmethod
    def create(cls, structures_graph, num_intermediate_structures, ignore_missing, dump_structures, details_file=None):
        """
        Given a StructuresGraph and a target number for intermediate Structures
        to preserve, return a ChangePlan that represents the changes needed to
        prune the database. The overall strategy is to iterate through all
        Active Structures, walk back through the ancestors, and add all the
        Structure IDs we should save to a set. After we have our save set, we
        know that we can delete all other structures without worrying about
        whether those Structures are reachable or knowing what their
        relationships are. This keeps things simpler, and means that we should
        be more resilient to failures when pruning.
        Structure documents exist in chains of parent/child relationships,
        starting with an Original Structure, having some number of Intermediate
        Structures, and ending in an Active Structure::
            Original -> (Intermediate 1) -> (Intermediate 2) -> ... -> Active
        Pruning Rules:
        1. All Active Structures must be preserved, as those are being used by
           the LMS and Studio to serve course content.
        2. All Original Structures should be preserved, since those are used by
           the LMS and Studio to determine common shared ancestry between
           Structures.
        3. Up to `num_intermediate_structures` Intermediate Structures will be
           kept. These Structures are not actually used in edx-platform code,
           but they are sometimes used by developers to allow emergency reverts
           in course team support situations (e.g. someone accidentally wiped
           out their course with a bad import).
        4. The oldest preserved Intermediate Structure will be modified so that
           its `previous_id` is updated to point to the Original Structure. That
           way, we're not preserving references to the IDs of Structures that
           have been pruned.
        If `ignore_missing` is falsy and any ID in the save set has no
        Structure document, this logs an error and exits the process.
        If `dump_structures` is truthy, every Structure is logged along with
        its save/active/relink status. If `details_file` is given, a human
        readable summary is written to it via `write_details`.
        """
        structure_ids_to_save = set()
        set_parent_to_original = set()
        branches, structures = structures_graph
        # Figure out which Structures to save...
        for branch in branches:
            # Anything that's actively being pointed to (is the head of a branch)
            # must be preserved. This is what's being served by Studio and LMS.
            active_structure_id = branch.structure_id
            structure_ids_to_save.add(active_structure_id)
            # All originals will be saved.
            structure_ids_to_save.add(structures[active_structure_id].original_id)
            # Save up to `num_intermediate_structures` intermediate nodes
            int_structure_ids_to_save = structures_graph.traverse_ids(
                active_structure_id, limit=num_intermediate_structures
            )
            for int_structure_id in int_structure_ids_to_save:
                structure_ids_to_save.add(int_structure_id)
        # traverse_ids() can yield an ID whose document is absent (membership
        # is only checked on the *next* loop iteration), and an original_id
        # may be absent too -- those IDs show up here as "missing".
        missing_structure_ids = structure_ids_to_save - structures.keys()
        if ignore_missing:
            # Remove missing structures since we can't save them
            structure_ids_to_save -= missing_structure_ids
        elif len(missing_structure_ids) > 0:
            LOG.error("Missing structures detected")
            sys.exit(1)
        # Figure out what links to rewrite -- the oldest structure to save that
        # isn't an original.
        for branch in branches:
            rewrite_candidates = takewhile(
                lambda s: s in structure_ids_to_save and not structures[s].is_original(),
                structures_graph.traverse_ids(branch.structure_id, include_start=True)
            )
            # `last_seen` will have the last structure_id from the
            # `rewrite_candidates` iterable (deque with maxlen=1 keeps only
            # the final element).
            last_seen = deque(rewrite_candidates, 1)
            if last_seen:
                structure = structures[last_seen.pop()]
                # Don't do a rewrite if it's just a no-op...
                if structure.original_id != structure.previous_id:
                    set_parent_to_original.add(structure.id)
        # Sort the items in the ChangePlan. This might not be helpful, but I'm
        # hoping that it will keep disk changes more localized and not thrash
        # things as much as randomly distributed deletes. Mongo ObjectIDs are
        # ordered (they have a timestamp component).
        change_plan = cls(
            delete=sorted(structures.keys() - structure_ids_to_save),
            update_parents=sorted(
                (s_id, structures[s_id].original_id)
                for s_id in set_parent_to_original
            )
        )
        if details_file:
            change_plan.write_details(
                details_file, structures_graph, structure_ids_to_save, set_parent_to_original
            )
        # Optional debug output: one log line per known Structure.
        # NOTE(review): the local `prev_misssing` is misspelled (sic) but used
        # consistently; the log label is spelled "prev_missing".
        if dump_structures:
            active_structure_ids = {branch.structure_id for branch in branches}
            for sid in structures:
                save = sid in structure_ids_to_save
                active = sid in active_structure_ids
                relink = sid in set_parent_to_original
                prev_misssing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                LOG.info(f"DUMP id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_misssing}, rewrite_previous_to_original: {relink}")
        # Pure diagnostics from here down: for every missing ID, log which
        # Structures reference it and which branches lead to it.
        for missing_structure_id in missing_structure_ids:
            active_structure_ids = {branch.structure_id for branch in branches}
            LOG.error(f"Missing structure ID: {missing_structure_id}")
            original_ids = set()
            for structure in structures.values():
                if structure.previous_id == missing_structure_id:
                    # NOTE(review): save/active/relink/prev_misssing below are
                    # computed but never used in this loop's log line --
                    # presumably leftover debug scaffolding.
                    save = structure.id in structure_ids_to_save
                    active = structure.id in active_structure_ids
                    relink = structure.id in set_parent_to_original
                    prev_misssing = structure.previous_id is not None and structure.previous_id not in structures
                    LOG.info(f"Structure {structure.id} points to missing structure with ID: {structure.previous_id}")
                    original_ids.add(structure.original_id)
            active_structure_ids = {branch.structure_id for branch in branches}
            branches_to_log = []
            LOG.info(f"Looking for branches that lead to missing ID {missing_structure_id}")
            # A branch "leads to" the missing ID if it shares an original with
            # a referencing Structure and its ancestry walk hits an unknown ID.
            for branch in branches:
                structure = structures[branch.structure_id]
                if structure.original_id in original_ids:
                    for sid in structures_graph.traverse_ids(branch.structure_id):
                        if sid not in structures:
                            branches_to_log.append(branch)
            for branch in branches_to_log:
                structure = structures[branch.structure_id]
                LOG.info(f"Branch: {branch}")
                # NOTE(review): these four locals are also unused before being
                # recomputed inside the loop below.
                save = branch.structure_id in structure_ids_to_save
                active = branch.structure_id in active_structure_ids
                relink = branch.structure_id in set_parent_to_original
                prev_misssing = structure.previous_id is not None and structure.previous_id not in structures
                for sid in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                    if sid in structures:
                        save = sid in structure_ids_to_save
                        active = sid in active_structure_ids
                        relink = sid in set_parent_to_original
                        prev_misssing = structures[sid].previous_id is not None and structures[sid].previous_id not in structures
                        LOG.info(f"id: {sid}, original_id: {structures[sid].original_id}, previous_id: {structures[sid].previous_id}, save: {save}, active: {active}, prev_missing: {prev_misssing}, rewrite_previous_to_original: {relink}")
        return change_plan
    @staticmethod
    def write_details(details_file, structures_graph, structure_ids_to_save, set_parent_to_original):
        """
        Simple dump of the changes we're going to make to the database.
        This method requires information that we don't actually keep in the
        ChangePlan file, such as the Course IDs and edit times. Because of this,
        it can only be created at the time the ChangePlan is being generated,
        and cannot be derived from an existing ChangePlan. The goal was to
        provide this debug information while keeping the ChangePlan file format
        as stupidly simple as possible.
        `details_file` is a writable file object; its `name` attribute is used
        for logging.
        """
        branches, structures = structures_graph
        active_structure_ids = {branch.structure_id for branch in branches}
        def text_for(s_id):
            """Helper method to format Structures consistently."""
            # "+" means the Structure is kept, "-" means it will be deleted.
            action = "+" if s_id in structure_ids_to_save else "-"
            notes = []
            if s_id in active_structure_ids:
                notes.append("(active)")
            if s_id in set_parent_to_original:
                notes.append("(re-link to original)")
            if s_id in structures and structures[s_id].is_original():
                notes.append("(original)")
            if notes:
                return "{} {} {}".format(action, s_id, " ".join(notes))
            return "{} {}".format(action, s_id)
        print("== Summary ==", file=details_file)
        print("Active Version Branches: {}".format(len(branches)), file=details_file)
        print("Total Structures: {}".format(len(structures)), file=details_file)
        print("Structures to Save: {}".format(len(structure_ids_to_save)), file=details_file)
        print("Structures to Delete: {}".format(len(structures) - len(structure_ids_to_save)), file=details_file)
        print("Structures to Rewrite Parent Link: {}".format(len(set_parent_to_original)), file=details_file)
        print("\n== Active Versions ==", file=details_file)
        # One section per branch: the branch line, then its ancestry chain.
        for branch in branches:
            print("{}".format(branch), file=details_file)
            for structure_id in structures_graph.traverse_ids(branch.structure_id, include_start=True):
                print(text_for(structure_id), file=details_file)
            print("", file=details_file)
        LOG.info(
            "Wrote Change Details File: %s", os.path.realpath(details_file.name)
        )
class SplitMongoBackend:
"""
Interface to the MongoDB backend. This is currently the only supported KV
store for the Split(DraftVersioning)ModuleStore, but having this as a
separate class makes it easier to stub in test data.
The methods on this class should accept and return backend-agnostic data
structures, so no BSON details should leak out.
"""
    def __init__(self, mongo_connection_str, db_name):
        """
        Connect to the MongoDB backend.

        `mongo_connection_str` is a pymongo connection string and `db_name`
        is the name of the database holding the modulestore collections.
        """
        # Short connect/server-selection timeouts so a bad host fails fast;
        # the socket timeout is long because our bulk reads/deletes can
        # legitimately take minutes.
        self._db = MongoClient(
            mongo_connection_str,
            connectTimeoutMS=2000,
            socketTimeoutMS=300000,  # *long* operations
            serverSelectionTimeoutMS=2000
        )
        # The two Split modulestore collections this backend operates on.
        self._active_versions = self._db[db_name].modulestore.active_versions
        self._structures = self._db[db_name].modulestore.structures
def structures_graph(self, delay, batch_size):
"""
Return StructuresGraph for the entire modulestore.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
This has one slight complication. A StructuresGraph is expected to be a
consistent view of the database, but MongoDB doesn't offer a "repeatable
read" transaction isolation mode. That means that Structures may be
added at any time between our database calls. Because of this, we have
to be careful in stitching together something that is safe. The
guarantees we try to make about the StructuresGraph being returned are:
1. Every Structure ID in `active_structure_ids` is also in `structures`
2. If `branches` is stale and there is a new Structure that is Active
in the database, it is *not* in `structures`.
Scenario A: We fetch branches, then structures
1. Get Branches (and thus Active Structure IDs)
2. New Structures created by Studio
3. Get all Structures
It is almost certainly the case that the new Structures created in (2)
should be active. Our algorithm works by starting from the Active
Structure IDs that we know about, making a "save" list, and then
deleting all other Structures. The problem in this scenario is that we
fetch the new Structures in (3), but we don't know that they're Active
because our `active_structure_ids` comes from (1) and is stale. So we
would in fact delete what should be Active Structures.
Scenario B: We fetch structures, then branches
1. Get all Structures
2. New Structures created by Studio
3. Get Branches (and thus Active Structure IDs)
In this scenario, we may see Active Structure IDs that are not in
our Structures dict. This is bad because we won't know how to crawl
their ancestry and mark the appropriate Structure IDs to be saved.
So the approach we take is Scenario B with a fallback. After we fetch
everything, we go through the Active Structure IDs and make sure that
those Structures and their ancestors exist in `structures`. If they
don't, we make extra fetches to get them. Misses should be rare, so it
shouldn't have a drastic performance impact overall.
Note that it's safe if the ChangePlan as a whole is a little stale, so
long as it's internally consistent. We only ever delete Structures that
are in the `structures` doc, so a new Active Version that we're
completely unaware of will be left alone.
"""
structures = self._all_structures(delay, batch_size)
branches = self._all_branches()
# Guard against the race condition that branch.structure_id or its
# ancestors are not in `structures`. Make sure that we add those.
LOG.info(
"Checking for missing Structures (a small number are expected "
"unless edits are disabled during change plan creation)."
)
missing_count = 0
for branch in branches:
structure_id = branch.structure_id
while structure_id and (structure_id not in structures):
structures[structure_id] = self._get_structure(structure_id)
missing_count += 1
LOG.warning(
"Structure %s linked from Active Structure %s (%s) fetched.",
structure_id,
branch.structure_id,
branch.key,
)
structure_id = structures[structure_id].previous_id
LOG.info("Finished checking for missing Structures, found %s", missing_count)
return StructuresGraph(branches, structures)
def _all_structures(self, delay, batch_size):
"""
Return a dict mapping Structure IDs to Structures for all Structures in
the database.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
"""
LOG.info("Fetching all known Structures (this might take a while)...")
LOG.info("Delay in seconds: %s, Batch size: %s", delay, batch_size)
# Important to keep this as a generator to limit memory usage.
parsed_docs = (
self.parse_structure_doc(doc)
for doc
in self._structures_from_db(delay, batch_size)
)
structures = {structure.id: structure for structure in parsed_docs}
LOG.info("Fetched %s Structures", len(structures))
return structures
def _structures_from_db(self, delay, batch_size):
"""
Iterate through all Structure documents in the database.
`batch_size` is the number of structure documents we pull at a time.
`delay` is the delay in seconds between batch queries.
"""
cursor = self._structures.find(
projection=['original_version', 'previous_version']
)
cursor.batch_size(batch_size)
for i, structure_doc in enumerate(cursor, start=1):
yield structure_doc
if i % batch_size == 0:
LOG.info("Structure Cursor at %s (%s)", i, structure_doc['_id'])
time.sleep(delay)
def _all_branches(self):
"""Retrieve list of all ActiveVersionBranch objects in the database."""
branches = []
LOG.info("Fetching all Active Version Branches...")
for av_doc in self._active_versions.find():
for branch, obj_id in av_doc['versions'].items():
structure_id = str(obj_id)
if branch == 'library':
key = LibraryLocator(av_doc['org'], av_doc['course'])
else:
key = CourseLocator(av_doc['org'], av_doc['course'], av_doc['run'])
branches.append(
ActiveVersionBranch(
str(av_doc['_id']),
branch,
structure_id,
key,
av_doc['edited_on'],
)
)
LOG.info("Fetched %s Active Version Branches", len(branches))
return sorted(branches)
def _get_structure(self, structure_id):
"""Get an individual Structure from the database."""
structure_doc = self._structures.find_one(
{'_id': ObjectId(structure_id)},
projection=['original_version', 'previous_version']
)
return self.parse_structure_doc(structure_doc)
    def update(self, change_plan, delay=1000, batch_size=1000, start=None):
        """
        Update the backend according to the relinking and deletions specified in
        the change_plan.

        `delay` (seconds between batches) and `batch_size` (documents per
        batch) are forwarded to both steps; `start` optionally resumes the
        delete step from a given Structure ID.

        NOTE(review): the default `delay=1000` ends up in time.sleep(), i.e.
        nearly 17 minutes between batches -- confirm whether this was meant
        to be milliseconds or a much smaller number of seconds.
        """
        # Step 1: Relink - Change the previous pointer for the oldest structure
        # we want to keep, so that it points back to the original. We never
        # delete the original. Relinking happens before deletion so that we
        # never leave our course in a broken state (at worst, parts of it
        # become unreachable).
        self._update_parents(change_plan.update_parents, delay, batch_size)
        # Step 2: Delete unused Structures
        self._delete(change_plan.delete, delay, batch_size, start)
def _update_parents(self, id_parent_pairs, delay, batch_size):
"""
Update Structure parent relationships.
`id_parent_pairs` is a list of tuples, where the first element of each
tuple is a Structure ID (str) to target, and the second element is the
Structure ID that will be the new parent of the first element.
"""
for id_parent_pairs_batch in self.batch(id_parent_pairs, batch_size):
updates = [
UpdateOne(
{'_id': ObjectId(structure_id)},
{'$set': {'previous_version': ObjectId(previous_id)}}
)
for structure_id, previous_id in id_parent_pairs_batch
]
result = self._structures.bulk_write(updates)
LOG.info(
"Updated %s/%s parent relationships.",
result.bulk_api_result['nModified'],
result.bulk_api_result['nMatched'],
)
time.sleep(delay)
def _delete(self, structure_ids, delay, batch_size, start=None):
"""
Delete old structures in batches.
`structure_ids` is a list of Structure IDs to delete.
`delay` is the delay in seconds (floats are ok) between batch deletes.
`batch_size` is how many we try to delete in each batch statement.
"""
s_ids_with_offset = self.iter_from_start(structure_ids, start)
for structure_ids_batch in self.batch(s_ids_with_offset, batch_size):
result = self._structures.delete_many(
{
'_id': {
'$in': [ObjectId(s_id) for s_id in structure_ids_batch]
}
}
)
LOG.info(
"Deleted %s/%s Structures: %s - %s",
result.deleted_count,
len(structure_ids_batch),
structure_ids_batch[0],
structure_ids_batch[-1],
)
time.sleep(delay)
@staticmethod
def parse_structure_doc(structure_doc):
"""
Structure docs are pretty big, but we only care about three top level
fields, all of which are ObjectIds:
_id: The Structure ID
previous_version: The Structure ID for the parent. An Original
Structure will have None for this field.
original_version: The Original Structure that this Structure and all
its ancestors are ultimately dervied from. An
Original Structure points to itself with this field.
"""
_id = str(structure_doc['_id'])
original_id = str(structure_doc['original_version'])
previous_id = structure_doc['previous_version']
if previous_id is not None:
previous_id = str(previous_id)
return Structure(_id, original_id, previous_id)
@staticmethod
def batch(iterable, batch_size):
"""Yield lists of up to `batch_size` in length from `iterable`."""
iterator = iter(iterable)
curr_batch = []
for i in count(1):
try:
curr_batch.append(next(iterator))
if i % batch_size == 0:
yield curr_batch
curr_batch = []
except StopIteration:
break
if curr_batch:
yield curr_batch
@staticmethod
def iter_from_start(structure_ids, start=None):
"""
Yields from an iterable once it encounters the `start` value. If `start`
is None, just yields from the beginning.
"""
if start is None:
for structure_id in structure_ids:
yield structure_id
return
for structure_id in structure_ids:
if structure_id < start:
continue
yield structure_id