diff --git a/lms/djangoapps/courseware/management/commands/clean_history.py b/lms/djangoapps/courseware/management/commands/clean_history.py new file mode 100644 index 0000000000..2c243a43be --- /dev/null +++ b/lms/djangoapps/courseware/management/commands/clean_history.py @@ -0,0 +1,217 @@ +"""A command to clean the StudentModuleHistory table. + +When we added XBlock storage, each field modification wrote a new history row +to the db. Now that we have bulk saves to avoid that database hammering, we +need to clean out the unnecessary rows from the database. + +This command that does that. + +""" + +import datetime +import json +from optparse import make_option +import traceback + +from django.core.management.base import NoArgsCommand +from django.db import connection + + +class Command(NoArgsCommand): + """The actual clean_history command to clean history rows.""" + + help = "Deletes unneeded rows from the StudentModuleHistory table." + + option_list = NoArgsCommand.option_list + ( + make_option( + '--dry-run', + action='store_true', + default=False, + help="Don't change the database, just show what would be done.", + ), + ) + + def handle_noargs(self, **options): + smhc = StudentModuleHistoryCleaner( + dry_run=options["dry_run"], + ) + smhc.main() + + +class StudentModuleHistoryCleaner(object): + """Logic to clean rows from the StudentModuleHistory table.""" + + DELETE_GAP_SECS = 0.5 # Rows this close can be discarded. + STATE_FILE = "clean_history.json" + BATCH_SIZE = 100 + + def __init__(self, dry_run=False): + self.dry_run = dry_run + self.next_student_module_id = 0 + self.last_student_module_id = 0 + + def main(self, batch_size=None): + """Invoked from the management command to do all the work.""" + + batch_size = batch_size or self.BATCH_SIZE + + self.last_student_module_id = self.get_last_student_module_id() + self.load_state() + + while self.next_student_module_id <= self.last_student_module_id: + for smid in self.module_ids_to_check(batch_size): + try: + self.clean_one_student_module(smid) + except Exception: # pylint: disable=W0703 + trace = traceback.format_exc() + self.say("Couldn't clean student_module_id {}:\n{}".format(smid, trace)) + self.commit() + self.save_state() + + def say(self, message): + """ + Display a message to the user. + + The message will have a trailing newline added to it. + + """ + print message + + def commit(self): + """ + Commit the transaction. + """ + self.say("Committing") + connection.commit() + + def load_state(self): + """ + Load the latest state from disk. + """ + try: + state_file = open(self.STATE_FILE) + except IOError: + self.say("No stored state") + self.next_student_module_id = 0 + else: + with state_file: + state = json.load(state_file) + self.say( + "Loaded stored state: {}".format( + json.dumps(state, sort_keys=True) + ) + ) + self.next_student_module_id = state['next_student_module_id'] + + def save_state(self): + """ + Save the state to disk. + """ + state = { + 'next_student_module_id': self.next_student_module_id, + } + with open(self.STATE_FILE, "w") as state_file: + json.dump(state, state_file) + self.say("Saved state: {}".format(json.dumps(state, sort_keys=True))) + + def get_last_student_module_id(self): + """ + Return the id of the last student_module. + """ + cursor = connection.cursor() + cursor.execute(""" + SELECT max(student_module_id) FROM courseware_studentmodulehistory + """) + last = cursor.fetchone()[0] + self.say("Last student_module_id is {}".format(last)) + return last + + def module_ids_to_check(self, batch_size): + """Produce a sequence of student module ids to check. + + `batch_size` is how many module ids to produce, max. + + The sequence starts with `next_student_module_id`, and goes up to + and including `last_student_module_id`. + + `next_student_module_id` is updated as each id is yielded. + + """ + start = self.next_student_module_id + for smid in range(start, start+batch_size): + if smid > self.last_student_module_id: + break + yield smid + self.next_student_module_id = smid+1 + + def get_history_for_student_modules(self, student_module_id): + """ + Get the history rows for a student module. + + ```student_module_id```: the id of the student module we're + interested in. + + Return a list: [(id, created), ...], all the rows of history. + + """ + cursor = connection.cursor() + cursor.execute(""" + SELECT id, created FROM courseware_studentmodulehistory + WHERE student_module_id = %s + ORDER BY created + """, + [student_module_id] + ) + history = cursor.fetchall() + return history + + def delete_history(self, ids_to_delete): + """ + Delete history rows. + + ```ids_to_delete```: a non-empty list (or set...) of history row ids to delete. + + """ + assert ids_to_delete + cursor = connection.cursor() + cursor.execute(""" + DELETE FROM courseware_studentmodulehistory + WHERE id IN ({ids}) + """.format(ids=",".join(str(i) for i in ids_to_delete)) + ) + + def clean_one_student_module(self, student_module_id): + """Clean one StudentModule's-worth of history. + + `student_module_id`: the id of the StudentModule to process. + + """ + delete_gap = datetime.timedelta(seconds=self.DELETE_GAP_SECS) + + history = self.get_history_for_student_modules(student_module_id) + if not history: + self.say("No history for student_module_id {}".format(student_module_id)) + return + + ids_to_delete = [] + next_created = None + for history_id, created in reversed(history): + if next_created is not None: + # Compare this timestamp with the next one. + if (next_created - created) < delete_gap: + # This row is followed closely by another, we can discard + # this one. + ids_to_delete.append(history_id) + + next_created = created + + verb = "Would have deleted" if self.dry_run else "Deleting" + self.say("{verb} {to_delete} rows of {total} for student_module_id {id}".format( + verb=verb, + to_delete=len(ids_to_delete), + total=len(history), + id=student_module_id, + )) + + if ids_to_delete and not self.dry_run: + self.delete_history(ids_to_delete) diff --git a/lms/djangoapps/courseware/management/tests/test_clean_history.py b/lms/djangoapps/courseware/management/tests/test_clean_history.py new file mode 100644 index 0000000000..cf681a7b0a --- /dev/null +++ b/lms/djangoapps/courseware/management/tests/test_clean_history.py @@ -0,0 +1,459 @@ +"""Test the clean_history management command.""" + +import fnmatch +from mock import Mock +import os.path +import textwrap + +import dateutil.parser + +from django.test import TestCase +from django.db import connection + +from courseware.management.commands.clean_history import StudentModuleHistoryCleaner + +# In lots of places in this file, smhc == StudentModuleHistoryCleaner + +def parse_date(sdate): + """Parse a string date into a datetime.""" + parsed = dateutil.parser.parse(sdate) + parsed = parsed.replace(tzinfo=dateutil.tz.gettz('UTC')) + return parsed + +class SmhcSayStubbed(StudentModuleHistoryCleaner): + """StudentModuleHistoryCleaner, but with .say() stubbed for testing.""" + def __init__(self, **kwargs): + super(SmhcSayStubbed, self).__init__(**kwargs) + self.said_lines = [] + + def say(self, msg): + self.said_lines.append(msg) + + +class SmhcDbMocked(SmhcSayStubbed): + """StudentModuleHistoryCleaner, but with db access mocked.""" + def __init__(self, **kwargs): + super(SmhcDbMocked, self).__init__(**kwargs) + self.get_history_for_student_modules = Mock() + self.delete_history = Mock() + + def set_rows(self, rows): + """Set the mocked history rows.""" + rows = [(row_id, parse_date(created)) for row_id, created in rows] + self.get_history_for_student_modules.return_value = rows + + +class HistoryCleanerTest(TestCase): + """Base class for all history cleaner tests.""" + + maxDiff = None + + def setUp(self): + super(HistoryCleanerTest, self).setUp() + self.addCleanup(self.clean_up_state_file) + + def write_state_file(self, state): + """Write the string `state` into the state file read by StudentModuleHistoryCleaner.""" + with open(StudentModuleHistoryCleaner.STATE_FILE, "w") as state_file: + state_file.write(state) + + def read_state_file(self): + """Return the string contents of the state file read by StudentModuleHistoryCleaner.""" + with open(StudentModuleHistoryCleaner.STATE_FILE) as state_file: + return state_file.read() + + def clean_up_state_file(self): + """Remove any state file lying around.""" + if os.path.exists(StudentModuleHistoryCleaner.STATE_FILE): + os.remove(StudentModuleHistoryCleaner.STATE_FILE) + + def assert_said(self, smhc, *msgs): + """Fail if the `smhc` didn't say `msgs`. + + The messages passed here are `fnmatch`-style patterns: "*" means anything. + + """ + for said, pattern in zip(smhc.said_lines, msgs): + if not fnmatch.fnmatch(said, pattern): + fmt = textwrap.dedent("""\ + Messages: + + {msgs} + + don't match patterns: + + {patterns} + + Failed at {said!r} and {pattern!r} + """) + + msg = fmt.format( + msgs="\n".join(smhc.said_lines), + patterns="\n".join(msgs), + said=said, + pattern=pattern + ) + self.fail(msg) + + def parse_rows(self, rows): + """Parse convenient rows into real data.""" + rows = [ + (row_id, parse_date(created), student_module_id) + for row_id, created, student_module_id in rows + ] + return rows + + def write_history(self, rows): + """Write history rows to the db. + + Each row should be (id, created, student_module_id). + + """ + cursor = connection.cursor() + cursor.executemany(""" + INSERT INTO courseware_studentmodulehistory + (id, created, student_module_id) + VALUES (%s, %s, %s) + """, + self.parse_rows(rows), + ) + + def read_history(self): + """Read the history from the db, and return it as a list of tuples. + + Returns [(id, created, student_module_id), ...] + + """ + cursor = connection.cursor() + cursor.execute(""" + SELECT id, created, student_module_id FROM courseware_studentmodulehistory + """) + return cursor.fetchall() + + def assert_history(self, rows): + """Assert that the history rows are the same as `rows`.""" + self.assertEqual(self.parse_rows(rows), self.read_history()) + + +class HistoryCleanerNoDbTest(HistoryCleanerTest): + """Tests of StudentModuleHistoryCleaner with db access mocked.""" + + def test_empty(self): + smhc = SmhcDbMocked() + smhc.set_rows([]) + + smhc.clean_one_student_module(1) + self.assert_said(smhc, "No history for student_module_id 1") + + # Nothing to delete, so delete_history wasn't called. + self.assertFalse(smhc.delete_history.called) + + def test_one_row(self): + smhc = SmhcDbMocked() + smhc.set_rows([ + (1, "2013-07-13 12:11:10.987"), + ]) + smhc.clean_one_student_module(1) + self.assert_said(smhc, "Deleting 0 rows of 1 for student_module_id 1") + # Nothing to delete, so delete_history wasn't called. + self.assertFalse(smhc.delete_history.called) + + def test_one_row_dry_run(self): + smhc = SmhcDbMocked(dry_run=True) + smhc.set_rows([ + (1, "2013-07-13 12:11:10.987"), + ]) + smhc.clean_one_student_module(1) + self.assert_said(smhc, "Would have deleted 0 rows of 1 for student_module_id 1") + # Nothing to delete, so delete_history wasn't called. + self.assertFalse(smhc.delete_history.called) + + def test_two_rows_close(self): + smhc = SmhcDbMocked() + smhc.set_rows([ + (7, "2013-07-13 12:34:56.789"), + (9, "2013-07-13 12:34:56.987"), + ]) + smhc.clean_one_student_module(1) + self.assert_said(smhc, "Deleting 1 rows of 2 for student_module_id 1") + smhc.delete_history.assert_called_once_with([7]) + + def test_two_rows_far(self): + smhc = SmhcDbMocked() + smhc.set_rows([ + (7, "2013-07-13 12:34:56.789"), + (9, "2013-07-13 12:34:57.890"), + ]) + smhc.clean_one_student_module(1) + self.assert_said(smhc, "Deleting 0 rows of 2 for student_module_id 1") + self.assertFalse(smhc.delete_history.called) + + def test_a_bunch_of_rows(self): + smhc = SmhcDbMocked() + smhc.set_rows([ + ( 4, "2013-07-13 16:30:00.000"), # keep + ( 8, "2013-07-13 16:30:01.100"), + (15, "2013-07-13 16:30:01.200"), + (16, "2013-07-13 16:30:01.300"), # keep + (23, "2013-07-13 16:30:02.400"), + (42, "2013-07-13 16:30:02.500"), + (98, "2013-07-13 16:30:02.600"), # keep + (99, "2013-07-13 16:30:59.000"), # keep + ]) + smhc.clean_one_student_module(17) + self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 17") + smhc.delete_history.assert_called_once_with([42, 23, 15, 8]) + + +class HistoryCleanerWitDbTest(HistoryCleanerTest): + """Tests of StudentModuleHistoryCleaner with a real db.""" + + def test_no_history(self): + # Cleaning a student_module_id with no history leaves the db unchanged. + smhc = SmhcSayStubbed() + self.write_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + ( 8, "2013-07-13 16:30:01.100", 11), + (15, "2013-07-13 16:30:01.200", 11), + (16, "2013-07-13 16:30:01.300", 11), # keep + (23, "2013-07-13 16:30:02.400", 11), + (42, "2013-07-13 16:30:02.500", 11), + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + smhc.clean_one_student_module(22) + self.assert_said(smhc, "No history for student_module_id 22") + self.assert_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + ( 8, "2013-07-13 16:30:01.100", 11), + (15, "2013-07-13 16:30:01.200", 11), + (16, "2013-07-13 16:30:01.300", 11), # keep + (23, "2013-07-13 16:30:02.400", 11), + (42, "2013-07-13 16:30:02.500", 11), + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + def test_a_bunch_of_rows(self): + # Cleaning a student_module_id with 8 records, 4 to delete. + smhc = SmhcSayStubbed() + self.write_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + ( 8, "2013-07-13 16:30:01.100", 11), + (15, "2013-07-13 16:30:01.200", 11), + (16, "2013-07-13 16:30:01.300", 11), # keep + (17, "2013-07-13 16:30:01.310", 22), # other student_module_id! + (23, "2013-07-13 16:30:02.400", 11), + (42, "2013-07-13 16:30:02.500", 11), + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + smhc.clean_one_student_module(11) + self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 11") + self.assert_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + (16, "2013-07-13 16:30:01.300", 11), # keep + (17, "2013-07-13 16:30:01.310", 22), # other student_module_id! + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + def test_a_bunch_of_rows_dry_run(self): + # Cleaning a student_module_id with 8 records, 4 to delete, + # but don't really do it. + smhc = SmhcSayStubbed(dry_run=True) + self.write_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + ( 8, "2013-07-13 16:30:01.100", 11), + (15, "2013-07-13 16:30:01.200", 11), + (16, "2013-07-13 16:30:01.300", 11), # keep + (23, "2013-07-13 16:30:02.400", 11), + (42, "2013-07-13 16:30:02.500", 11), + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + smhc.clean_one_student_module(11) + self.assert_said(smhc, "Would have deleted 4 rows of 8 for student_module_id 11") + self.assert_history([ + ( 4, "2013-07-13 16:30:00.000", 11), # keep + ( 8, "2013-07-13 16:30:01.100", 11), + (15, "2013-07-13 16:30:01.200", 11), + (16, "2013-07-13 16:30:01.300", 11), # keep + (23, "2013-07-13 16:30:02.400", 11), + (42, "2013-07-13 16:30:02.500", 11), + (98, "2013-07-13 16:30:02.600", 11), # keep + (99, "2013-07-13 16:30:59.000", 11), # keep + ]) + + def test_a_bunch_of_rows_in_jumbled_order(self): + # Cleaning a student_module_id with 8 records, 4 to delete. + smhc = SmhcSayStubbed() + self.write_history([ + (23, "2013-07-13 16:30:01.100", 11), + (24, "2013-07-13 16:30:01.300", 11), # keep + (27, "2013-07-13 16:30:02.500", 11), + (30, "2013-07-13 16:30:01.350", 22), # other student_module_id! + (32, "2013-07-13 16:30:59.000", 11), # keep + (50, "2013-07-13 16:30:02.400", 11), + (51, "2013-07-13 16:30:02.600", 11), # keep + (56, "2013-07-13 16:30:00.000", 11), # keep + (57, "2013-07-13 16:30:01.200", 11), + ]) + + smhc.clean_one_student_module(11) + self.assert_said(smhc, "Deleting 4 rows of 8 for student_module_id 11") + self.assert_history([ + (24, "2013-07-13 16:30:01.300", 11), # keep + (30, "2013-07-13 16:30:01.350", 22), # other student_module_id! + (32, "2013-07-13 16:30:59.000", 11), # keep + (51, "2013-07-13 16:30:02.600", 11), # keep + (56, "2013-07-13 16:30:00.000", 11), # keep + ]) + + def test_get_last_student_module(self): + # Can we find the last student_module_id properly? + smhc = SmhcSayStubbed() + self.write_history([ + (23, "2013-07-13 16:30:01.100", 11), + (24, "2013-07-13 16:30:01.300", 44), + (27, "2013-07-13 16:30:02.500", 11), + (30, "2013-07-13 16:30:01.350", 22), + (32, "2013-07-13 16:30:59.000", 11), + (51, "2013-07-13 16:30:02.600", 33), + (56, "2013-07-13 16:30:00.000", 11), + ]) + last = smhc.get_last_student_module_id() + self.assertEqual(last, 44) + self.assert_said(smhc, "Last student_module_id is 44") + + def test_load_state_with_no_stored_state(self): + smhc = SmhcSayStubbed() + self.assertFalse(os.path.exists(smhc.STATE_FILE)) + smhc.load_state() + self.assertEqual(smhc.next_student_module_id, 0) + self.assert_said(smhc, "No stored state") + + def test_load_stored_state(self): + self.write_state_file('{"next_student_module_id": 23}') + smhc = SmhcSayStubbed() + smhc.load_state() + self.assertEqual(smhc.next_student_module_id, 23) + self.assert_said(smhc, 'Loaded stored state: {"next_student_module_id": 23}') + + def test_save_state(self): + smhc = SmhcSayStubbed() + smhc.next_student_module_id = 47 + smhc.save_state() + state = self.read_state_file() + self.assertEqual(state, '{"next_student_module_id": 47}') + + +class SmhcForTestingMain(SmhcSayStubbed): + """A StudentModuleHistoryCleaner with a few function stubbed for testing main.""" + + def __init__(self, *args, **kwargs): + self.exception_smids = kwargs.pop('exception_smids', ()) + super(SmhcForTestingMain, self).__init__(*args, **kwargs) + + def clean_one_student_module(self, smid): + self.say("(not really cleaning {})".format(smid)) + if smid in self.exception_smids: + raise Exception("Something went wrong!") + + def commit(self): + self.say("(not really committing)") + + +class HistoryCleanerMainTest(HistoryCleanerTest): + """Tests of StudentModuleHistoryCleaner.main(), using SmhcForTestingMain.""" + + def test_only_one_record(self): + smhc = SmhcForTestingMain() + self.write_history([ + (1, "2013-07-15 11:47:00.000", 1), + ]) + smhc.main() + self.assert_said(smhc, + 'Last student_module_id is 1', + 'No stored state', + '(not really cleaning 0)', + '(not really cleaning 1)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 2}', + ) + + def test_already_processed_some(self): + smhc = SmhcForTestingMain() + self.write_state_file('{"next_student_module_id": 25}') + self.write_history([ + (1, "2013-07-15 15:04:00.000", 23), + (2, "2013-07-15 15:04:11.000", 23), + (3, "2013-07-15 15:04:01.000", 24), + (4, "2013-07-15 15:04:00.000", 25), + (5, "2013-07-15 15:04:00.000", 26), + ]) + smhc.main() + self.assert_said(smhc, + 'Last student_module_id is 26', + 'Loaded stored state: {"next_student_module_id": 25}', + '(not really cleaning 25)', + '(not really cleaning 26)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 27}' + ) + + def test_working_in_batches(self): + smhc = SmhcForTestingMain() + self.write_state_file('{"next_student_module_id": 25}') + self.write_history([ + (3, "2013-07-15 15:04:01.000", 24), + (4, "2013-07-15 15:04:00.000", 25), + (5, "2013-07-15 15:04:00.000", 26), + (6, "2013-07-15 15:04:00.000", 27), + (7, "2013-07-15 15:04:00.000", 28), + (8, "2013-07-15 15:04:00.000", 29), + ]) + smhc.main(batch_size=3) + self.assert_said(smhc, + 'Last student_module_id is 29', + 'Loaded stored state: {"next_student_module_id": 25}', + '(not really cleaning 25)', + '(not really cleaning 26)', + '(not really cleaning 27)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 28}', + '(not really cleaning 28)', + '(not really cleaning 29)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 30}', + ) + + def test_something_failing_while_cleaning(self): + smhc = SmhcForTestingMain(exception_smids=[26]) + self.write_state_file('{"next_student_module_id": 25}') + self.write_history([ + (3, "2013-07-15 15:04:01.000", 24), + (4, "2013-07-15 15:04:00.000", 25), + (5, "2013-07-15 15:04:00.000", 26), + (6, "2013-07-15 15:04:00.000", 27), + (7, "2013-07-15 15:04:00.000", 28), + (8, "2013-07-15 15:04:00.000", 29), + ]) + smhc.main(batch_size=3) + self.assert_said(smhc, + 'Last student_module_id is 29', + 'Loaded stored state: {"next_student_module_id": 25}', + '(not really cleaning 25)', + '(not really cleaning 26)', + "Couldn't clean student_module_id 26:\nTraceback*Exception: Something went wrong!\n", + '(not really cleaning 27)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 28}', + '(not really cleaning 28)', + '(not really cleaning 29)', + '(not really committing)', + 'Saved state: {"next_student_module_id": 30}', + )