diff --git a/openedx/core/djangoapps/coursegraph/management/commands/dump_to_neo4j.py b/openedx/core/djangoapps/coursegraph/management/commands/dump_to_neo4j.py index 32f96ccad4..d275cc7bec 100644 --- a/openedx/core/djangoapps/coursegraph/management/commands/dump_to_neo4j.py +++ b/openedx/core/djangoapps/coursegraph/management/commands/dump_to_neo4j.py @@ -33,16 +33,25 @@ class ModuleStoreSerializer(object): one graph per course. """ - def __init__(self, courses=None, skip=None): + def __init__(self, course_keys): + self.course_keys = course_keys + + @classmethod + def create(cls, courses=None, skip=None): """ Sets the object's course_keys attribute from the `courses` parameter. If that parameter isn't furnished, loads all course_keys from the modulestore. + Filters out course_keys in the `skip` parameter, if provided. - Args: + + Arguments: courses: A list of string serializations of course keys. For example, ["course-v1:org+course+run"]. skip: Also a list of string serializations of course keys. + + Returns: + a ModulestoreSerializer instance """ if courses: course_keys = [CourseKey.from_string(course.strip()) for course in courses] @@ -51,14 +60,14 @@ class ModuleStoreSerializer(object): course.id for course in modulestore().get_course_summaries() ] if skip is not None: - skip_keys = [CourseKey.from_string(course.strip()) for course in skip] + skip_keys = set([CourseKey.from_string(course.strip()) for course in skip]) course_keys = [course_key for course_key in course_keys if course_key not in skip_keys] - self.course_keys = course_keys + return cls(course_keys) @staticmethod def serialize_item(item): """ - Args: + Arguments: item: an XBlock Returns: @@ -100,7 +109,7 @@ class ModuleStoreSerializer(object): def serialize_course(self, course_id): """ Serializes a course into py2neo Nodes and Relationships - Args: + Arguments: course_id: CourseKey of the course we want to serialize Returns: @@ -125,6 +134,7 @@ class ModuleStoreSerializer(object): # create relationships relationships = [] for item in items: + previous_child_node = None for index, child_loc in enumerate(item.get_children()): parent_node = location_to_node.get(item.location) child_node = location_to_node.get(child_loc.location) @@ -133,13 +143,20 @@ class ModuleStoreSerializer(object): relationship = Relationship(parent_node, "PARENT_OF", child_node) relationships.append(relationship) + if previous_child_node: + ordering_relationship = Relationship( + previous_child_node, "PRECEDES", child_node + ) + relationships.append(ordering_relationship) + previous_child_node = child_node + nodes = location_to_node.values() return nodes, relationships @staticmethod def coerce_types(value): """ - Args: + Arguments: value: the value of an xblock's field Returns: either the value, a text version of the value, or, if the @@ -159,7 +176,7 @@ class ModuleStoreSerializer(object): @staticmethod def add_to_transaction(neo4j_entities, transaction): """ - Args: + Arguments: neo4j_entities: a list of Nodes or Relationships transaction: a neo4j transaction """ @@ -170,7 +187,7 @@ class ModuleStoreSerializer(object): def get_command_last_run(course_key, graph): """ This information is stored on the course node of a course in neo4j - Args: + Arguments: course_key: a CourseKey graph: a py2neo Graph @@ -195,7 +212,7 @@ class ModuleStoreSerializer(object): """ We use the CourseStructure table to get when this course was last published. - Args: + Arguments: course_key: a CourseKey Returns: The datetime the course was last published at, converted into @@ -214,7 +231,7 @@ class ModuleStoreSerializer(object): """ Only dump the course if it's been changed since the last time it's been dumped. - Args: + Arguments: course_key: a CourseKey object. graph: a py2neo Graph object. @@ -240,11 +257,50 @@ class ModuleStoreSerializer(object): # before the course's last published event return last_this_command_was_run < course_last_published_date + def dump_course_to_neo4j(self, course_key, graph): + """ + Serializes a course and writes it to neo4j. + + Arguments: + course_key: course key for the course to be exported + graph: py2neo graph object + """ + + nodes, relationships = self.serialize_course(course_key) + log.info( + "%d nodes and %d relationships in %s", + len(nodes), + len(relationships), + course_key, + ) + + transaction = graph.begin() + course_string = six.text_type(course_key) + try: + # first, delete existing course + transaction.run( + "MATCH (n:item) WHERE n.course_key='{}' DETACH DELETE n".format( + course_string + ) + ) + + # now, re-add it + self.add_to_transaction(nodes, transaction) + self.add_to_transaction(relationships, transaction) + transaction.commit() + + except Exception: # pylint: disable=broad-except + log.exception( + "Error trying to dump course %s to neo4j, rolling back", + course_string + ) + transaction.rollback() + def dump_courses_to_neo4j(self, graph, override_cache=False): """ Method that iterates through a list of courses in a modulestore, - serializes them, then writes them to neo4j - Args: + serializes them, then submits tasks to write them to neo4j. + Arguments: graph: py2neo graph object override_cache: serialize the courses even if they'be been recently serialized @@ -255,8 +311,8 @@ class ModuleStoreSerializer(object): total_number_of_courses = len(self.course_keys) - successful_courses = [] - unsuccessful_courses = [] + submitted_courses = [] + skipped_courses = [] for index, course_key in enumerate(self.course_keys): # first, clear the request cache to prevent memory leaks @@ -271,43 +327,13 @@ class ModuleStoreSerializer(object): if not (override_cache or self.should_dump_course(course_key, graph)): log.info("skipping dumping %s, since it hasn't changed", course_key) - continue - - nodes, relationships = self.serialize_course(course_key) - log.info( - "%d nodes and %d relationships in %s", - len(nodes), - len(relationships), - course_key, - ) - - transaction = graph.begin() - course_string = six.text_type(course_key) - try: - # first, delete existing course - transaction.run( - "MATCH (n:item) WHERE n.course_key='{}' DETACH DELETE n".format( - course_string - ) - ) - - # now, re-add it - self.add_to_transaction(nodes, transaction) - self.add_to_transaction(relationships, transaction) - transaction.commit() - - except Exception: # pylint: disable=broad-except - log.exception( - "Error trying to dump course %s to neo4j, rolling back", - course_string - ) - transaction.rollback() - unsuccessful_courses.append(course_string) + skipped_courses.append(unicode(course_key)) else: - successful_courses.append(course_string) + self.dump_course_to_neo4j(course_key, graph) + submitted_courses.append(unicode(course_key)) - return successful_courses, unsuccessful_courses + return submitted_courses, skipped_courses class Command(BaseCommand): @@ -374,28 +400,24 @@ class Command(BaseCommand): secure=secure, ) - mss = ModuleStoreSerializer(options['courses'], options['skip']) + mss = ModuleStoreSerializer.create(options['courses'], options['skip']) - successful_courses, unsuccessful_courses = mss.dump_courses_to_neo4j( + submitted_courses, skipped_courses = mss.dump_courses_to_neo4j( graph, override_cache=options['override'] ) - if not successful_courses and not unsuccessful_courses: + log.info( + "%d courses submitted for export to neo4j. %d courses skipped.", + len(submitted_courses), + len(skipped_courses), + ) + + if not submitted_courses: print("No courses exported to neo4j at all!") return - if successful_courses: + if submitted_courses: print( - "These courses exported to neo4j successfully:\n\t" + - "\n\t".join(successful_courses) + "These courses were submitted for export to neo4j successfully:\n\t" + + "\n\t".join(submitted_courses) ) - else: - print("No courses exported to neo4j successfully.") - - if unsuccessful_courses: - print( - "These courses did not export to neo4j successfully:\n\t" + - "\n\t".join(unsuccessful_courses) - ) - else: - print("All courses exported to neo4j successfully.") diff --git a/openedx/core/djangoapps/coursegraph/management/commands/tests/test_dump_to_neo4j.py b/openedx/core/djangoapps/coursegraph/management/commands/tests/test_dump_to_neo4j.py index 3b69cbac09..1a18fbcd3f 100644 --- a/openedx/core/djangoapps/coursegraph/management/commands/tests/test_dump_to_neo4j.py +++ b/openedx/core/djangoapps/coursegraph/management/commands/tests/test_dump_to_neo4j.py @@ -31,6 +31,26 @@ class TestDumpToNeo4jCommandBase(SharedModuleStoreTestCase): """ @classmethod def setUpClass(cls): + """ + Creates two courses; one that's just a course module, and one that + looks like: + course + | + chapter + | + sequential + | + vertical + / | \ \ + / | \ ---------- + / | \ \ + / | --- \ + / | \ \ + html -> problem -> video -> video2 + + The side-pointing arrows (->) are PRECEDES relationships; the more + vertical lines are PARENT_OF relationships. + """ super(TestDumpToNeo4jCommandBase, cls).setUpClass() cls.course = CourseFactory.create() cls.chapter = ItemFactory.create(parent=cls.course, category='chapter') @@ -51,7 +71,7 @@ class TestDumpToNeo4jCommandBase(SharedModuleStoreTestCase): Replaces the py2neo Graph object with a MockGraph; similarly replaces NodeSelector with MockNodeSelector. - Args: + Arguments: mock_selector_class: a mocked NodeSelector class mock_graph_class: a mocked Graph class transaction_errors: a bool for whether we should get errors @@ -71,7 +91,7 @@ class TestDumpToNeo4jCommandBase(SharedModuleStoreTestCase): """ Asserts that we have the expected number of courses, commits, and rollbacks after we dump the modulestore to neo4j - Args: + Arguments: mock_graph: a MockGraph backend number_of_courses: number of courses we expect to find number_commits: number of commits we expect against the graph @@ -203,7 +223,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): def setUpClass(cls): """Any ModuleStore course/content operations can go here.""" super(TestModuleStoreSerializer, cls).setUpClass() - cls.mss = ModuleStoreSerializer() + cls.mss = ModuleStoreSerializer.create() def test_serialize_item(self): """ @@ -228,7 +248,81 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): """ nodes, relationships = self.mss.serialize_course(self.course.id) self.assertEqual(len(nodes), 9) - self.assertEqual(len(relationships), 7) + # the course has 7 "PARENT_OF" relationships and 3 "PRECEDES" + self.assertEqual(len(relationships), 10) + + @staticmethod + def _extract_relationship_pairs(relationships, relationship_type): + """ + Extracts a list of XBlock location tuples from a list of Relationships. + + Arguments: + relationships: list of py2neo `Relationship` objects + relationship_type: the type of relationship to filter `relationships` + by. + Returns: + List of tuples of the locations of of the relationships' + constituent nodes. + """ + relationship_pairs = [ + tuple([node["location"] for node in rel.nodes()]) + for rel in relationships if rel.type() == relationship_type + ] + return relationship_pairs + + @staticmethod + def _extract_location_pair(xblock1, xblock2): + """ + Returns a tuple of locations from two XBlocks. + + Arguments: + xblock1: an xblock + xblock2: also an xblock + + Returns: + A tuple of the string representations of those XBlocks' locations. + """ + return (unicode(xblock1.location), unicode(xblock2.location)) + + def assertBlockPairIsRelationship(self, xblock1, xblock2, relationships, relationship_type): + """ + Helper assertion that a pair of xblocks have a certain kind of + relationship with one another. + """ + relationship_pairs = self._extract_relationship_pairs(relationships, relationship_type) + location_pair = self._extract_location_pair(xblock1, xblock2) + self.assertIn(location_pair, relationship_pairs) + + def assertBlockPairIsNotRelationship(self, xblock1, xblock2, relationships, relationship_type): + """ + The opposite of `assertBlockPairIsRelationship`: asserts that a pair + of xblocks do NOT have a certain kind of relationship. + """ + relationship_pairs = self._extract_relationship_pairs(relationships, relationship_type) + location_pair = self._extract_location_pair(xblock1, xblock2) + self.assertNotIn(location_pair, relationship_pairs) + + def test_precedes_relationship(self): + """ + Tests that two nodes that should have a precedes relationship have it. + """ + __, relationships = self.mss.serialize_course(self.course.id) + self.assertBlockPairIsRelationship(self.video, self.video2, relationships, "PRECEDES") + self.assertBlockPairIsNotRelationship(self.video2, self.video, relationships, "PRECEDES") + self.assertBlockPairIsNotRelationship(self.vertical, self.video, relationships, "PRECEDES") + self.assertBlockPairIsNotRelationship(self.html, self.video, relationships, "PRECEDES") + + def test_parent_relationship(self): + """ + Test that two nodes that should have a parent_of relationship have it. + """ + __, relationships = self.mss.serialize_course(self.course.id) + self.assertBlockPairIsRelationship(self.vertical, self.video, relationships, "PARENT_OF") + self.assertBlockPairIsRelationship(self.vertical, self.html, relationships, "PARENT_OF") + self.assertBlockPairIsRelationship(self.course, self.chapter, relationships, "PARENT_OF") + self.assertBlockPairIsNotRelationship(self.course, self.video, relationships, "PARENT_OF") + self.assertBlockPairIsNotRelationship(self.video, self.vertical, relationships, "PARENT_OF") + self.assertBlockPairIsNotRelationship(self.video, self.html, relationships, "PARENT_OF") def test_nodes_have_indices(self): """ @@ -277,7 +371,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): mock_graph = MockGraph() mock_selector_class.return_value = MockNodeSelector(mock_graph) - successful, unsuccessful = self.mss.dump_courses_to_neo4j(mock_graph) + submitted, skipped = self.mss.dump_courses_to_neo4j(mock_graph) self.assertCourseDump( mock_graph, @@ -290,9 +384,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): # 2 nodes and no relationships from the second self.assertEqual(len(mock_graph.nodes), 11) - - self.assertEqual(len(unsuccessful), 0) - self.assertItemsEqual(successful, self.course_strings) + self.assertItemsEqual(submitted, self.course_strings) @mock.patch('openedx.core.djangoapps.coursegraph.management.commands.dump_to_neo4j.NodeSelector') def test_dump_to_neo4j_rollback(self, mock_selector_class): @@ -303,7 +395,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): mock_graph = MockGraph(transaction_errors=True) mock_selector_class.return_value = MockNodeSelector(mock_graph) - successful, unsuccessful = self.mss.dump_courses_to_neo4j(mock_graph) + submitted, skipped = self.mss.dump_courses_to_neo4j(mock_graph) self.assertCourseDump( mock_graph, @@ -312,8 +404,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): number_rollbacks=2, ) - self.assertEqual(len(successful), 0) - self.assertItemsEqual(unsuccessful, self.course_strings) + self.assertItemsEqual(submitted, self.course_strings) @mock.patch('openedx.core.djangoapps.coursegraph.management.commands.dump_to_neo4j.NodeSelector') @ddt.data((True, 2), (False, 0)) @@ -333,10 +424,10 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): # when run the second time, only dump courses if the cache override # is enabled - successful, unsuccessful = self.mss.dump_courses_to_neo4j( + submitted, __ = self.mss.dump_courses_to_neo4j( mock_graph, override_cache=override_cache ) - self.assertEqual(len(successful + unsuccessful), expected_number_courses) + self.assertEqual(len(submitted), expected_number_courses) @mock.patch('openedx.core.djangoapps.coursegraph.management.commands.dump_to_neo4j.NodeSelector') def test_dump_to_neo4j_published(self, mock_selector_class): @@ -348,17 +439,16 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): mock_selector_class.return_value = MockNodeSelector(mock_graph) # run once to warm the cache - successful, unsuccessful = self.mss.dump_courses_to_neo4j(mock_graph) - self.assertEqual(len(successful + unsuccessful), len(self.course_strings)) + submitted, skipped = self.mss.dump_courses_to_neo4j(mock_graph) + self.assertEqual(len(submitted), len(self.course_strings)) # simulate one of the courses being published listen_for_course_publish(None, self.course.id) # make sure only the published course was dumped - successful, unsuccessful = self.mss.dump_courses_to_neo4j(mock_graph) - self.assertEqual(len(unsuccessful), 0) - self.assertEqual(len(successful), 1) - self.assertEqual(successful[0], unicode(self.course.id)) + submitted, __ = self.mss.dump_courses_to_neo4j(mock_graph) + self.assertEqual(len(submitted), 1) + self.assertEqual(submitted[0], unicode(self.course.id)) @ddt.data( (six.text_type(datetime(2016, 3, 30)), six.text_type(datetime(2016, 3, 31)), True), @@ -373,7 +463,7 @@ class TestModuleStoreSerializer(TestDumpToNeo4jCommandBase): Tests whether a course should be dumped given the last time it was dumped and the last time it was published. """ - mss = ModuleStoreSerializer() + mss = ModuleStoreSerializer.create() mss.get_command_last_run = lambda course_key, graph: last_command_run mss.get_course_last_published = lambda course_key: last_course_published mock_course_key = mock.Mock